In [2]:
#Group 5
#Julia Aptekar, DePaul University, japtekar@depaul.edu
#John Leniart, DePaul University, jleniart@depaul.edu
#Arham Mehdi, DePaul University kmehdi@depaul.edu
#Natalie Olechno, DePaul University, nolechno@depaul.edu

import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
import re

file_path = "/kaggle/input/impact-genome/Combined Data.xlsx"
data = pd.read_excel(file_path)

data = data.dropna(subset=['programdescription', 'outcomeid'])

data.fillna({'impactarea': 'unknown', 'genome': 'unknown'}, inplace=True)

id_lst = [168, 186, 3238, 3461, 3473, 362, 3677, 3744, 3746, 3770, 
          3794, 4012, 4250, 453, 464, 471, 4815, 6917, 7555, 780, 877]

data.drop(data[data['programreportid'].isin(id_lst)].index, inplace=True)

# Converting all text columns to lowercase for consistency
for col in ['programdescription', 'impactarea', 'genome', 'outcomeid']:
    data[col] = data[col].astype(str).str.lower()

# Mapping old outcome IDs to the larger (current) outcome IDs.
outcome_id_mapping = {
    "625": "861",  # Current Financial Stability
    "626": "863",  # Financial Access
    "78": "860",   # Financial Resilience
    "53": "859"    # Future Security
}
mask = (data['impactarea'] == 'economic development') & (data['genome'] == 'financial health')
data.loc[mask, 'outcomeid'] = data.loc[mask, 'outcomeid'].replace(outcome_id_mapping)

# Cleaning program descriptions (remove special characters)
data['programdescription'] = data['programdescription'].apply(lambda x: re.sub(r"[^A-Za-z0-9 :.,'-]+", "", x))

# Encode categorical labels into numerical values
label_encoders = {}
for col in ['impactarea', 'genome', 'outcomeid']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Storing class counts
impactarea_classes = len(label_encoders['impactarea'].classes_)
genome_classes = len(label_encoders['genome'].classes_)
outcomeid_classes = len(label_encoders['outcomeid'].classes_)

# Here I define Dataset and Model
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return encoding.input_ids.squeeze(0), encoding.attention_mask.squeeze(0), torch.tensor(self.labels[idx])

class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

    def forward(self, input_ids, attention_mask):
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

# Training and Evaluation Functions
def train_model(model, train_loader, num_epochs=3, lr=2e-5, class_weights=None):
    """
    Trains a BERT-based classifier using CrossEntropy loss and AdamW optimizer.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device) if class_weights is not None else None)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for input_ids, attention_mask, labels in train_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")

def evaluate_model(model, test_loader):
    """
    Evaluates the model and prints accuracy and classification report.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}\n{classification_report(true_labels, predictions)}")
    return accuracy

# Tokenizer initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def train_and_predict(feature, target, num_classes, epochs):
    """
    Trains and predicts using a hierarchical approach.
    """
    if target == 'outcomeid':
        # No stratification for outcomeid due to rare classes
        X_train, X_test, y_train, y_test = train_test_split(
            data[feature], data[target], test_size=0.2, random_state=42, stratify=None
        )
    else:
        # Stratified sampling for impactarea and genome
        X_train, X_test, y_train, y_test = train_test_split(
            data[feature], data[target], test_size=0.2, random_state=42, stratify=data[target]
        )

    # Compute class weights using the full dataset
    class_weights = compute_class_weight('balanced', classes=np.unique(data[target]), y=data[target])
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

    train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer)
    test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    model = BertClassifier(num_classes=num_classes)

    train_model(model, train_loader, num_epochs=epochs, class_weights=class_weights_tensor)
    evaluate_model(model, test_loader)

    # Predict on the entire dataset
    predicted_labels = []
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    with torch.no_grad():
        for text in data[feature]:
            enc = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
            enc_input_ids = enc.input_ids.to(device)
            enc_attention_mask = enc.attention_mask.to(device)
            output = model(enc_input_ids, enc_attention_mask)
            predicted_labels.append(torch.argmax(output, dim=1).item())

    data[f'predicted_{target}'] = predicted_labels
    return model

# Step 1: Predict Impact Area from Program Description
impactarea_model = train_and_predict('programdescription', 'impactarea', impactarea_classes, epochs=4)

# Step 2: Predict Genome using both Program Description + Predicted Impact Area
data['predicted_impactarea'] = data['predicted_impactarea'].astype(str)
data['genome_features'] = data['programdescription'] + " " + data['predicted_impactarea']
genome_model = train_and_predict('genome_features', 'genome', genome_classes, epochs=5)

# Step 3: Predict OutcomeID using Program Description + Predicted Impact Area + Predicted Genome
data['predicted_genome'] = [str(g) for g in data['predicted_genome']]
data['outcomeid_features'] = data['programdescription'] + " " + data['predicted_impactarea'] + " " + data['predicted_genome']
outcomeid_model = train_and_predict('outcomeid_features', 'outcomeid', outcomeid_classes, epochs=8)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4, Loss: 0.9764552333863019
Epoch 2/4, Loss: 0.3954422038557125
Epoch 3/4, Loss: 0.2522857382213113
Epoch 4/4, Loss: 0.1794920494604677
Accuracy: 0.8730378578024007
              precision    recall  f1-score   support

           0       0.89      0.85      0.87        84
           1       0.87      0.95      0.91        21
           2       0.92      0.90      0.91       333
           3       0.92      0.95      0.93        93
           4       0.89      0.84      0.87       396
           5       0.89      0.86      0.87       353
           6       0.92      0.88      0.90       153
           7       0.96      0.93      0.94       351
           8       0.81      0.86      0.84        44
           9       0.97      0.94      0.95        88
          10       0.56      0.69      0.62       116
          11       0.71      0.86      0.77       134

    accuracy                           0.87      2166
   macro avg       0.86      0.88      0.86      2166
weighted avg   

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 2.665037598557138
Epoch 2/5, Loss: 0.9646851142174203
Epoch 3/5, Loss: 0.546162434527135
Epoch 4/5, Loss: 0.37897252636477075
Epoch 5/5, Loss: 0.2772877818227804
Accuracy: 0.863342566943675
              precision    recall  f1-score   support

           0       0.82      0.92      0.87        53
           1       0.91      1.00      0.95        10
           2       1.00      1.00      1.00         6
           3       0.94      0.89      0.91        35
           4       0.96      0.94      0.95        85
           5       0.64      0.66      0.65        58
           6       0.88      0.88      0.88        17
           7       0.92      0.95      0.93        37
           8       0.87      0.94      0.90        48
           9       0.87      0.86      0.86       122
          10       0.88      0.77      0.82        30
          11       1.00      1.00      1.00        10
          12       0.80      0.80      0.80         5
          13       0.92      0.97   

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8, Loss: 5.2756373987866505
Epoch 2/8, Loss: 4.207019878489505
Epoch 3/8, Loss: 3.3973206224476717
Epoch 4/8, Loss: 2.730222543227277
Epoch 5/8, Loss: 2.197706792847257
Epoch 6/8, Loss: 1.7506494203176886
Epoch 7/8, Loss: 1.4044801908445534
Epoch 8/8, Loss: 1.1284356823266652


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.6634349030470914
              precision    recall  f1-score   support

           0       0.80      0.67      0.73         6
           1       0.70      0.88      0.78         8
           2       0.50      1.00      0.67         1
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         7
           6       0.83      0.83      0.83         6
           7       0.71      0.38      0.50        13
           8       0.83      1.00      0.91         5
           9       0.75      0.50      0.60         6
          10       0.86      0.75      0.80         8
          11       1.00      0.75      0.86         4
          12       1.00      1.00      1.00         4
          14       0.47      0.28      0.35        29
          15       0.80      0.80      0.80         5
          16       0.20      0.29      0.24         7
          17       0.83      1.00      0.91         