Group 5

Julia Aptekar, DePaul University, japtekar@depaul.edu

John Leniart, DePaul University, jleniart@depaul.edu

Arham Mehdi, DePaul University, kmehdi@depaul.edu

Natalie Olechno, DePaul University, nolechno@depaul.edu



In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# NOTE(review): presumably this authenticates the session with Kaggle so the
# private dataset download in the following cell is authorized -- confirm
# against the kagglehub documentation.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'nat7574/combined-data' dataset and keep whatever
# kagglehub returns in nat7574_combined_data_path.
# NOTE(review): the cells visible below never read this variable; they load
# '/kaggle/input/combined-data/Combined Data.xlsx' directly. Outside Kaggle
# that hard-coded path presumably has to be replaced by a path built from
# this value -- confirm.
nat7574_combined_data_path = kagglehub.dataset_download('nat7574/combined-data')

print('Data source import complete.')


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

#file_1 = '/content/drive/MyDrive/Data Science Capstone/Original Data/Combined Data.xlsx'
#data = pd.read_excel(file_1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import random
import time
import datetime
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split, RandomSampler, WeightedRandomSampler, Subset, SequentialSampler

from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#basic BERT
class BasicBertClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer.

    Args:
        num_outcome_labels: Number of output classes (width of the logits).
        dropout_rate: Dropout probability applied to BERT's pooled output
            before the linear layer.
        pretrained_model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``"bert-base-uncased"``,
            the checkpoint that was previously hard-coded.
    """

    def __init__(self, num_outcome_labels, dropout_rate=0.3,
                 pretrained_model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # The linear head is sized from the encoder's own config so that it
        # follows whichever checkpoint was loaded.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_outcome_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token-id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            The output of the linear head applied to the dropped-out
            ``pooler_output`` of the encoder.
        """
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        pooled = self.dropout(pooled)
        return self.classifier(pooled)


# Use the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Read the combined spreadsheet into a DataFrame.
input_file = '/kaggle/input/combined-data/Combined Data.xlsx'
print(f"Loading data from {input_file}")
data = pd.read_excel(input_file)

# Drop every row that has a missing value in any column, then normalise the
# two columns used below: integer outcome ids and descriptions restricted to
# letters, digits, spaces and the punctuation : . , ' -
data.dropna(inplace=True)
data['outcomeid'] = data['outcomeid'].astype(int)
disallowed_chars = re.compile(r"[^A-Za-z0-9 :.,'-]+")
data['programdescription'] = data['programdescription'].apply(
    lambda text: disallowed_chars.sub("", text)
)

# Map the raw outcome ids onto contiguous class indices 0..num_classes-1.
outcome_encoder = LabelEncoder()
outcome_encoder.fit(data['outcomeid'])
data['encoded_outcome_labels'] = outcome_encoder.transform(data['outcomeid'])
num_outcome_labels = len(outcome_encoder.classes_)
print(f"Number of outcome labels: {num_outcome_labels}")

# Size of the rarest class decides whether a stratified split is requested.
label_counts = Counter(data['encoded_outcome_labels'])
min_class_size = min(label_counts.values())
print(f"Smallest class has {min_class_size} samples")

# Only stratify when every class has more than one sample.
if min_class_size > 1:
    stratify_option = data['encoded_outcome_labels']
else:
    stratify_option = None

# Plain Python list of texts for the tokenizer, tensor of class indices for the loss.
sentences = data['programdescription'].tolist()
outcome_labels = torch.tensor(data['encoded_outcome_labels'].tolist())

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

def tokenize_data(sentences, max_length=240):
    """Tokenize texts into fixed-length BERT inputs.

    Every text is padded or truncated to exactly ``max_length`` tokens
    (special tokens included) using the module-level ``tokenizer``.

    Args:
        sentences: Iterable of strings to encode.
        max_length: Sequence length every row is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of tensors, each with one
        row per input text and ``max_length`` columns. For empty input both
        tensors have shape ``(0, max_length)``.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to encode: return correctly shaped empty tensors instead of
        # failing on an empty batch.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # `pad_to_max_length` is deprecated; `padding='max_length'` is its
    # replacement. Truncation is requested explicitly, which is the behaviour
    # the tokenizer previously fell back to with a warning.
    encoded = tokenizer(
        sentences,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Tokenize every description once; rows line up with `outcome_labels`.
input_ids, attention_masks = tokenize_data(sentences)

dataset = TensorDataset(input_ids, attention_masks, outcome_labels)

# 80/20 split over row positions. `stratify_option` is None when some class
# has a single sample, in which case the split is not stratified.
train_idx, val_idx = train_test_split(
    range(len(dataset)),
    test_size=0.2,
    random_state=42,
    stratify=stratify_option
)

train_dataset = TensorDataset(
    input_ids[train_idx],
    attention_masks[train_idx],
    outcome_labels[train_idx]
)

val_dataset = TensorDataset(
    input_ids[val_idx],
    attention_masks[val_idx],
    outcome_labels[val_idx]
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

# Seed torch before the model is built: the freshly initialised linear head
# draws from torch's RNG, so without this its starting weights differ from
# run to run.
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

model = BasicBertClassifier(num_outcome_labels)
model.to(device)

outcome_loss_fn = nn.CrossEntropyLoss()

# `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
# weight_decay=0.0 keeps the old default (torch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

def format_time(elapsed):
    """Format a duration given in seconds as the string form of a timedelta.

    Args:
        elapsed: Duration in seconds (int or float); it is rounded to the
            nearest whole second before formatting.

    Returns:
        ``str(datetime.timedelta(...))`` for the rounded duration,
        e.g. ``'0:05:29'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Seed Python, NumPy and torch (CPU and all CUDA devices) before the training pass.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Single pass over the training loader, tracking summed loss and running accuracy.
print('Training...')
t0 = time.time()
total_train_loss = 0
model.train()

outcome_correct = 0
total_examples = 0

for step, batch in enumerate(train_dataloader):
    # Progress line every 40 batches (skipping the very first one).
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print(f'  Batch {step:>5,} of {len(train_dataloader):>5,}. Elapsed: {elapsed}.')

    # Batch-local names: unpacking into `input_ids` / `outcome_labels` would
    # overwrite the module-level tensors that hold the full tokenized dataset.
    batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

    optimizer.zero_grad()
    logits = model(batch_input_ids, batch_attention_mask)
    loss = outcome_loss_fn(logits, batch_labels)
    loss.backward()
    # Clip the global gradient norm to 1.0 before the optimizer step.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()

    total_train_loss += loss.item()
    total_examples += batch_input_ids.size(0)

    # Predicted class = index of the largest logit in each row.
    outcome_preds = logits.argmax(dim=1)
    outcome_correct += (outcome_preds == batch_labels).sum().item()

# Loss is averaged per batch; accuracy is averaged per example.
avg_train_loss = total_train_loss / len(train_dataloader)
outcome_accuracy = outcome_correct / total_examples

training_time = format_time(time.time() - t0)
print(f"  Average training loss: {avg_train_loss:.4f}")
print(f"  Outcome Accuracy: {outcome_accuracy:.4f}")
print(f"  Training took: {training_time}")

# Evaluate on the validation loader with gradients disabled and the model in eval mode.
print("\nRunning Validation...")
t0 = time.time()
model.eval()

total_eval_loss = 0
val_outcome_correct = 0
val_total = 0

all_outcome_preds = []
all_outcome_labels = []

with torch.no_grad():
    for batch in validation_dataloader:
        # Batch-local names: unpacking into `input_ids` / `outcome_labels`
        # would overwrite the module-level full-dataset tensors.
        batch_input_ids, batch_attention_mask, batch_labels = [b.to(device) for b in batch]

        logits = model(batch_input_ids, batch_attention_mask)

        loss = outcome_loss_fn(logits, batch_labels)
        total_eval_loss += loss.item()
        val_total += batch_input_ids.size(0)

        # Predicted class = index of the largest logit in each row.
        outcome_preds = logits.argmax(dim=1)

        # Collect predictions and targets on the CPU for the sklearn metrics.
        all_outcome_preds.extend(outcome_preds.cpu().numpy())
        all_outcome_labels.extend(batch_labels.cpu().numpy())

        val_outcome_correct += (outcome_preds == batch_labels).sum().item()

avg_val_loss = total_eval_loss / len(validation_dataloader)
val_outcome_accuracy = val_outcome_correct / val_total

# Macro averages weight every class equally. zero_division=0 scores classes
# with no predicted (or no true) samples as 0 explicitly, which is the value
# sklearn already substituted while emitting UndefinedMetricWarning.
outcome_precision = precision_score(
    all_outcome_labels, all_outcome_preds, average='macro', zero_division=0
)
outcome_recall = recall_score(
    all_outcome_labels, all_outcome_preds, average='macro', zero_division=0
)
outcome_f1 = f1_score(
    all_outcome_labels, all_outcome_preds, average='macro', zero_division=0
)

validation_time = format_time(time.time() - t0)

print(f"  Validation Loss: {avg_val_loss:.4f}")
print(f"  Outcome Accuracy: {val_outcome_accuracy:.4f}")
print(f"  Outcome Precision: {outcome_precision:.4f}")
print(f"  Outcome Recall: {outcome_recall:.4f}")
print(f"  Outcome F1 Score: {outcome_f1:.4f}")
print(f"  Validation took: {validation_time}")

# Bundle the trained model object, its class, the fitted label encoder and
# the validation metrics into one dictionary and write it to disk.
model_path = '/kaggle/working/basic_bert_model.pt'
checkpoint = {
    'model': model,
    'outcome_encoder': outcome_encoder,
    'model_class': BasicBertClassifier,
    'f1_score': outcome_f1,
    'precision': outcome_precision,
    'recall': outcome_recall,
}
torch.save(checkpoint, model_path)

print(f"\nModel saved to {model_path}")
print(f"Final outcome F1 score: {outcome_f1:.4f}")

Using device: cuda
Loading data from /kaggle/input/combined-data/Combined Data.xlsx
Number of outcome labels: 289
Smallest class has 1 samples


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training samples: 8680
Validation samples: 2171


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Training...
  Batch    40 of   543. Elapsed: 0:00:24.
  Batch    80 of   543. Elapsed: 0:00:46.
  Batch   120 of   543. Elapsed: 0:01:09.
  Batch   160 of   543. Elapsed: 0:01:32.
  Batch   200 of   543. Elapsed: 0:01:57.
  Batch   240 of   543. Elapsed: 0:02:22.
  Batch   280 of   543. Elapsed: 0:02:47.
  Batch   320 of   543. Elapsed: 0:03:11.
  Batch   360 of   543. Elapsed: 0:03:36.
  Batch   400 of   543. Elapsed: 0:04:01.
  Batch   440 of   543. Elapsed: 0:04:25.
  Batch   480 of   543. Elapsed: 0:04:50.
  Batch   520 of   543. Elapsed: 0:05:15.
  Average training loss: 4.9729
  Outcome Accuracy: 0.1116
  Training took: 0:05:29

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))


  Validation Loss: 4.1819
  Outcome Accuracy: 0.2358
  Outcome Precision: 0.0909
  Outcome Recall: 0.1406
  Outcome F1 Score: 0.0900
  Validation took: 0:00:27

Model saved to /kaggle/working/basic_bert_model.pt
Final outcome F1 score: 0.0900
