In [84]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup


import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification


# Helper Function

In [13]:
import re
def remove_numbers(text):
    if isinstance(text, str):
        return re.sub(r'\d+', '', text)
    return text


def remove_years(text):
    return text


# Import Data 

In [102]:

dataset = pd.read_csv('/Users/bhumikornkongtaveelert/Documents/Spring 2024/CS 224N/final project/BERT-based classifier/BIL Launchpad Case studies - Sheet1.csv')

"""
Cleaning the data:
- only include columns we will use: 'Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description'
- drop column with missing information
- currently, the grant opportunities include years, i will omit this from their title to group recurring grants together
"""
dataset = dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description']]
clean_dataset = dataset.dropna()
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())
clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())

79
55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)


In [104]:
clean_dataset

Unnamed: 0,Project Name,Project Description,Applicants,opportunitytitle,description
0,Fairbanks International Airport,This award funds the replacement of the passen...,Fairbanks International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
1,Ted Stevens Anchorage International Airport,This award funds the installation of 110 audio...,Ted Stevens Anchorage International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
2,Ted Stevens Anchorage International Airport,This award funds the installation of a new pas...,Ted Stevens Anchorage International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
3,Phoenix Sky Harbor International Airport,"This award funds the replacement of chillers, ...",Phoenix Sky Harbor International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
4,Yuma International Airport,This award funds upgrading and replacing secur...,Yuma International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
...,...,...,...,...,...
5664,buy 52 new light rail vehicles,The Maryland Department of Transportation Mary...,Maryland Department of Transportation (Marylan...,FY Competitive Funding Opportunity: Rail Vehi...,The Federal Transit Administration (FTA) annou...
5665,buy up to 200 new rail cars to replace older r...,The Southeastern Pennsylvania Transportation A...,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,The Federal Transit Administration (FTA) annou...
5666,buy up to 200 new rail cars to replace older r...,The Southeastern Pennsylvania Transportation A...,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,The Federal Transit Administration (FTA) annou...
5681,Colorado Department of Transportation_CO_Buses...,"The Colorado Department of Transportation, on ...",Colorado Department of Transportation,*Grants for Buses and Bus Facilities Program,The Federal Transit Administration (FTA) annou...


# Experiment 1: Classification of project <> grant name (multi-class text classification) (ignore description of program)

In [57]:
# Create new dataset of project + project name and grant title
simple_df = clean_dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle']]
simple_df['project_profile'] =  simple_df['Applicants'] + ': ' + simple_df['Project Description'] + ' ' + simple_df['Project Name'] 
simple_df = simple_df[['project_profile','opportunitytitle'] ]
possible_grants = simple_df['opportunitytitle'].unique()

#substitute label with number instead
label_dict = {}
for index, possible_label in enumerate(possible_grants):
    label_dict[possible_label] = index

simple_df['label'] = simple_df['opportunitytitle'].replace(label_dict)
simple_df


  simple_df['label'] = simple_df['opportunitytitle'].replace(label_dict)


Unnamed: 0,project_profile,opportunitytitle,label
0,Fairbanks International Airport: This award fu...,FY Notice of Funding Opportunity: Bipartisan ...,0
1,Ted Stevens Anchorage International Airport: T...,FY Notice of Funding Opportunity: Bipartisan ...,0
2,Ted Stevens Anchorage International Airport: T...,FY Notice of Funding Opportunity: Bipartisan ...,0
3,Phoenix Sky Harbor International Airport: This...,FY Notice of Funding Opportunity: Bipartisan ...,0
4,Yuma International Airport: This award funds u...,FY Notice of Funding Opportunity: Bipartisan ...,0
...,...,...,...
5664,Maryland Department of Transportation (Marylan...,FY Competitive Funding Opportunity: Rail Vehi...,53
5665,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,53
5666,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,53
5681,Colorado Department of Transportation: The Col...,*Grants for Buses and Bus Facilities Program,54


In [90]:
simple_df['project_profile'][0]

'Fairbanks International Airport: This award funds the replacement of the passenger boarding bridge at Gate 3. Fairbanks International Airport'

# Train validation split

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(simple_df.index.values, 
                                                  simple_df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=simple_df.label.values)

simple_df['data_type'] = ['not_set']*simple_df.shape[0]

simple_df.loc[X_train, 'data_type'] = 'train'
simple_df.loc[X_val, 'data_type'] = 'val'

simple_df.groupby(['opportunitytitle', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,project_profile
opportunitytitle,label,data_type,Unnamed: 3_level_1
Competitive Funding Opportunity: Pilot Program for Transit-Oriented Development (TOD) Planning,14,train,17
Competitive Funding Opportunity: Pilot Program for Transit-Oriented Development (TOD) Planning,14,val,3
Tribal Transportation Program Safety Fund,8,train,149
Tribal Transportation Program Safety Fund,8,val,26
*FY Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants,33,train,159
...,...,...,...
Strengthening Mobility and Revolutionizing Transportation (SMART) Grants Program,40,val,9
The Infrastructure Investment and Jobs Act (IIJA) Notice of Funding Opportunity for America&#;s Marine Highway Program,32,train,10
The Infrastructure Investment and Jobs Act (IIJA) Notice of Funding Opportunity for America&#;s Marine Highway Program,32,val,2
United States Marine Highway Grants,51,train,7


# Tokenisation

In [64]:
"""
Constructs a BERT tokenizer. Based on WordPiece.
Instantiate a pre-trained BERT model configuration to encode our data.
To convert all the titles from text into encoded form, we use a function called batch_encode_plus , and we will proceed train and validation data separately.
The 1st parameter inside the above function is the title text.
add_special_tokens=True means the sequences will be encoded with the special tokens relative to their model.
When batching sequences together, we set return_attention_mask=True, so it will return the attention mask according to the specific tokenizer defined by the max_length attribute.
We also want to pad all the titles to certain maximum length.
We actually do not need to set max_length=256, but just to play it safe.
return_tensors='pt' to return PyTorch.
And then we need to split the data into input_ids, attention_masks and labels.
Finally, after we get encoded data set, we can create training data and validation data.
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    simple_df[simple_df.data_type=='train'].project_profile.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    simple_df[simple_df.data_type=='val'].project_profile.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(simple_df[simple_df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(simple_df[simple_df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# BERT

In [67]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [70]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [78]:
from sklearn.metrics import f1_score
import numpy as np

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels, label_dict):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    class_accuracies = {}
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        accuracy = len(y_preds[y_preds == label]) / len(y_true)
        class_accuracies[label_dict_inverse[label]] = accuracy
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)} ({accuracy:.4f})\n')

    return class_accuracies

def recall_at_k(y_true, y_pred, k=5):
    num_samples = len(y_true)
    recall_count = 0
    
    for true_label, predictions in zip(y_true, y_pred):
        top_k_predictions = np.argsort(predictions)[-k:]  # Get indices of the top k predictions
        if true_label in top_k_predictions:
            recall_count += 1
    
    recall_at_k_score = recall_count / num_samples
    return recall_at_k_score

def evaluate(dataloader_val):
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total / len(dataloader_val)
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    
    recall_at_5 = recall_at_k(true_vals, predictions, k=5)
    
    return loss_val_avg, predictions, true_vals, recall_at_5


In [101]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

!pip install ipywidgets
! pip install jupyter
! pip install ipywidgets widgetsnbextension pandas-profiling
! pip install jupyter_contrib_nbextensions
! jupyter contrib nbextension install --user
!jupyter nbextension enable --py widgetsnbextension
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
     
    loss_val_avg, predictions, true_vals, recall_at_5 = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {loss_val_avg}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Recall@5: {recall_at_5}')

Collecting pandas-profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting ydata-profiling (from pandas-profiling)
  Downloading ydata_profiling-4.8.3-py2.py3-none-any.whl.metadata (20 kB)
Collecting matplotlib<3.9,>=3.2 (from ydata-profiling->pandas-profiling)
  Downloading matplotlib-3.8.4-cp310-cp310-macosx_10_12_x86_64.whl.metadata (5.8 kB)
Collecting pydantic>=2 (from ydata-profiling->pandas-profiling)
  Downloading pydantic-2.7.1-py3-none-any.whl.metadata (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.3/107.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting visions<0.7.7,>=0.7.5 (from visions[type_image_path]<0.7.7,>=0.7.5->ydata-profiling->pandas-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling->pandas-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting phik<0.13,>=

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [93]:
! which python
! which ipywidgets


/Users/bhumikornkongtaveelert/miniconda3/envs/cs224n-cpu/bin/python
ipywidgets not found


In [None]:

"""
we will summarise the project down to a multi-class text classification. 
To do so:
1. We will concatenate 'project name' and 'project description' together
2. 
"""