In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup


import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification


# Helper Function

In [None]:
import re
def remove_numbers(text):
    if isinstance(text, str):
        return re.sub(r'\d+', '', text)
    return text


def remove_year(string):
    return re.sub(r'\s20\d{2}(\s|\-\d{2})?', ' ', string)

# Import Data

In [None]:

dataset = pd.read_csv('BIL Launchpad Case studies - Sheet1.csv')

"""
Cleaning the data:
- only include columns we will use: 'Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description'
- drop column with missing information
- currently, the grant opportunities include years, i will omit this from their title to group recurring grants together
"""
dataset = dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description']]
clean_dataset = dataset.dropna()
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())
clean_dataset['description'] = clean_dataset['description'].apply(remove_numbers)
clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)

print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())

79
55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['description'] = clean_dataset['description'].apply(remove_numbers)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)


# Experiment 1: Classification of project <> grant name (multi-class text classification) (ignore description of program)

In [None]:
# Create new dataset of project + project name and grant title
simple_df = clean_dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle']]
simple_df['project_profile'] =  simple_df['Applicants'] + ': ' + simple_df['Project Description'] + ' ' + simple_df['Project Name']
simple_df = simple_df[['project_profile','opportunitytitle'] ]
possible_grants = simple_df['opportunitytitle'].unique()

#substitute label with number instead
label_dict = {}
for index, possible_label in enumerate(possible_grants):
    label_dict[possible_label] = index

simple_df['label'] = simple_df['opportunitytitle'].replace(label_dict)
simple_df


Unnamed: 0,project_profile,opportunitytitle,label
0,Fairbanks International Airport: This award fu...,FY Notice of Funding Opportunity: Bipartisan ...,0
1,Ted Stevens Anchorage International Airport: T...,FY Notice of Funding Opportunity: Bipartisan ...,0
2,Ted Stevens Anchorage International Airport: T...,FY Notice of Funding Opportunity: Bipartisan ...,0
3,Phoenix Sky Harbor International Airport: This...,FY Notice of Funding Opportunity: Bipartisan ...,0
4,Yuma International Airport: This award funds u...,FY Notice of Funding Opportunity: Bipartisan ...,0
...,...,...,...
5664,Maryland Department of Transportation (Marylan...,FY Competitive Funding Opportunity: Rail Vehi...,53
5665,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,53
5666,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,53
5681,Colorado Department of Transportation: The Col...,*Grants for Buses and Bus Facilities Program,54


In [None]:
simple_df['project_profile'][0]

'Fairbanks International Airport: This award funds the replacement of the passenger boarding bridge at Gate 3. Fairbanks International Airport'

# Experiment 2: Classification of project <> grant name (multi-class text classification) (with description of program)

In [None]:
def remove_year(string):
    return re.sub(r'\s20\d{2}(\s|\-\d{2})?', ' ', string)

In [None]:
dataset = pd.read_csv('BIL Launchpad Case studies - Sheet1.csv')

"""
Cleaning the data:
- only include columns we will use: 'Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description'
- drop column with missing information
- currently, the grant opportunities include years, i will omit this from their title to group recurring grants together
"""
dataset = dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description']]
clean_dataset = dataset.dropna()
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())
clean_dataset['description'] = clean_dataset['description'].apply(remove_numbers)
clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)

print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())

simple_df = clean_dataset
simple_df['project_profile'] =  simple_df['Applicants'] + ': ' + simple_df['Project Description'] + ' ' + simple_df['Project Name']
simple_df['grant_profile'] =  simple_df['opportunitytitle'] + ': ' + simple_df['description']
simple_df = simple_df[['project_profile','grant_profile'] ]
possible_grants = simple_df['grant_profile'].unique()

#substitute label with number instead
label_dict = {}
for index, possible_label in enumerate(possible_grants):
    label_dict[possible_label] = index

simple_df['label'] = simple_df['grant_profile'].replace(label_dict)
simple_df['grant_profile'].unique()[0]

79
55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['description'] = clean_dataset['description'].apply(remove_numbers)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simple_df['project_profile'] = 

'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the In

# Train validation split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(simple_df.index.values,
                                                  simple_df.label.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=simple_df.label.values)

simple_df['data_type'] = ['not_set']*simple_df.shape[0]

simple_df.loc[X_train, 'data_type'] = 'train'
simple_df.loc[X_val, 'data_type'] = 'val'

simple_df.groupby(['grant_profile', 'label', 'data_type']).count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simple_df['data_type'] = ['not_set']*simple_df.shape[0]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,project_profile
grant_profile,label,data_type,Unnamed: 3_level_1
"Competitive Funding Opportunity: Pilot Program for Transit-Oriented Development (TOD) Planning: The Pilot Program for TOD Planning helps support FTA&#;s mission of improving public transportation for America&#;s communities by providing funding to local communities to integrate land use and transportation planning around a new fixed guideway or core capacity improvement project. Per statute, any comprehensive planning funded through the program must examine ways to improve economic development and ridership, foster multimodal connectivity and accessibility, improve transit access for pedestrian and bicycle traffic, engage the private sector, identify infrastructure needs, and enable mixed-use development near transit stations. Additional information is available at: https://www.transit.dot.gov/TODPilot.Notice of Funding Opportunity (NOFO): Notice of Funding Opportunity for PILOT PROGRAM FOR TRANSIT-ORIENTED DEVELOPMENT (TOD) PLANNING. The Federal Transit Administration (FTA) announces the opportunity to apply for $,, of funding under the Pilot Program for TOD Planning (Catalog of Federal Domestic Assistance #.). As required by federal transit law and subject to funding availability, funds will be awarded competitively to support comprehensive planning associated with new fixed guideway and core capacity improvement projects. FTA anticipates minimum grant awards of $, and maximum grant awards of $,,. Synopses and full announcement are posted on Grants.gov as opportunity FTA---TPE-TODP. The Pilot Program for TOD Planning is intended to fund comprehensive planning that supports economic development, ridership, multimodal connectivity and accessibility, increased transit access for pedestrian and bicycle traffic, and mixed-use development near transit stations. The program also encourages identification of infrastructure needs and engagement with the private sector.Consistent with statutory direction, FTA is seeking comprehensive planning projects covering an entire transit capital project corridor, rather than proposals that involve planning for individual station areas or only a small section of the corridor. To ensure any proposed planning work reflects the needs and aspirations of the local community and results in concrete, specific deliverables and outcomes, transit project sponsors must partner with entities with land use planning authority in the transit project corridor to conduct the planning work.Link and Instructions for attaching the supplemental form to the SF-: All applicants must complete a supplemental form (PDF) specific to the Pilot Program for TOD Planning and attach it to their submission in Grants.gov. See section D of the NOFO for information about additional required application contents.Webinar: FTA will conduct a webinar for prospective applicants after the NOFO is published in the Federal Register. Further information will be posted on FTA&#;s website (https://www.transit.dot.gov/TODPilot) when available. Dates: An applicant must submit a proposal electronically by : p.m. Eastern Daylight Time on June , . Any agency intending to apply should initiate the process of registering on the Grants.gov site immediately to ensure completion of registration before the submission deadline.",17,train,17
"Competitive Funding Opportunity: Pilot Program for Transit-Oriented Development (TOD) Planning: The Pilot Program for TOD Planning helps support FTA&#;s mission of improving public transportation for America&#;s communities by providing funding to local communities to integrate land use and transportation planning around a new fixed guideway or core capacity improvement project. Per statute, any comprehensive planning funded through the program must examine ways to improve economic development and ridership, foster multimodal connectivity and accessibility, improve transit access for pedestrian and bicycle traffic, engage the private sector, identify infrastructure needs, and enable mixed-use development near transit stations. Additional information is available at: https://www.transit.dot.gov/TODPilot.Notice of Funding Opportunity (NOFO): Notice of Funding Opportunity for PILOT PROGRAM FOR TRANSIT-ORIENTED DEVELOPMENT (TOD) PLANNING. The Federal Transit Administration (FTA) announces the opportunity to apply for $,, of funding under the Pilot Program for TOD Planning (Catalog of Federal Domestic Assistance #.). As required by federal transit law and subject to funding availability, funds will be awarded competitively to support comprehensive planning associated with new fixed guideway and core capacity improvement projects. FTA anticipates minimum grant awards of $, and maximum grant awards of $,,. Synopses and full announcement are posted on Grants.gov as opportunity FTA---TPE-TODP. The Pilot Program for TOD Planning is intended to fund comprehensive planning that supports economic development, ridership, multimodal connectivity and accessibility, increased transit access for pedestrian and bicycle traffic, and mixed-use development near transit stations. The program also encourages identification of infrastructure needs and engagement with the private sector.Consistent with statutory direction, FTA is seeking comprehensive planning projects covering an entire transit capital project corridor, rather than proposals that involve planning for individual station areas or only a small section of the corridor. To ensure any proposed planning work reflects the needs and aspirations of the local community and results in concrete, specific deliverables and outcomes, transit project sponsors must partner with entities with land use planning authority in the transit project corridor to conduct the planning work.Link and Instructions for attaching the supplemental form to the SF-: All applicants must complete a supplemental form (PDF) specific to the Pilot Program for TOD Planning and attach it to their submission in Grants.gov. See section D of the NOFO for information about additional required application contents.Webinar: FTA will conduct a webinar for prospective applicants after the NOFO is published in the Federal Register. Further information will be posted on FTA&#;s website (https://www.transit.dot.gov/TODPilot) when available. Dates: An applicant must submit a proposal electronically by : p.m. Eastern Daylight Time on June , . Any agency intending to apply should initiate the process of registering on the Grants.gov site immediately to ensure completion of registration before the submission deadline.",17,val,3
"Tribal Transportation Program Safety Fund: A Notice of Funding Availability (NOFA) for Tribal Transportation Program Safety Funds (TTPSF) has been published. This notice announces the availability of $. million for projects in three categories: safety plans; data assessment, improvement, and analysis activities; and infrastructure improvement and other eligible activities as listed in U.S.C.(a)() The deadline for submissions is //. For complete instructions on preparing and submitting an application for TTPSF, please download the application information and NOFO at https://flh.fhwa.dot.gov/programs/ttp/safety/ttpsf.htm Please submit your application directly to FHWA through the &quot;apply now&quot; button at: https://flh.fhwa.dot.gov/programs/ttp/safety/ttpsf.htm",8,train,76
"Tribal Transportation Program Safety Fund: A Notice of Funding Availability (NOFA) for Tribal Transportation Program Safety Funds (TTPSF) has been published. This notice announces the availability of $. million for projects in three categories: safety plans; data assessment, improvement, and analysis activities; and infrastructure improvement and other eligible activities as listed in U.S.C.(a)() The deadline for submissions is //. For complete instructions on preparing and submitting an application for TTPSF, please download the application information and NOFO at https://flh.fhwa.dot.gov/programs/ttp/safety/ttpsf.htm Please submit your application directly to FHWA through the &quot;apply now&quot; button at: https://flh.fhwa.dot.gov/programs/ttp/safety/ttpsf.htm",8,val,14
"Tribal Transportation Program Safety Fund: Eligible projects described in section (a)() are strategies, activities, and projects on a public road that are consistent with a transportation safety plan and that (i) correct or improve a hazardous road location or feature, or (ii) address a highway safety problem. TTPSF emphasizes the development of strategic transportation safety plans using a data-driven process as a means for Tribes to identify transportation safety needs and determine how those needs will be addressed in Tribal communities. FHWA has identified four eligibility categories: transportation safety plans; data assessment, improvement, and analysis activities; systemic roadway departure countermeasures; and infrastructure improvements and other eligible activities as listed in U.S.C. &#; (a)().",59,train,72
...,...,...,...
Strengthening Mobility and Revolutionizing Transportation (SMART) Grants Program: The purpose of this notice is to solicit applications for Strengthening Mobility and Revolutionizing Transportation (SMART) grants. Funds for the fiscal year (FY) SMART Grants Program are to be awarded on a competitive basis to conduct demonstration projects focused on advanced smart city or community technologies and systems to improve transportation efficiency and safety.,46,val,9
"The Infrastructure Investment and Jobs Act (IIJA) Notice of Funding Opportunity for America&#;s Marine Highway Program: This notice announces the availability of funding for grants and establishes selection criteria and application requirements for the America&#;s Marine Highway Program (&#;AMHP&#;). The purpose of this program is to make grants available to previously designated Marine Highway Projects that support the development and expansion of documented vessels or port and landside infrastructure. The Department also seeks eligible grant projects that will strengthen American supply chains. The U.S. Department of Transportation (&#;DOT&#; or &#;Department&#;) will award Marine Highway Grants to implement projects or components of projects previously designated by the Secretary of Transportation (&#;Secretary&#;) under the AMHP. Only Marine Highway Projects the Secretary designates before the Notice of Funding Opportunity (&#;NOFO&#;) closing date are eligible for funding as described in this notice. TIMING OF GRANT APPLICATIONS: Applications must be received by the Maritime Administration (&#;MARAD&#;) by : p.m. E.D.T. on April , . ADDRESSES: Grant applications must be submitted electronically using Grants.gov (https://www.grants.gov). Please be aware that you must complete the Grants.gov registration process before submitting your application and that the registration process usually takes to weeks to complete. Applicants are strongly encouraged to make submissions in advance of the deadline. FOR FURTHER INFORMATION CONTACT: Fred Jones, Office of Ports &amp; Waterways Planning, Room W&#;, Maritime Administration, U.S. Department of Transportation, New Jersey Avenue S.E., Washington, D.C. , phone --, or email Fred.Jones@dot.gov. Persons who use a telecommunications device for the deaf (TDD) may call the Federal Information Relay Service (FIRS) at --- to contact the above individual during business hours. The FIRS is available twenty-four hours a day, seven days a week, to leave a message or question with the above individual. You will receive a reply during regular business hours.",37,train,10
"The Infrastructure Investment and Jobs Act (IIJA) Notice of Funding Opportunity for America&#;s Marine Highway Program: This notice announces the availability of funding for grants and establishes selection criteria and application requirements for the America&#;s Marine Highway Program (&#;AMHP&#;). The purpose of this program is to make grants available to previously designated Marine Highway Projects that support the development and expansion of documented vessels or port and landside infrastructure. The Department also seeks eligible grant projects that will strengthen American supply chains. The U.S. Department of Transportation (&#;DOT&#; or &#;Department&#;) will award Marine Highway Grants to implement projects or components of projects previously designated by the Secretary of Transportation (&#;Secretary&#;) under the AMHP. Only Marine Highway Projects the Secretary designates before the Notice of Funding Opportunity (&#;NOFO&#;) closing date are eligible for funding as described in this notice. TIMING OF GRANT APPLICATIONS: Applications must be received by the Maritime Administration (&#;MARAD&#;) by : p.m. E.D.T. on April , . ADDRESSES: Grant applications must be submitted electronically using Grants.gov (https://www.grants.gov). Please be aware that you must complete the Grants.gov registration process before submitting your application and that the registration process usually takes to weeks to complete. Applicants are strongly encouraged to make submissions in advance of the deadline. FOR FURTHER INFORMATION CONTACT: Fred Jones, Office of Ports &amp; Waterways Planning, Room W&#;, Maritime Administration, U.S. Department of Transportation, New Jersey Avenue S.E., Washington, D.C. , phone --, or email Fred.Jones@dot.gov. Persons who use a telecommunications device for the deaf (TDD) may call the Federal Information Relay Service (FIRS) at --- to contact the above individual during business hours. The FIRS is available twenty-four hours a day, seven days a week, to leave a message or question with the above individual. You will receive a reply during regular business hours.",37,val,2
"United States Marine Highway Grants: This funding opportunity solicits applications for fiscal year (FY) United States Marine Highway Program (USMHP) grants. Funds for FY USMHP grants will be awarded on a competitive basis to assist in funding eligible projects for the purpose of developing, expanding, or promoting marine highway transportation. This opportunity announces the availability of up to $,, million in funding for grants under this program and establishes selection criteria and application requirements. All USMHP grant recipients must meet all applicable Federal requirements, including domestic content (&#;Buy America&#;) requirements. This program was formerly known as the America&#;s Marine Highway Program",60,train,7


# Tokenisation

In [None]:
"""
Constructs a BERT tokenizer. Based on WordPiece.
Instantiate a pre-trained BERT model configuration to encode our data.
To convert all the titles from text into encoded form, we use a function called batch_encode_plus , and we will proceed train and validation data separately.
The 1st parameter inside the above function is the title text.
add_special_tokens=True means the sequences will be encoded with the special tokens relative to their model.
When batching sequences together, we set return_attention_mask=True, so it will return the attention mask according to the specific tokenizer defined by the max_length attribute.
We also want to pad all the titles to certain maximum length.
We actually do not need to set max_length=256, but just to play it safe.
return_tensors='pt' to return PyTorch.
And then we need to split the data into input_ids, attention_masks and labels.
Finally, after we get encoded data set, we can create training data and validation data.
"""

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    simple_df[simple_df.data_type=='train'].project_profile.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    # max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    simple_df[simple_df.data_type=='val'].project_profile.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    # max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(simple_df[simple_df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(simple_df[simple_df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# BERT

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

device = 'cpu'
if torch.cuda.is_available():
    device = torch.cuda.current_device()
print(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)

epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [None]:
import torch
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = torch.argmax(preds, dim=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat.cpu().numpy(), preds_flat.cpu().numpy(), average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = torch.argmax(preds, dim=1).flatten()
    labels_flat = labels.flatten()

    class_accuracies = {}
    for label in torch.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        accuracy = (y_preds == label).sum().item() / len(y_true)
        class_accuracies[label_dict_inverse[label.item()]] = accuracy
        print(f'Class: {label_dict_inverse[label.item()]}')
        print(f'Accuracy: {(y_preds == label).sum().item()}/{len(y_true)} ({accuracy:.4f})\n')

    return class_accuracies

def recall_at_k(y_true, y_pred, k=5):
    num_samples = y_true.size(0)
    recall_count = 0

    for true_label, predictions in zip(y_true, y_pred):
        top_k_predictions = torch.topk(predictions, k).indices  # Get indices of the top k predictions
        if true_label in top_k_predictions:
            recall_count += 1

    recall_at_k_score = recall_count / num_samples
    return recall_at_k_score

def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu()
        label_ids = inputs['labels'].cpu()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = torch.cat(predictions, dim=0)
    true_vals = torch.cat(true_vals, dim=0)

    recall_at_5 = recall_at_k(true_vals, predictions, k=5)

    return loss_val_avg, predictions, true_vals, recall_at_5


In [None]:
import random
import torch
import numpy as np
from tqdm import tqdm

# Set the random seeds for reproducibility
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Ensure to use GPU if available
model.to(device)

# Training loop
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': f'{loss.item() / len(batch):.3f}'})

    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    loss_val_avg, predictions, true_vals, recall_at_5 = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {loss_val_avg}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Recall@5: {recall_at_5}')


  0%|          | 0/5 [00:00<?, ?it/s]
Epoch 1:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/1271 [00:02<?, ?it/s, training_loss=1.373][A
Epoch 1:   0%|          | 1/1271 [00:02<42:32,  2.01s/it, training_loss=1.373][A
Epoch 1:   0%|          | 1/1271 [00:02<42:32,  2.01s/it, training_loss=1.362][A
Epoch 1:   0%|          | 2/1271 [00:02<21:03,  1.00it/s, training_loss=1.362][A
Epoch 1:   0%|          | 2/1271 [00:02<21:03,  1.00it/s, training_loss=1.432][A
Epoch 1:   0%|          | 3/1271 [00:02<14:16,  1.48it/s, training_loss=1.432][A
Epoch 1:   0%|          | 3/1271 [00:02<14:16,  1.48it/s, training_loss=1.431][A
Epoch 1:   0%|          | 4/1271 [00:02<11:03,  1.91it/s, training_loss=1.431][A
Epoch 1:   0%|          | 4/1271 [00:03<11:03,  1.91it/s, training_loss=1.440][A
Epoch 1:   0%|          | 5/1271 [00:03<09:16,  2.28it/s, training_loss=1.440][A
Epoch 1:   0%|          | 5/1271 [00:03<09:16,  2.28it/s, training_loss=1.468][A
Epoch 1:   0%| 


Epoch 1
Training loss: 2.3098087324771845


 20%|██        | 1/5 [07:07<28:30, 427.73s/it]

Validation loss: 1.4743896122939057
F1 Score (Weighted): 0.6063206751832861
Recall@5: 0.9019316493313522



Epoch 2:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 2:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.918][A
Epoch 2:   0%|          | 1/1271 [00:00<06:42,  3.15it/s, training_loss=0.918][A
Epoch 2:   0%|          | 1/1271 [00:00<06:42,  3.15it/s, training_loss=0.510][A
Epoch 2:   0%|          | 2/1271 [00:00<06:44,  3.14it/s, training_loss=0.510][A
Epoch 2:   0%|          | 2/1271 [00:00<06:44,  3.14it/s, training_loss=0.244][A
Epoch 2:   0%|          | 3/1271 [00:00<06:47,  3.11it/s, training_loss=0.244][A
Epoch 2:   0%|          | 3/1271 [00:01<06:47,  3.11it/s, training_loss=0.838][A
Epoch 2:   0%|          | 4/1271 [00:01<06:53,  3.06it/s, training_loss=0.838][A
Epoch 2:   0%|          | 4/1271 [00:01<06:53,  3.06it/s, training_loss=0.425][A
Epoch 2:   0%|          | 5/1271 [00:01<06:50,  3.08it/s, training_loss=0.425][A
Epoch 2:   0%|          | 5/1271 [00:01<06:50,  3.08it/s, training_loss=0.196][A
Epoch 2:   0%|          | 6/1271 [00:01<06:48,  3.10


Epoch 2
Training loss: 1.255568378100818


 40%|████      | 2/5 [14:24<21:38, 432.80s/it]

Validation loss: 1.045277183175915
F1 Score (Weighted): 0.6863383068126755
Recall@5: 0.962852897473997



Epoch 3:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 3:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.460][A
Epoch 3:   0%|          | 1/1271 [00:00<06:41,  3.17it/s, training_loss=0.460][A
Epoch 3:   0%|          | 1/1271 [00:00<06:41,  3.17it/s, training_loss=0.732][A
Epoch 3:   0%|          | 2/1271 [00:00<06:42,  3.15it/s, training_loss=0.732][A
Epoch 3:   0%|          | 2/1271 [00:00<06:42,  3.15it/s, training_loss=0.355][A
Epoch 3:   0%|          | 3/1271 [00:00<06:47,  3.11it/s, training_loss=0.355][A
Epoch 3:   0%|          | 3/1271 [00:01<06:47,  3.11it/s, training_loss=0.446][A
Epoch 3:   0%|          | 4/1271 [00:01<06:49,  3.09it/s, training_loss=0.446][A
Epoch 3:   0%|          | 4/1271 [00:01<06:49,  3.09it/s, training_loss=0.254][A
Epoch 3:   0%|          | 5/1271 [00:01<06:48,  3.10it/s, training_loss=0.254][A
Epoch 3:   0%|          | 5/1271 [00:01<06:48,  3.10it/s, training_loss=0.114][A
Epoch 3:   0%|          | 6/1271 [00:01<06:48,  3.10


Epoch 3
Training loss: 0.9249951249639922


 60%|██████    | 3/5 [21:38<14:26, 433.49s/it]

Validation loss: 0.8600434106629756
F1 Score (Weighted): 0.7261731861827846
Recall@5: 0.9717682020802377



Epoch 4:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 4:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.003][A
Epoch 4:   0%|          | 1/1271 [00:00<06:34,  3.22it/s, training_loss=0.003][A
Epoch 4:   0%|          | 1/1271 [00:00<06:34,  3.22it/s, training_loss=0.432][A
Epoch 4:   0%|          | 2/1271 [00:00<06:38,  3.18it/s, training_loss=0.432][A
Epoch 4:   0%|          | 2/1271 [00:00<06:38,  3.18it/s, training_loss=0.189][A
Epoch 4:   0%|          | 3/1271 [00:00<06:44,  3.13it/s, training_loss=0.189][A
Epoch 4:   0%|          | 3/1271 [00:01<06:44,  3.13it/s, training_loss=0.394][A
Epoch 4:   0%|          | 4/1271 [00:01<06:45,  3.12it/s, training_loss=0.394][A
Epoch 4:   0%|          | 4/1271 [00:01<06:45,  3.12it/s, training_loss=0.249][A
Epoch 4:   0%|          | 5/1271 [00:01<06:48,  3.10it/s, training_loss=0.249][A
Epoch 4:   0%|          | 5/1271 [00:01<06:48,  3.10it/s, training_loss=0.757][A
Epoch 4:   0%|          | 6/1271 [00:01<06:49,  3.09


Epoch 4
Training loss: 0.762079257326989


 80%|████████  | 4/5 [28:54<07:14, 434.55s/it]

Validation loss: 0.7947616682439629
F1 Score (Weighted): 0.7344331106199063
Recall@5: 0.9791976225854383



Epoch 5:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 5:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.200][A
Epoch 5:   0%|          | 1/1271 [00:00<06:50,  3.10it/s, training_loss=0.200][A
Epoch 5:   0%|          | 1/1271 [00:00<06:50,  3.10it/s, training_loss=0.116][A
Epoch 5:   0%|          | 2/1271 [00:00<06:51,  3.09it/s, training_loss=0.116][A
Epoch 5:   0%|          | 2/1271 [00:00<06:51,  3.09it/s, training_loss=0.563][A
Epoch 5:   0%|          | 3/1271 [00:00<06:50,  3.09it/s, training_loss=0.563][A
Epoch 5:   0%|          | 3/1271 [00:01<06:50,  3.09it/s, training_loss=0.267][A
Epoch 5:   0%|          | 4/1271 [00:01<06:52,  3.07it/s, training_loss=0.267][A
Epoch 5:   0%|          | 4/1271 [00:01<06:52,  3.07it/s, training_loss=0.520][A
Epoch 5:   0%|          | 5/1271 [00:01<06:51,  3.08it/s, training_loss=0.520][A
Epoch 5:   0%|          | 5/1271 [00:01<06:51,  3.08it/s, training_loss=0.129][A
Epoch 5:   0%|          | 6/1271 [00:01<06:47,  3.10


Epoch 5
Training loss: 0.6835919609982125


100%|██████████| 5/5 [36:08<00:00, 433.75s/it]

Validation loss: 0.7440567450929019
F1 Score (Weighted): 0.7496632218792628
Recall@5: 0.9836552748885586





In [19]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_1.model', map_location=torch.device('cuda')))

_, predictions, true_vals, _ = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of 

{'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the I

In [20]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_2.model', map_location=torch.device('cuda')))

_, predictions, true_vals, _ = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of 

{'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the I

In [21]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_3.model', map_location=torch.device('cuda')))

_, predictions, true_vals, _ = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of 

{'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the I

In [22]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_4.model', map_location=torch.device('cuda')))

_, predictions, true_vals, _ = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of 

{'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the I

In [23]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_5.model', map_location=torch.device('cuda')))

_, predictions, true_vals, _ = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of 

{'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the I

In [24]:
# Create new dataset of project + project name and grant title
simple_df = clean_dataset
simple_df['project_profile'] =  simple_df['Applicants'] + ': ' + simple_df['Project Description'] + ' ' + simple_df['Project Name']
simple_df['grant_profile'] =  simple_df['opportunitytitle'] + ': ' + simple_df['description']
simple_df = simple_df[['project_profile','grant_profile'] ]
possible_grants = simple_df['grant_profile'].unique()

#substitute label with number instead
label_dict = {}
for index, possible_label in enumerate(possible_grants):
    label_dict[possible_label] = index

simple_df['label'] = simple_df['grant_profile'].replace(label_dict)
simple_df['grant_profile'].unique()[0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simple_df['project_profile'] =  simple_df['Applicants'] + ': ' + simple_df['Project Description'] + ' ' + simple_df['Project Name']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simple_df['grant_profile'] =  simple_df['opportunitytitle'] + ': ' + simple_df['description']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants: The Department of Transportation (DOT), Federal Aviation Administration (FAA) announces the opportunity to apply for approximately $ billion in FY  discretionary funds for the Airport Terminal Program (ATP), made available under the Infrastructure Investment and Jobs Act of  (IIJA), Pub. L. -, herein referred to as the Bipartisan Infrastructure Law (BIL). The purpose of the ATP is to make annual grants available to eligible airports for airport terminal and airport-owned Airport Traffic Control Towers development projects that address the aging infrastructure of our nation&#;s airports. In addition, ATP grants will align with DOT&#;s Strategic Framework FY- at https://www.transportation.gov/administrations/office-policy/fy--strategic-framework. The FY  ATP will be implemented consistent with law and in alignment with the priorities in Executive Order , Implementation of the In