In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup


import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification


# Helper Function

In [None]:
import re
def remove_numbers(text):
    if isinstance(text, str):
        return re.sub(r'\d+', '', text)
    return text


def remove_years(text):
    return text


# Import Data

In [None]:
dataset = pd.read_csv('BIL Launchpad Case studies - Sheet1.csv')

"""
Cleaning the data:
- only include columns we will use: 'Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description'
- drop column with missing information
- currently, the grant opportunities include years, i will omit this from their title to group recurring grants together
"""
dataset = dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description']]
clean_dataset = dataset.dropna()
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())
clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())

79
55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)


In [None]:
clean_dataset

Unnamed: 0,Project Name,Project Description,Applicants,opportunitytitle,description
0,Fairbanks International Airport,This award funds the replacement of the passen...,Fairbanks International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
1,Ted Stevens Anchorage International Airport,This award funds the installation of 110 audio...,Ted Stevens Anchorage International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
2,Ted Stevens Anchorage International Airport,This award funds the installation of a new pas...,Ted Stevens Anchorage International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
3,Phoenix Sky Harbor International Airport,"This award funds the replacement of chillers, ...",Phoenix Sky Harbor International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
4,Yuma International Airport,This award funds upgrading and replacing secur...,Yuma International Airport,FY Notice of Funding Opportunity: Bipartisan ...,"The Department of Transportation (DOT), Federa..."
...,...,...,...,...,...
5664,buy 52 new light rail vehicles,The Maryland Department of Transportation Mary...,Maryland Department of Transportation (Marylan...,FY Competitive Funding Opportunity: Rail Vehi...,The Federal Transit Administration (FTA) annou...
5665,buy up to 200 new rail cars to replace older r...,The Southeastern Pennsylvania Transportation A...,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,The Federal Transit Administration (FTA) annou...
5666,buy up to 200 new rail cars to replace older r...,The Southeastern Pennsylvania Transportation A...,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,The Federal Transit Administration (FTA) annou...
5681,Colorado Department of Transportation_CO_Buses...,"The Colorado Department of Transportation, on ...",Colorado Department of Transportation,*Grants for Buses and Bus Facilities Program,The Federal Transit Administration (FTA) annou...


# Experiment 1: Classification of project <> grant name (multi-class text classification) (ignore description of program)

In [None]:
# Create new dataset of project + project name and grant title
simple_df = clean_dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle']]
simple_df['project_profile'] =  simple_df['Applicants'] + ': ' + simple_df['Project Description'] + ' ' + simple_df['Project Name']
simple_df = simple_df[['project_profile','opportunitytitle'] ]
possible_grants = simple_df['opportunitytitle'].unique()

#substitute label with number instead
label_dict = {}
for index, possible_label in enumerate(possible_grants):
    label_dict[possible_label] = index

simple_df['label'] = simple_df['opportunitytitle'].replace(label_dict)
simple_df


Unnamed: 0,project_profile,opportunitytitle,label
0,Fairbanks International Airport: This award fu...,FY Notice of Funding Opportunity: Bipartisan ...,0
1,Ted Stevens Anchorage International Airport: T...,FY Notice of Funding Opportunity: Bipartisan ...,0
2,Ted Stevens Anchorage International Airport: T...,FY Notice of Funding Opportunity: Bipartisan ...,0
3,Phoenix Sky Harbor International Airport: This...,FY Notice of Funding Opportunity: Bipartisan ...,0
4,Yuma International Airport: This award funds u...,FY Notice of Funding Opportunity: Bipartisan ...,0
...,...,...,...
5664,Maryland Department of Transportation (Marylan...,FY Competitive Funding Opportunity: Rail Vehi...,53
5665,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,53
5666,Southeastern Pennsylvania Transportation Autho...,FY Competitive Funding Opportunity: Rail Vehi...,53
5681,Colorado Department of Transportation: The Col...,*Grants for Buses and Bus Facilities Program,54


In [None]:
simple_df['project_profile'][0]

'Fairbanks International Airport: This award funds the replacement of the passenger boarding bridge at Gate 3. Fairbanks International Airport'

# Train validation split

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(simple_df.index.values,
                                                  simple_df.label.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=simple_df.label.values)

simple_df['data_type'] = ['not_set']*simple_df.shape[0]

simple_df.loc[X_train, 'data_type'] = 'train'
simple_df.loc[X_val, 'data_type'] = 'val'

simple_df.groupby(['opportunitytitle', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,project_profile
opportunitytitle,label,data_type,Unnamed: 3_level_1
Competitive Funding Opportunity: Pilot Program for Transit-Oriented Development (TOD) Planning,14,train,17
Competitive Funding Opportunity: Pilot Program for Transit-Oriented Development (TOD) Planning,14,val,3
Tribal Transportation Program Safety Fund,8,train,149
Tribal Transportation Program Safety Fund,8,val,26
*FY Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants,33,train,159
...,...,...,...
Strengthening Mobility and Revolutionizing Transportation (SMART) Grants Program,40,val,9
The Infrastructure Investment and Jobs Act (IIJA) Notice of Funding Opportunity for America&#;s Marine Highway Program,32,train,10
The Infrastructure Investment and Jobs Act (IIJA) Notice of Funding Opportunity for America&#;s Marine Highway Program,32,val,2
United States Marine Highway Grants,51,train,7


# Tokenisation

In [None]:
"""
Constructs a BERT tokenizer. Based on WordPiece.
Instantiate a pre-trained BERT model configuration to encode our data.
To convert all the titles from text into encoded form, we use a function called batch_encode_plus , and we will proceed train and validation data separately.
The 1st parameter inside the above function is the title text.
add_special_tokens=True means the sequences will be encoded with the special tokens relative to their model.
When batching sequences together, we set return_attention_mask=True, so it will return the attention mask according to the specific tokenizer defined by the max_length attribute.
We also want to pad all the titles to certain maximum length.
We actually do not need to set max_length=256, but just to play it safe.
return_tensors='pt' to return PyTorch.
And then we need to split the data into input_ids, attention_masks and labels.
Finally, after we get encoded data set, we can create training data and validation data.
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    simple_df[simple_df.data_type=='train'].project_profile.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    #max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    simple_df[simple_df.data_type=='val'].project_profile.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    #max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(simple_df[simple_df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(simple_df[simple_df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)




# BERT

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

device = 'cpu'
if torch.cuda.is_available():
    device = torch.cuda.current_device()
print(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)

epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [26]:
import torch
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = torch.argmax(preds, dim=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat.cpu().numpy(), preds_flat.cpu().numpy(), average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = torch.argmax(preds, dim=1).flatten()
    labels_flat = labels.flatten()

    class_accuracies = {}
    for label in torch.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        accuracy = (y_preds == label).sum().item() / len(y_true)
        class_accuracies[label_dict_inverse[label.item()]] = accuracy
        print(f'Class: {label_dict_inverse[label.item()]}')
        print(f'Accuracy: {(y_preds == label).sum().item()}/{len(y_true)} ({accuracy:.4f})\n')

    return class_accuracies

def recall_at_k(y_true, y_pred, k=5):
    num_samples = y_true.size(0)
    recall_count = 0

    for true_label, predictions in zip(y_true, y_pred):
        top_k_predictions = torch.topk(predictions, k).indices  # Get indices of the top k predictions
        if true_label in top_k_predictions:
            recall_count += 1

    recall_at_k_score = recall_count / num_samples
    return recall_at_k_score

def evaluate(dataloader_val):
    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu()
        label_ids = inputs['labels'].cpu()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total / len(dataloader_val)

    predictions = torch.cat(predictions, dim=0)
    true_vals = torch.cat(true_vals, dim=0)

    recall_at_5 = recall_at_k(true_vals, predictions, k=5)

    return loss_val_avg, predictions, true_vals, recall_at_5


In [24]:
import random
import torch
import numpy as np
from tqdm import tqdm

# Set the random seeds for reproducibility
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Ensure to use GPU if available
model.to(device)

# Training loop
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2],
        }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': f'{loss.item() / len(batch):.3f}'})

    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    loss_val_avg, predictions, true_vals, recall_at_5 = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {loss_val_avg}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Recall@5: {recall_at_5}')


  0%|          | 0/5 [00:00<?, ?it/s]
Epoch 1:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.514][A
Epoch 1:   0%|          | 1/1271 [00:00<07:48,  2.71it/s, training_loss=0.514][A
Epoch 1:   0%|          | 1/1271 [00:00<07:48,  2.71it/s, training_loss=0.622][A
Epoch 1:   0%|          | 2/1271 [00:00<07:01,  3.01it/s, training_loss=0.622][A
Epoch 1:   0%|          | 2/1271 [00:00<07:01,  3.01it/s, training_loss=0.755][A
Epoch 1:   0%|          | 3/1271 [00:00<06:44,  3.13it/s, training_loss=0.755][A
Epoch 1:   0%|          | 3/1271 [00:01<06:44,  3.13it/s, training_loss=1.200][A
Epoch 1:   0%|          | 4/1271 [00:01<06:35,  3.21it/s, training_loss=1.200][A
Epoch 1:   0%|          | 4/1271 [00:01<06:35,  3.21it/s, training_loss=0.685][A
Epoch 1:   0%|          | 5/1271 [00:01<06:29,  3.25it/s, training_loss=0.685][A
Epoch 1:   0%|          | 5/1271 [00:01<06:29,  3.25it/s, training_loss=0.789][A
Epoch 1:   0%| 


Epoch 1
Training loss: 1.485474429968935


 20%|██        | 1/5 [07:09<28:36, 429.01s/it]

Validation loss: 1.06739525972969
F1 Score (Weighted): 0.6696044260421984
Recall@5: 0.9583952451708767



Epoch 2:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 2:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.839][A
Epoch 2:   0%|          | 1/1271 [00:00<06:41,  3.16it/s, training_loss=0.839][A
Epoch 2:   0%|          | 1/1271 [00:00<06:41,  3.16it/s, training_loss=0.351][A
Epoch 2:   0%|          | 2/1271 [00:00<06:41,  3.16it/s, training_loss=0.351][A
Epoch 2:   0%|          | 2/1271 [00:00<06:41,  3.16it/s, training_loss=0.064][A
Epoch 2:   0%|          | 3/1271 [00:00<06:39,  3.17it/s, training_loss=0.064][A
Epoch 2:   0%|          | 3/1271 [00:01<06:39,  3.17it/s, training_loss=0.575][A
Epoch 2:   0%|          | 4/1271 [00:01<06:40,  3.16it/s, training_loss=0.575][A
Epoch 2:   0%|          | 4/1271 [00:01<06:40,  3.16it/s, training_loss=0.299][A
Epoch 2:   0%|          | 5/1271 [00:01<06:39,  3.17it/s, training_loss=0.299][A
Epoch 2:   0%|          | 5/1271 [00:01<06:39,  3.17it/s, training_loss=0.033][A
Epoch 2:   0%|          | 6/1271 [00:01<06:39,  3.17


Epoch 2
Training loss: 0.926874428406434


 40%|████      | 2/5 [14:22<21:34, 431.64s/it]

Validation loss: 0.7921861243185898
F1 Score (Weighted): 0.7581724944459965
Recall@5: 0.9821693907875185



Epoch 3:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 3:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.481][A
Epoch 3:   0%|          | 1/1271 [00:00<06:38,  3.18it/s, training_loss=0.481][A
Epoch 3:   0%|          | 1/1271 [00:00<06:38,  3.18it/s, training_loss=0.212][A
Epoch 3:   0%|          | 2/1271 [00:00<06:42,  3.15it/s, training_loss=0.212][A
Epoch 3:   0%|          | 2/1271 [00:00<06:42,  3.15it/s, training_loss=0.144][A
Epoch 3:   0%|          | 3/1271 [00:00<06:44,  3.14it/s, training_loss=0.144][A
Epoch 3:   0%|          | 3/1271 [00:01<06:44,  3.14it/s, training_loss=0.077][A
Epoch 3:   0%|          | 4/1271 [00:01<06:43,  3.14it/s, training_loss=0.077][A
Epoch 3:   0%|          | 4/1271 [00:01<06:43,  3.14it/s, training_loss=0.160][A
Epoch 3:   0%|          | 5/1271 [00:01<06:44,  3.13it/s, training_loss=0.160][A
Epoch 3:   0%|          | 5/1271 [00:01<06:44,  3.13it/s, training_loss=0.090][A
Epoch 3:   0%|          | 6/1271 [00:01<06:43,  3.14


Epoch 3
Training loss: 0.694829112779053


 60%|██████    | 3/5 [21:37<14:26, 433.07s/it]

Validation loss: 0.7204306911087285
F1 Score (Weighted): 0.7754230674538404
Recall@5: 0.9881129271916791



Epoch 4:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 4:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.002][A
Epoch 4:   0%|          | 1/1271 [00:00<06:43,  3.15it/s, training_loss=0.002][A
Epoch 4:   0%|          | 1/1271 [00:00<06:43,  3.15it/s, training_loss=0.377][A
Epoch 4:   0%|          | 2/1271 [00:00<06:43,  3.15it/s, training_loss=0.377][A
Epoch 4:   0%|          | 2/1271 [00:00<06:43,  3.15it/s, training_loss=0.187][A
Epoch 4:   0%|          | 3/1271 [00:00<06:44,  3.14it/s, training_loss=0.187][A
Epoch 4:   0%|          | 3/1271 [00:01<06:44,  3.14it/s, training_loss=0.159][A
Epoch 4:   0%|          | 4/1271 [00:01<06:45,  3.13it/s, training_loss=0.159][A
Epoch 4:   0%|          | 4/1271 [00:01<06:45,  3.13it/s, training_loss=0.088][A
Epoch 4:   0%|          | 5/1271 [00:01<06:48,  3.10it/s, training_loss=0.088][A
Epoch 4:   0%|          | 5/1271 [00:01<06:48,  3.10it/s, training_loss=0.715][A
Epoch 4:   0%|          | 6/1271 [00:01<06:45,  3.12


Epoch 4
Training loss: 0.5758132709816383


 80%|████████  | 4/5 [28:51<07:13, 433.71s/it]

Validation loss: 0.6737579104231878
F1 Score (Weighted): 0.7840595970640883
Recall@5: 0.9895988112927192



Epoch 5:   0%|          | 0/1271 [00:00<?, ?it/s][A
Epoch 5:   0%|          | 0/1271 [00:00<?, ?it/s, training_loss=0.183][A
Epoch 5:   0%|          | 1/1271 [00:00<06:46,  3.12it/s, training_loss=0.183][A
Epoch 5:   0%|          | 1/1271 [00:00<06:46,  3.12it/s, training_loss=0.021][A
Epoch 5:   0%|          | 2/1271 [00:00<06:58,  3.03it/s, training_loss=0.021][A
Epoch 5:   0%|          | 2/1271 [00:00<06:58,  3.03it/s, training_loss=0.279][A
Epoch 5:   0%|          | 3/1271 [00:00<06:52,  3.07it/s, training_loss=0.279][A
Epoch 5:   0%|          | 3/1271 [00:01<06:52,  3.07it/s, training_loss=0.154][A
Epoch 5:   0%|          | 4/1271 [00:01<06:49,  3.10it/s, training_loss=0.154][A
Epoch 5:   0%|          | 4/1271 [00:01<06:49,  3.10it/s, training_loss=0.457][A
Epoch 5:   0%|          | 5/1271 [00:01<06:50,  3.08it/s, training_loss=0.457][A
Epoch 5:   0%|          | 5/1271 [00:01<06:50,  3.08it/s, training_loss=0.026][A
Epoch 5:   0%|          | 6/1271 [00:01<06:53,  3.06


Epoch 5
Training loss: 0.51770970256421


100%|██████████| 5/5 [36:07<00:00, 433.58s/it]

Validation loss: 0.6627012163514479
F1 Score (Weighted): 0.7926523718777186
Recall@5: 0.9895988112927192





In [27]:
model.load_state_dict(torch.load('finetuned_BERT_epoch_1.model', map_location=torch.device('cpu')))

_, predictions, true_vals, _ = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Class: FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants
Accuracy: 14/25 (0.5600)

Class: FY  Competitive Funding Opportunity: Airport Improvement Program Discretionary Grants
Accuracy: 163/163 (1.0000)

Class: Grants for Buses and Bus Facilities Program
Accuracy: 27/28 (0.9643)

Class: Solicitation of Project Proposals for the Low or No Emission (Low-No) Program
Accuracy: 0/6 (0.0000)

Class: FY  Competitive Funding Opportunity: Passenger Ferry Grant Program
Accuracy: 5/5 (1.0000)

Class: Pilot Program for Transit-Oriented Development (TOD) Planning
Accuracy: 0/5 (0.0000)

Class: Public Transportation on Indian Reservations Program
Accuracy: 0/6 (0.0000)

Class: INFRA Grants
Accuracy: 18/28 (0.6429)

Class:  Tribal Transportation Program Safety Fund
Accuracy: 26/26 (1.0000)

Class: Low or No Emission Program (Low-No Program)
Accuracy: 13/14 (0.9286)

Class: Passenger Ferry Grant Program
Accuracy: 0/2 (0.0000)

Class: FY  Compe

{'FY  Notice of Funding Opportunity: Bipartisan Infrastructure Law (BIL) Airport Terminal Program (ATP) Grants': 0.56,
 'FY  Competitive Funding Opportunity: Airport Improvement Program Discretionary Grants': 1.0,
 'Grants for Buses and Bus Facilities Program': 0.9642857142857143,
 'Solicitation of Project Proposals for the Low or No Emission (Low-No) Program': 0.0,
 'FY  Competitive Funding Opportunity: Passenger Ferry Grant Program': 1.0,
 'Pilot Program for Transit-Oriented Development (TOD) Planning': 0.0,
 'Public Transportation on Indian Reservations Program': 0.0,
 'INFRA Grants': 0.6428571428571429,
 ' Tribal Transportation Program Safety Fund': 1.0,
 'Low or No Emission Program (Low-No Program)': 0.9285714285714286,
 'Passenger Ferry Grant Program': 0.0,
 'FY  Competitive Funding Opportunity: Public Transportation on Indian Reservations Program': 0.0,
 '*INFRA Grants': 0.0,
 'Fiscal Year  Competitive Funding Opportunity; Grants for Buses and Bus Facilities Program': 0.0,
 ' Co

In [None]:

"""
we will summarise the project down to a multi-class text classification.
To do so:
1. We will concatenate 'project name' and 'project description' together
2.
"""