In [90]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import re

from gensim.models import Doc2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument

In [99]:
def remove_year(string):
    if isinstance(string, str):
        return re.sub(r'\s20\d{2}(\s|\-\d{2})?', ' ', string)
    return string

def remove_numbers(text):
    if isinstance(text, str):
        return re.sub(r'\d+', '', text)
    return text

In [104]:
dataset = pd.read_csv("case_studies.csv")
"""
Cleaning the data:
- only include columns we will use: 'Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description'
- drop column with missing information
- currently, the grant opportunities include years, i will omit this from their title to group recurring grants together
"""
dataset = dataset[['Project Name', 'Project Description', 'Applicants', 'opportunitytitle', 'description']]
clean_dataset = dataset.dropna()
print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())
clean_dataset['description'] = clean_dataset['description'].apply(remove_year)
clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)

print(len(clean_dataset['opportunitytitle'].unique()))
# print(dataset['opportunitytitle'].unique())

79
55


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['description'] = clean_dataset['description'].apply(remove_year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_dataset['opportunitytitle'] = clean_dataset['opportunitytitle'].apply(remove_numbers)


In [115]:
# import dataset
df = clean_dataset.copy()

df['opportunitytitle'] = df['opportunitytitle'].apply(remove_year)
df['description'] = df['description'].apply(remove_year)

# separate and isolate applicant and grant details
df['applicant_details'] = df['Applicants'] + ' ' + df['Project Description']
df['grant_details'] = df['opportunitytitle']

# merge them into one for vector embedding
applications = df['applicant_details'].to_list()
grants = df['grant_details'].unique()

grant_names = df['opportunitytitle'].unique()
print(len(grant_names))
print(len(grants))

all = applications.copy()
all.extend(grants)

old_df = df.copy()
df = pd.DataFrame({'details': all})
df.tail(5)

55
55


Unnamed: 0,details
4534,SMART Grants Notice of Funding
4535,United States Marine Highway Grants
4536,Rural Surface Transportation Grant Progam
4537,FY Competitive Funding Opportunity: Rail Vehi...
4538,*Grants for Buses and Bus Facilities Program


In [116]:
# prepare dataset for model
df['details'] = df['details'].astype(str).fillna('') # remove NaNs and non-strings
df['cleaned_details'] = df['details'].apply(lambda x: simple_preprocess(x)) # remove auxiliary stuff
df.head(5)

Unnamed: 0,details,cleaned_details
0,Fairbanks International Airport This award fun...,"[fairbanks, international, airport, this, awar..."
1,Ted Stevens Anchorage International Airport Th...,"[ted, stevens, anchorage, international, airpo..."
2,Ted Stevens Anchorage International Airport Th...,"[ted, stevens, anchorage, international, airpo..."
3,Phoenix Sky Harbor International Airport This ...,"[phoenix, sky, harbor, international, airport,..."
4,Yuma International Airport This award funds up...,"[yuma, international, airport, this, award, fu..."


In [117]:
# prepare TaggedDocument objects
documents = [TaggedDocument(words=row['cleaned_details'], tags=[index]) for index, row in df.iterrows()]

# train doc2vec model
model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=40)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

# infer vectors for each item
df['vector'] = df['cleaned_details'].apply(lambda x: model.infer_vector(x))


In [110]:
df.head(5)

Unnamed: 0,details,cleaned_details,vector
0,Fairbanks International Airport This award fun...,"[fairbanks, international, airport, this, awar...","[-0.100922406, 0.28740832, -0.13542156, 0.0151..."
1,Ted Stevens Anchorage International Airport Th...,"[ted, stevens, anchorage, international, airpo...","[0.42132458, -0.17316939, -0.3352974, 0.096254..."
2,Ted Stevens Anchorage International Airport Th...,"[ted, stevens, anchorage, international, airpo...","[0.0013673275, 0.3800731, -0.10873813, 0.12498..."
3,Phoenix Sky Harbor International Airport This ...,"[phoenix, sky, harbor, international, airport,...","[-0.14518051, 0.45197237, -0.3929733, -0.30838..."
4,Yuma International Airport This award funds up...,"[yuma, international, airport, this, award, fu...","[0.97314936, 0.73296005, -0.21525623, -0.24233..."


In [111]:
print(len(applications))
print(len(grants))

4484
66


In [124]:
from sklearn.metrics.pairwise import cosine_similarity

grant_df = df.copy()
grant_df = grant_df[len(applications):]

# Function to get top K recommendations
def get_recommendations(item_vector, k=5):
    similarities = cosine_similarity([item_vector], grant_df['vector'].tolist())[0]
    similar_indices = similarities.argsort()[-k:][::-1]  # Get top k indices
    return grant_df.iloc[similar_indices]['details'].tolist()

# Example of getting recommendations for the first item
count_correct = 0
with open('doc2vec_outputs.txt', 'w') as file:
    for i in range(len(applications)):

        item_vector = df.iloc[i]['vector']
        recommendations = get_recommendations(item_vector)

        file.write(f"Grant: {df.iloc[i]['details'][:200]}:\n")
        for rec in recommendations:
            file.write(rec + '\n')
            if rec == remove_numbers(str(clean_dataset.iloc[i]['opportunitytitle'])):
                count_correct += 1
        file.write('\n')

print(count_correct / len(applications))

0.10370205173951828
