In [89]:
!pip install bertopic



In [90]:
from bertopic import BERTopic

## Load corpus

In [91]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Source: sam.gov contract opportunities downloaded on 17 Sep 23
https://sam.gov/data-services/Contract%20Opportunities/datagov?privacy=Public

In [92]:
import pandas as pd
corpus = pd.read_csv('/content/drive/MyDrive/a_datasets/ContractOpportunitiesFullCSV.csv',encoding='latin1')

#min word count for description
min_word_count = 200
min_character_count = min_word_count*4.7

# Filter for only "Solicitation" entries with Descriptions, ensure that they're all non-null strings of required length
corpus = corpus[(corpus.BaseType=='Solicitation')&(corpus.Description.str.len()>min_character_count)]
abstracts = corpus[~corpus.Description.isnull()].Description.astype(str).tolist()
print('{} qualifying solicitation descriptions'.format(len(abstracts)))


Columns (27,34) have mixed types. Specify dtype option on import or set low_memory=False.



9097 qualifying solicitation descriptions


In [93]:
len(abstracts)

9097

## Create embeddings for each document in the corpus using the "all-Mini_LM-L6-V2" language model that is the default for BERTopic

In [94]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

Batches:   0%|          | 0/285 [00:00<?, ?it/s]

## Set up the pipeline for the remainder of the topic modeling process

Note: because the documents are now pre-embedded, you can tune hyperparameters and try different approaches much more quickly than you could otherwise

## 1. Reduce the dimensionality of the embeddings

In [101]:
from umap import UMAP

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=4)
## note: setting the random_state here prevents stochastic behavior when you re-run the pipeline

## 2. Cluster the resulting dense vectors

In [102]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


## 3. Prep for for topic modeling

In [103]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text


# Add stopwords that are specific to the RFP corpus and contain little to no meaning
custom_stop_words = ['solicitation','amendment','contract','questions','quotation','synopsis','solicitation','commercial',\
                     'notice','offer','proposal','attachment','purpose','offerors','proposal','attachment','purpose','phase',\
                     'task','acquisition','quotations','projects','attached','offers','contractor','government','award','clause','clauses'\
                     'additional information','provision','provisions','date','selection','attachments','quotes','quote','deliverable','indefinite'\
                     'offeror','offerors']

custom_stop_words = text.ENGLISH_STOP_WORDS.union(custom_stop_words)

custom_stop_words = list(custom_stop_words)

vectorizer_model = CountVectorizer(stop_words=custom_stop_words, min_df=2, ngram_range=(1, 4))
## note: the author of the library rarely finds removing stopwords from the topic naming process helpful.
## With this dataset, I found that it removed some noise from topic names.

In [104]:
type(list(custom_stop_words))

list

## 4. Select the representation model that will draw the words that best describe the core themes of each topic cluster

In [105]:
from bertopic.representation import KeyBERTInspired

# Create your representation model
representation_model = KeyBERTInspired()

# Use the representation model in BERTopic on top of the default pipeline
topic_model = BERTopic(representation_model=representation_model)

## 5. Run the pipeline to create a topic_model

In [106]:
from bertopic import BERTopic

from bertopic.representation import PartOfSpeech

# Create your representation model
representation_model = PartOfSpeech("en_core_web_sm")

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics, probs = topic_model.fit_transform(abstracts, embeddings)

2023-09-21 17:49:38,719 - BERTopic - Reduced dimensionality
2023-09-21 17:49:39,097 - BERTopic - Clustered reduced embeddings


## Review topics
Note: Topic -1 is the outliers that were not clustered into any other topics

In [107]:
topic_model.get_topic_info()[:25]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2457,-1_information_code_requirements_order,"[information, code, requirements, order, work,...",[CONTACT INFORMATION|4|N7M2.14|EFN|N/A|alexand...
1,0,379,0_update_section_time_answers,"[update, section, time, answers, site, proposa...",[Amendment P0002 for the National Aeronautics ...
2,1,208,1_clauses_services_offeror_business,"[clauses, services, offeror, business, small, ...",[Request for Quote (RFQ)  Benchmade Knives Th...
3,2,173,2_build_indefinite_services_construction,"[build, indefinite, services, construction, de...",[NOTICE - The following attachments have been ...
4,3,156,3_code_inspection_order_repair,"[code, inspection, order, repair, specificatio...",[CONTACT INFORMATION|4|N712.5|AEQ|717.605.1303...
5,4,150,4_qualified sources_additional information_eff...,"[qualified sources, additional information, ef...",[This is a COMBINED SYNOPSIS/SOLICITATION for ...
6,5,123,5_embassy_prior_prospective_discussions,"[embassy, prior, prospective, discussions, tim...",[Enclosed is a Request for Quotations (RFQ) fo...
7,6,106,6_ombudsman_bid_section_new,"[ombudsman, bid, section, new, project, enclos...",[Please see attached FRFP Questions and Answer...
8,7,94,7_meeting_bids_hereunder_project,"[meeting, bids, hereunder, project, effective,...","[EFFECTIVE JULY 13, 2021: The Chicago District..."
9,8,86,8_personal_development_pay_persons,"[personal, development, pay, persons, internat...",[SOLICITATION NUMBER: 720BHA23R00002 ...


In [108]:
docs = abstracts

In [None]:
from scipy.cluster import hierarchy as sch
# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

titles = corpus.title.tolist()
cluster_fig = topic_model.visualize_documents(titles, embeddings=embeddings)
# cluster_fig.write_html('/content/drive/MyDrive/a_datasets/topic_distro.html')
cluster_fig

In [None]:
titles = corpus.Title.tolist()
cluster_fig = topic_model.visualize_documents(titles, embeddings=embeddings)
# cluster_fig.write_html('/content/drive/MyDrive/a_datasets/topic_distro.html')
cluster_fig

In [None]:
def search_model(search_text):
  similar_topics, similarity = topic_model.find_topics(search_text, top_n=5)

  df = pd.DataFrame({'topic':topic_model.topics_,'title':corpus['Title'],'agency':corpus['Department/Ind.Agency'],'status':corpus['Active'],'description':corpus['Description'],'award_date':corpus['AwardDate']})
  hits = df[df.topic==similar_topics[0]][0:10]
  print('Topics:{}'.format(similar_topics))
  for i in hits[:10].itertuples():
    print(i.title)
    # print('https://pubmed.ncbi.nlm.nih.gov/' + str(i.pubmed_id))

In [None]:
corpus.columns

In [142]:
search_text = 'it services predictive analytics data engineering natural language processing python aws azure'
similar_topics, similarity = topic_model.find_topics(search_text, top_n=5)
pd.DataFrame({'topic':similar_topics,'similarity':similarity})

Unnamed: 0,topic,similarity
0,10,0.253622
1,30,0.251512
2,105,0.234656
3,93,0.226599
4,61,0.220074


In [143]:
topic_model.get_topic(10)

[('small business', 0.011894672143335448),
 ('portal', 0.011703821486069889),
 ('small', 0.011504768049825136),
 ('session', 0.011055395471382998),
 ('business', 0.010697409639197207),
 ('feedback', 0.010652526267803669),
 ('conference', 0.009639743204665112),
 ('drawing', 0.009082369110132807),
 ('members', 0.008533126712500217),
 ('team', 0.008353869198656185)]

In [132]:
df = pd.DataFrame({'notice_id':corpus['NoticeId'],'topic':topic_model.topics_,'link':corpus['Link'],'title':corpus['Title'],'agency':corpus['Department/Ind.Agency'],'status':corpus['Active'],'description':corpus['Description'],'award_date':corpus['AwardDate']})


In [144]:
for row in df[df.topic==10].itertuples():
  print(row.notice_id)
  print(row.title)
  print(row.link)

7522e479940643aebdcae94022795b7e
Forensic Science Manual (FS M) comparison macroscope and accessories
https://sam.gov/opp/7522e479940643aebdcae94022795b7e/view
289baf4c0f0449ecb3cccd58212a214f
Polaris GWAC Small Business Pool
https://sam.gov/opp/289baf4c0f0449ecb3cccd58212a214f/view
dfbf2cc473144443bed25c1406b4dff6
Polaris GWAC Service-Disabled Veteran-Owned Small Business Pool
https://sam.gov/opp/dfbf2cc473144443bed25c1406b4dff6/view
990c4846c3ca48868d64d96a8dbbdb9d
Polaris GWAC Women Owned Small Business Pool
https://sam.gov/opp/990c4846c3ca48868d64d96a8dbbdb9d/view
b51f3dc35611457fbb402a82c0cef99a
Polaris GWAC HubZone Pool
https://sam.gov/opp/b51f3dc35611457fbb402a82c0cef99a/view
1c4939a115e14b199bc0c1b51eafc1f6
SHAFT,LATCH
https://sam.gov/opp/1c4939a115e14b199bc0c1b51eafc1f6/view
8210ae33068444509da20b9e203e9179
RING,RETAINING
https://sam.gov/opp/8210ae33068444509da20b9e203e9179/view
8b21b1e421994e49b8f0107b67079745
SHAFT,SHOULDERED
https://sam.gov/opp/8b21b1e421994e49b8f0107b67079