In [5]:
def read_data(file):
    """Return the non-blank lines of a UTF-8 text file, whitespace-stripped.

    Parameters
    ----------
    file : path of the text file to read (anything ``open`` accepts).

    Returns
    -------
    list of str
        One entry per line that still has content after ``str.strip()``,
        in file order. An empty file yields an empty list.

    Notes
    -----
    The file is decoded with ``errors='ignore'``, so bytes that are not
    valid UTF-8 are dropped silently instead of raising.
    """
    with open(file, 'r', encoding='utf-8', errors='ignore') as handle:
        # Strip each line once, then keep only the ones that are not empty.
        stripped_lines = (raw.strip() for raw in handle)
        return [text for text in stripped_lines if text != ""]

In [6]:
from pathlib import Path

DATA_DIR = Path("data")              # folder holding one "<year>.txt" file per year
FIRST_YEAR, LAST_YEAR = 2000, 2018   # inclusive range of yearly files to load

# Load every yearly text file into memory, keyed by year.
data_dict = {}
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    print(f"Reading data for year: {year}")
    # Build the path with pathlib rather than a hand-written "data//..." string,
    # whose doubled separator was redundant.
    data_dict[year] = read_data(DATA_DIR / f"{year}.txt")

Reading data for year: 2000
Reading data for year: 2001
Reading data for year: 2002
Reading data for year: 2003
Reading data for year: 2004
Reading data for year: 2005
Reading data for year: 2006
Reading data for year: 2007
Reading data for year: 2008
Reading data for year: 2009
Reading data for year: 2010
Reading data for year: 2011
Reading data for year: 2012
Reading data for year: 2013
Reading data for year: 2014
Reading data for year: 2015
Reading data for year: 2016
Reading data for year: 2017
Reading data for year: 2018


In [7]:
import warnings
# Install a blanket "ignore" filter: from here on, every warning (deprecations
# included) is suppressed until the warning filters are changed again.
warnings.filterwarnings("ignore")

In [8]:
%%time
from bertopic import BERTopic
model = BERTopic(verbose=True, embedding_model='paraphrase-MiniLM-L3-v2', min_topic_size= 7)
headline_topics, _ = model.fit_transform(data_dict[2018])

Batches: 100%|██████████| 9044/9044 [13:44<00:00, 10.97it/s] 
2022-11-23 09:26:19,543 - BERTopic - Transformed documents to Embeddings
2022-11-23 10:38:09,820 - BERTopic - Reduced dimensionality
2022-11-23 10:39:56,358 - BERTopic - Clustered reduced embeddings


CPU times: total: 8h 33min 52s
Wall time: 1h 28min 41s


In [9]:
# Per-topic summary table of the fitted model (columns in the output: Topic, Count, Name).
freq = model.get_topic_info()
# len(freq) counts table rows, so the Topic == -1 row is included in this total.
print(f"Number of topics: {len(freq)}")
freq.head()

Number of topics: 6920


Unnamed: 0,Topic,Count,Name
0,-1,55776,-1_employees_substitute_letter_tion
1,0,8716,0_2011_2015_american_enacted
2,1,8573,1_community_low_income_distressed
3,2,1303,2_paragraph_paragraphs_adding_illustrate
4,3,836,3_california_berkeley__


In [10]:
# Pick the first real topic for inspection. Topic -1 is BERTopic's outlier bucket,
# so exclude it by label instead of assuming it always occupies row 0 of `freq`
# (the previous `freq.iloc[1]` picked the wrong topic whenever that did not hold).
a_topic = int(freq.loc[freq["Topic"] != -1, "Topic"].iloc[0])
model.get_topic(a_topic)  # Show the words and their c-TF-IDF scores

[('2011', 0.011258826056327643),
 ('2015', 0.010274562219420535),
 ('american', 6.774380431035996e-05),
 ('enacted', 5.9511296260311375e-05),
 ('respectively', 5.2432992137561175e-05),
 ('2010', 2.6671437969350832e-05),
 ('based', 1.190444347444e-05),
 ('when', 1.143465994847161e-05),
 ('include', 1.0781126114791035e-05),
 ('13', 8.841427895779255e-06)]

In [11]:
model.visualize_barchart(top_n_topics=6)

In [12]:
model.visualize_topics()