<a href="https://colab.research.google.com/github/agawronski/word-embeddings/blob/main/20211114_2_hierarchical_patterns_topics_temporal_direction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Optional setup step: uncomment the line below to install BERTopic when it is
# not already available in the kernel's environment.
# !pip install bertopic

In [None]:
from sklearn.linear_model import LinearRegression
from bertopic import BERTopic
import pandas as pd

In [None]:
# Load the article dataset from its hosted CSV, then convert the
# `datePublished` column to pandas datetimes.
data = pd.read_csv(
    'https://word-emeddings.s3.us-west-2.amazonaws.com/20211107_main_article_dataframe_small.csv'
)
data['datePublished'] = pd.to_datetime(data['datePublished'])

In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,abstract,fullText,datePublished
0,<em>Waldsterben</em>. We are familiar with tha...,['Managing what Cannot be Managed On the Possi...,2017-01-01
1,,"['References Aaron HJ, “How Not to Reform Medi...",2014-01-01
2,,['APPENDIX B Federal Geospatial Data Sources I...,2004-01-01
3,In 2012 I stood on the fourth floor of the Shi...,['6 Health City In 2012 I stood on the fourth ...,2016-01-01
4,A clean energy transition² is an increasingly ...,['© 2019 The International Institute for Susta...,2019-11-01


In [12]:
# Topic modeling: fit BERTopic on the `fullText` column. fit_transform returns
# a pair, kept here as `topics` and `probs`; `topics` is reused below when
# computing topics over time.
# NOTE(review): the topic terms printed further down are dominated by function
# words ('on', 'with', 'to', ...) — consider stop-word removal before relying on
# the topic names. No random seed is set here, so presumably the topics can
# differ between runs — TODO confirm.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(data['fullText'])

# Per-topic summary table (displayed in a later cell).
freq = topic_model.get_topic_info()

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

2021-11-15 05:13:50,651 - BERTopic - Transformed documents to Embeddings
2021-11-15 05:13:58,598 - BERTopic - Reduced dimensionality with UMAP
2021-11-15 05:13:58,673 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [15]:
# Show every topic's top terms with their weights, keyed by topic id.
topic_model.get_topics()

{-1: [('on', 0.011824509618097969),
  ('with', 0.010827350165348103),
  ('to', 0.01081259251370511),
  ('are', 0.010236746026480507),
  ('by', 0.010210788726260574),
  ('be', 0.009917633479678148),
  ('has', 0.008081524897607677),
  ('at', 0.007556283509579183),
  ('their', 0.006872936439821583),
  ('development', 0.006107898383142128)],
 0: [('energy', 0.012513402702892142),
  ('climate', 0.011789516429669801),
  ('in', 0.011478808959473261),
  ('development', 0.010070831860499284),
  ('change', 0.00892103806765539),
  ('emissions', 0.0072180398013768415),
  ('sustainable', 0.007132099633421098),
  ('environmental', 0.007077897919185383),
  ('population', 0.006683758100908108),
  ('index', 0.006589487418612333)],
 1: [('women', 0.018660986364903438),
  ('her', 0.016025139331659517),
  ('she', 0.013422067118618392),
  ('we', 0.01272258077676623),
  ('with', 0.011235948361112712),
  ('at', 0.010917987212941196),
  ('my', 0.009802237168544238),
  ('womens', 0.009653754829825112),
  ('be'

In [13]:
# Display the topic summary table (columns: Topic, Count, Name).
freq

Unnamed: 0,Topic,Count,Name
0,-1,336,-1_on_with_to_are
1,0,150,0_energy_climate_in_development
2,1,143,1_women_her_she_we
3,2,103,2_health_alcohol_tb_covid19
4,3,88,3_is_security_has_us
5,4,60,4_roma_social_muslim_eu
6,5,53,5_de_la_el_los
7,6,36,6_us_states_nuclear_united
8,7,35,7_black_women_percent_health
9,8,33,8_pharmaceutical_drugs_patent_drug


In [9]:
# Plot bar charts for the top 10 topics; the figure height is set to 2000.
topic_model.visualize_barchart(top_n_topics=10, height=2000)

In [10]:
# Visualize the top 20 topics.
topic_model.visualize_topics(top_n_topics=20)

In [11]:
# Visualize the topic hierarchy for the top 50 topics at a width of 800.
topic_model.visualize_hierarchy(top_n_topics=50, width=800)

In [17]:
# Compute per-topic words and frequencies over time from the documents, their
# fitted topic assignments and their publication dates, using nr_bins=70.
topics_over_time = topic_model.topics_over_time(
    data['fullText'],
    topics,
    data['datePublished'],
    nr_bins=70,
)

36it [05:13,  8.70s/it]


In [21]:
# To restrict the plot to selected topics, pass a `topics` list instead, e.g.
# topic_model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])
topic_model.visualize_topics_over_time(topics_over_time)

In [22]:
# Add `time_int`: each row's Timestamp expressed as whole seconds since the
# Unix epoch, giving a plain integer time axis for the regression below.
topics_over_time['time_int'] = topics_over_time['Timestamp'].map(
    lambda stamp: int(stamp.timestamp())
)

In [23]:
# Count how many rows (time bins) each topic has in `topics_over_time`.
topic_counts = topics_over_time['Topic'].value_counts().reset_index()
topic_counts

Unnamed: 0,index,Topic
0,1,32
1,-1,17
2,5,14
3,0,12
4,4,12
5,2,12
6,14,12
7,3,10
8,7,10
9,11,9


In [24]:
def get_regression_slope(topic_num, frame=None):
    """Return the slope of a linear fit of a topic's frequency against time.

    The rows belonging to ``topic_num`` are selected, and ``Frequency`` is
    regressed on ``time_int`` with ordinary least squares.

    Parameters
    ----------
    topic_num : int
        Topic id, matched against the ``Topic`` column.
    frame : pandas.DataFrame, optional
        Table with ``Topic``, ``time_int`` and ``Frequency`` columns. When
        omitted, the notebook-level ``topics_over_time`` frame is used, which
        keeps existing one-argument calls working unchanged.

    Returns
    -------
    float
        The fitted coefficient: change in ``Frequency`` per unit of
        ``time_int``. ``time_int`` is built in this notebook as seconds since
        the epoch, so the slopes are numerically very small.
    """
    if frame is None:
        # Resolved at call time so the function follows the current global.
        frame = topics_over_time

    # Filter once and take both regression columns from the same selection.
    topic_rows = frame.loc[frame.Topic == topic_num, ['time_int', 'Frequency']]

    # scikit-learn expects a 2-D feature matrix; y is kept 2-D as well so that
    # coef_ has shape (1, 1), matching the indexing below.
    X = topic_rows['time_int'].to_numpy().reshape(-1, 1)
    y = topic_rows['Frequency'].to_numpy().reshape(-1, 1)

    reg = LinearRegression().fit(X, y)
    return reg.coef_[0][0]

In [25]:
# One row per (Topic, Name) pair, annotated with the slope of that topic's
# frequency-over-time regression and ordered from steepest increase down.
get_topic_names = (
    topics_over_time[['Topic', 'Name']]
    .drop_duplicates()
    .assign(slope=lambda pairs: pairs['Topic'].map(get_regression_slope))
    .sort_values('slope', ascending=False)
)
get_topic_names

Unnamed: 0,Topic,Name,slope
38,2,2_health_alcohol_tb_covid19,5.314534e-08
10,-1,-1_on_with_to_are,5.084408e-08
128,9,9_ai_nuclear_cyber_systems,4.671889e-08
152,16,16_corruption_transparency_anticorruptio...,3.158834e-08
51,3,3_is_security_has_us,2.891777e-08
16,0,0_energy_climate_in_development,2.292282e-08
97,21,21_brazil_health_global_amazon,1.542063e-08
134,22,22_programs_tech_program_york,1.194517e-08
39,4,4_roma_social_muslim_eu,9.11577e-09
75,6,6_us_states_nuclear_united,8.05193e-09


In [26]:
# Inspect the over-time rows for topic -1.
topics_over_time[topics_over_time['Topic'] == -1]

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name,time_int
10,-1,"for, are, with, be, to",1,1975-04-15 02:03:25.714285568,-1_on_with_to_are,166759405
27,-1,"population, has, are, environmental, be",2,1992-05-03 22:37:42.857142784,-1_on_with_to_are,704932662
31,-1,"for, with, pb, feminist, her",2,1995-10-01 17:08:34.285713920,-1_on_with_to_are,812567314
33,-1,"by, for, with, women, feminist",2,1997-06-15 14:24:00.000000000,-1_on_with_to_are,866384640
36,-1,"pollution, air, particulate, health, exposure",1,1999-02-28 11:39:25.714285568,-1_on_with_to_are,920201965
40,-1,"forest, in, asthma, forestry, local",7,2000-11-12 08:54:51.428571136,-1_on_with_to_are,974019291
47,-1,"committee, south, aids, committees, is",4,2002-07-28 06:10:17.142857216,-1_on_with_to_are,1027836617
58,-1,"seed, corn, crops, soybean, crop",2,2004-04-11 03:25:42.857142784,-1_on_with_to_are,1081653942
68,-1,"in, clusters, ecoindustrial, resource, local",10,2005-12-25 00:41:08.571428352,-1_on_with_to_are,1135471268
81,-1,"for, with, to, their, investment",15,2007-09-08 21:56:34.285713920,-1_on_with_to_are,1189288594


In [27]:
# Inspect the over-time rows for topic 24.
topics_over_time[topics_over_time['Topic'] == 24]

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name,time_int
80,24,"media, australian, federalism, government, fed...",2,2005-12-25 00:41:08.571428352,24_media_australian_crime_asylum,1135471268
99,24,"olympic, games, olympics, beijing, ioc",1,2007-09-08 21:56:34.285713920,24_media_australian_crime_asylum,1189288594
135,24,"mncs, host, bhagwati, countries, media",1,2011-02-05 16:27:25.714285568,24_media_australian_crime_asylum,1296923245
157,24,"media, australian, asylum, social, politics",4,2012-10-20 13:42:51.428571136,24_media_australian_crime_asylum,1350740571
180,24,"crime, australian, australia, harms, media",2,2014-07-05 10:58:17.142857216,24_media_australian_crime_asylum,1404557897
206,24,"asylum, austria, european, border, borders",1,2016-03-19 08:13:42.857142784,24_media_australian_crime_asylum,1458375222
232,24,"video, media, participatory, participants, com...",3,2017-12-02 05:29:08.571428352,24_media_australian_crime_asylum,1512192548
256,24,"thunberg, greta, narratives, soros, disinforma...",1,2019-08-17 02:44:34.285713920,24_media_australian_crime_asylum,1566009874


In [34]:
# Turn the {topic id: [(word, weight), ...]} mapping into a table with one row
# per topic and one column per term rank.
ting = pd.DataFrame(topic_model.get_topics()).T

In [41]:
# Display the topic-by-rank table of (word, weight) tuples.
ting

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-1,"(on, 0.011824509618097969)","(with, 0.010827350165348103)","(to, 0.01081259251370511)","(are, 0.010236746026480507)","(by, 0.010210788726260574)","(be, 0.009917633479678148)","(has, 0.008081524897607677)","(at, 0.007556283509579183)","(their, 0.006872936439821583)","(development, 0.006107898383142128)"
0,"(energy, 0.012513402702892142)","(climate, 0.011789516429669801)","(in, 0.011478808959473261)","(development, 0.010070831860499284)","(change, 0.00892103806765539)","(emissions, 0.0072180398013768415)","(sustainable, 0.007132099633421098)","(environmental, 0.007077897919185383)","(population, 0.006683758100908108)","(index, 0.006589487418612333)"
1,"(women, 0.018660986364903438)","(her, 0.016025139331659517)","(she, 0.013422067118618392)","(we, 0.01272258077676623)","(with, 0.011235948361112712)","(at, 0.010917987212941196)","(my, 0.009802237168544238)","(womens, 0.009653754829825112)","(be, 0.009166702637876117)","(our, 0.008600008461659682)"
2,"(health, 0.02009079941115313)","(alcohol, 0.015297453360695792)","(tb, 0.014851608370689112)","(covid19, 0.011265778298841345)","(countries, 0.011132889846350188)","(2020, 0.010566297206339257)","(virus, 0.010277028464820755)","(global, 0.00872362347600258)","(pda1, 0.008555433951112829)","(disease, 0.008181219454550335)"
3,"(is, 0.011969798220473615)","(security, 0.01154828647876164)","(has, 0.010414420882959454)","(us, 0.009946829484398513)","(government, 0.008597464198114866)","(are, 0.007990109133827158)","(political, 0.007563326525484806)","(forces, 0.007560214061567433)","(afghan, 0.007362287688562322)","(iraq, 0.0070481896284589705)"
4,"(roma, 0.018679130782749822)","(social, 0.010839769057791474)","(muslim, 0.010243109294992407)","(eu, 0.010150873933973164)","(their, 0.01012743240581683)","(migration, 0.009107509857827083)","(muslims, 0.008758442033508649)","(be, 0.008548128028695678)","(were, 0.008394135885689493)","(programme, 0.008249819137540481)"
5,"(de, 0.07327473316216698)","(la, 0.04740852959997394)","(el, 0.036636117155949645)","(los, 0.025708077206841134)","(denver, 0.021279776703218456)","(por, 0.01837777588074744)","(las, 0.016304039799162975)","(colorado, 0.015890277815474578)","(una, 0.01405810430475464)","(nueva, 0.008217118108660534)"
6,"(us, 0.013825833567369727)","(states, 0.012073449820180831)","(nuclear, 0.011853827762897737)","(united, 0.010536107374685627)","(security, 0.00955522374520431)","(space, 0.009421949599685096)","(military, 0.008850551573449049)","(has, 0.008111495542598147)","(defense, 0.0077082960201212206)","(at, 0.007447843247794084)"
7,"(black, 0.028226710947434983)","(women, 0.026488835102519172)","(percent, 0.02248439474510844)","(health, 0.01789140479675464)","(care, 0.016569940260115493)","(medicaid, 0.016399778370514442)","(states, 0.01582663597023201)","(coverage, 0.015085178766680137)","(insurance, 0.009668881678396793)","(medicare, 0.00898707065720119)"
8,"(pharmaceutical, 0.025740805456511175)","(drugs, 0.022930673573066856)","(patent, 0.02072925879417809)","(drug, 0.01928746929794297)","(medicines, 0.014878231431663377)","(trips, 0.012207923241667924)","(health, 0.011041552635619904)","(canada, 0.009921441435825448)","(products, 0.009830606564364242)","(bangladesh, 0.009161167682554169)"


In [40]:
# Keep only the word (first element) of every (word, weight) tuple.
ting2 = ting.apply(lambda column: column.map(lambda pair: pair[0]))

In [43]:
# Write the word-only topic table to a CSV file; the path is relative, so it
# lands in the current working directory.
ting2.to_csv('20211114_dynamic_topics.csv')

In [44]:
# Display the word-only topic table.
ting2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-1,on,with,to,are,by,be,has,at,their,development
0,energy,climate,in,development,change,emissions,sustainable,environmental,population,index
1,women,her,she,we,with,at,my,womens,be,our
2,health,alcohol,tb,covid19,countries,2020,virus,global,pda1,disease
3,is,security,has,us,government,are,political,forces,afghan,iraq
4,roma,social,muslim,eu,their,migration,muslims,be,were,programme
5,de,la,el,los,denver,por,las,colorado,una,nueva
6,us,states,nuclear,united,security,space,military,has,defense,at
7,black,women,percent,health,care,medicaid,states,coverage,insurance,medicare
8,pharmaceutical,drugs,patent,drug,medicines,trips,health,canada,products,bangladesh
