In [1]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 
import pandas as pd
import glob
from pathlib import Path
import spacy


In [2]:
directory = "texts/history/US_Inaugural_Addresses"

In [3]:

files = glob.glob(f"{directory}/*.txt")

addresses = []
for file in files:
    with open(file, "r") as f:
        content = f.read()
        addresses.append(content)



In [4]:
len(addresses)

58

I'm dowloading here spacy's finetuned English Bert transformer because I'm getting better results with this model. 

In [6]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting spacy-transformers<1.3.0,>=1.2.0.dev0
  Downloading spacy_transformers-1.2.5-cp310-cp310-macosx_11_0_arm64.whl (173 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.9/173.9 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.1-cp310-cp310-macosx_11_0_arm64.whl (317 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy-alignments, spacy-transformers, en-core-web-trf
Successfully installed en-core-web-trf-3.5.0 spacy-alignments-0.9.1 spacy-transformers-1.2.5
[38;5;2

In [7]:
## Here I load spacy's bert model and exclude some pipelines I don't need. We only need here the tokenizer
nlp = spacy.load('en_core_web_trf', exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])

ValueError: [E002] Can't find factory for 'transformer' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, spancat_singlelabel, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer

I am also removing stopwords because it does improve the results. Video in youtube in which it is claimed that BERT is smart enough to deal with stopwords is wrong: BERTopic only uses for the BERT the embeddings, BERT does not extract the topics. For finding the topics, BERTopic uses other layers of precessing like TF-IDF (term frequency–inverse document frequency) and others.

In [117]:
#I get the stopwords from sklearn as a vectorized model
vectorizer_model = CountVectorizer(stop_words="english")

In [118]:
#Again, spacy's model is smaller, faster, and more efficient than bart-large-mnli. Can be run in a personal computer
#topic_model = BERTopic(embedding_model="facebook/bart-large-mnli",vectorizer_model=vectorizer_model, min_topic_size=2)
#
topic_model = BERTopic(embedding_model=nlp, vectorizer_model=vectorizer_model, min_topic_size=2)
topics, probs = topic_model.fit_transform(obituaries)



In [119]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,10,-1_people_government_freedom_shall
1,0,16,0_world_america_people_nation
2,1,6,1_government_country_public_people
3,2,6,2_government_union_states_people
4,3,5,3_great_government_states_people
5,4,4,4_constitution_power_government_union
6,5,4,5_let_world_new_know
7,6,4,6_men_nations_people_democracy
8,7,3,7_century_new_people_government


In [120]:
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
0,0,16
1,-1,10
2,1,6
3,2,6
4,3,5
5,4,4
6,5,4
7,6,4
8,7,3


In [121]:
topic_model.get_topic(3)

[('great', 0.03054465877409726),
 ('government', 0.026844889763954456),
 ('states', 0.02623015825541602),
 ('people', 0.023601844756724746),
 ('united', 0.02218885055653091),
 ('shall', 0.01849144350836386),
 ('power', 0.0182523477484895),
 ('revenue', 0.017867803036122298),
 ('citizens', 0.016832297357932857),
 ('country', 0.01671112937584359)]

In [122]:
topic_model.get_representative_docs(0)

['Warren G. Harding\t3/4/1921\tWhen one surveys the world about him after the great storm, noting the marks of destruction and yet rejoicing in the ruggedness of the things which withstood it, if he is an American he breathes the clarified atmosphere with a strange mingling of regret and new hope. We have seen a world passion spend its fury, but we contemplate our Republic unshaken, and hold our civilization secure. Liberty--liberty within the law--and civilization are inseparable, and though both were threatened we find them now secure; and there comes to Americans the profound assurance that our representative government is the highest expression and surest guaranty of both Standing in this presence, mindful of the solemnity of this occasion, feeling the emotions which no one may know until he senses the great weight of responsibility for himself, I must utter my belief in the divine inspiration of the founding fathers. Surely there must have been God\'s intent in the making of this 

In [123]:
df = pd.DataFrame({"topic": topics, "documents": obituaries})
df

Unnamed: 0,topic,documents
0,1,Martin Van Buren\t1837-03-04\tFellow-Citizens:...
1,0,"Richard Nixon\t1/20/1973\tI, RICHARD NIXON, do..."
2,0,"Ronald Reagan\t1/21/1985\tSenator Mathias, Chi..."
3,7,"Bill Clinton\t1/20/1997\tMy fellow citizens, a..."
4,3,Franklin Pierce\t1853-03-04\tMy Countrymen: It...
5,4,William Henry Harrison\t1841-03-04\tCalled fro...
6,5,"Barack Obama\t1/20/2009\tMy fellow citizens, I..."
7,-1,Grover Cleveland\t1885-03-04\tFellow-Citizens:...
8,1,John Adams\t1797-03-04\tWHEN it was first perc...
9,3,Andrew Jackson\t1833-03-04\tThe will of the Am...


In [124]:
topic_model.visualize_topics()


In [125]:
topic_model.visualize_barchart()

In [130]:
topic_model.visualize_hierarchy(top_n_topics=6)

In [None]:
topic_model.save("US_Inaugural_Addresses")

In [126]:
url = "https://raw.githubusercontent.com/melaniewalsh/BERT-4-Humanists/main/data/public-domain-poetry.csv"

poetry_df = pd.read_csv(url, encoding='utf-8')
# Show 5 random rows
poetry_df.sample(5)

Unnamed: 0,author,title,text,lifespan,birth_year,death_year,link,period
9545,Hattie Howard,Two Roses,I've a friend beyond the ocean\r\nSo regardful...,,,,http://public-domain-poetry.com/hattie-howard/...,
8284,Friedrich Schiller,The Philosophers,The principle by which each thing\r\nToward st...,10 November 1759-9 May 1805,1759.0,1805.0,http://public-domain-poetry.com/friedrich-schi...,18th Century
27589,Walter Savage Landor,On Himself,"I strove with none, for none was worth my stri...","January 30, 1775-September 17, 1864",1775.0,1864.0,http://public-domain-poetry.com/walter-savage-...,19th Century
437,Alexander Pope,"Prologue, Designed For Mr D'Urfey's Last Play","Grown old in rhyme, 'twere barbarous to discar...",21 May 1688-30 May 1744,1688.0,1744.0,http://public-domain-poetry.com/alexander-pope...,18th Century
4734,Dora Sigerson Shorter,Vale,"Good-bye, sweet friend, good-bye,\r\n And all...",1866-1918,1866.0,1918.0,http://public-domain-poetry.com/dora-sigerson-...,19th Century


In [127]:
# Filter the DataFrame for only a given time period, then randomly sample 1000 rows
nineteenth_sample = poetry_df[poetry_df['period'] == '19th Century'].sample(1000)
twentieth_sample = poetry_df[poetry_df['period'] == '20th Century'].sample(1000)
eighteenth_sample = poetry_df[poetry_df['period'] == '18th Century'].sample(1000)
sixteenth_sample = poetry_df[poetry_df['period'] == '16th-17th Centuries (Early Modern)'].sample(1000)

In [128]:
poetry_texts = twentieth_sample['text'].tolist()

In [129]:
topic_model = BERTopic(embedding_model=nlp, vectorizer_model=vectorizer_model, min_topic_size=2)
topics, probs = topic_model.fit_transform(poetry_texts)


In [133]:
topic_model.save("poems20th")

In [None]:
topic_model.load("poems20th")

In [134]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,248,-1_god_judith_love_like
1,0,28,0_music_walls_darkness_say
2,1,28,1_heart_hill_ione_street
3,2,26,2_bah_brown_weasel_smith
4,3,22,3_remember_let_prison_leningrad
...,...,...,...
116,115,2,115_walking_walk_speaks_wet
117,116,2,116_skies_twinings_beechen_world
118,117,2,117_strove_hiding_lamp_adream
119,118,2,118_christ_bullock_averted_chill


In [135]:
topic_model.get_topic(3)

[('remember', 0.014593390394124816),
 ('let', 0.009821480392041042),
 ('prison', 0.00910510115538854),
 ('leningrad', 0.008577170064542769),
 ('dom', 0.008577170064542769),
 ('fontannyi', 0.008577170064542769),
 ('1939', 0.00714040013567585),
 ('1940', 0.00714040013567585),
 ('boston', 0.0068617360516342145),
 ('memorial', 0.006442060977621102)]

In [139]:
topic_model.visualize_barchart(top_n_topics=15, n_words=7)

In [141]:
topic_model.visualize_heatmap()

In [144]:
topic_model.visualize_documents(poetry_texts)