## Setting Up and Importing Data

In [17]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer
from umap import UMAP


from pprint import pprint as pp

from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

import pandas as pd
import re

import pickle

<IPython.core.display.Javascript object>

In [18]:
data = pd.read_csv("data/2021-6_2022-8_NYtimes_headlines.csv")

In [19]:
data.sample(20, random_state=58)

Unnamed: 0,headline,date,doc_type,material_type,section,keywords
Loading... (need help?),,,,,,


In [20]:
print(f"There are {len(data)} rows of headlines from NYTimes in this dataset")

There are 160224 rows of headlines from NYTimes in this dataset


In [21]:
docs = (data["headline"].astype(str) + " | " + data["keywords"].apply(lambda x: re.sub(r"[^A-Za-z\s]", "", x))).tolist()
timeline = data["date"].apply(lambda x: pd.Timestamp(x)).to_list()

In [22]:
assert len(docs) == data.shape[0], "Something wrong with data conversion! It is not the same size as the original dataset"

## Load Saved Models
Skip the line and just load the saved models!

In [23]:
topic_model = BERTopic.load("models/nytimes_bertopic")

In [24]:
embeddings = pickle.load(open("models/embeddings.pkl", "rb"))

In [25]:
topics_over_time = pd.read_csv("models/topics_over_time.csv")

In [26]:
reduced_embeddings = pickle.load(open("models/reduced_embeddings.pkl", 'rb'))

In [29]:
topic_model.get_representative_docs(92)

['Musk Says His Twitter Takeover Is ‘On Hold,’ Then Says He’s ‘Still Committed’ | Social Media Mergers Acquisitions and Divestitures Spam Electronic Rumors and Misinformation Regulation and Deregulation of Industry',
 'What’s Next in the Elon Musk-Twitter Saga? A Court Battle | Mergers Acquisitions and Divestitures Social Media Suits and Litigation Civil',
 'Twitter Counters a Musk Takeover With a Time-Tested Barrier | Social Media Mergers Acquisitions and Divestitures Shareholder Rights and Activism']

## Topic Modeling

In [68]:
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english'))) + ['http', 'https', 'amp', 'com']

vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1,3))

sentence_model = SentenceTransformer("all-mpnet-base-v2")

embeddings = sentence_model.encode(docs, show_progress_bar=False)

topic_model = BERTopic(embedding_model=sentence_model,
                       verbose=True,
                       vectorizer_model=vectorizer_model,
                       diversity=0.2)
topics, probs = topic_model.fit_transform(docs)

topic_model.save("models/nytimes_bertopic")
pickle.dump(embeddings, open("models/embeddings.pkl", "wb"))

Batches: 100%|██████████| 5007/5007 [02:10<00:00, 38.39it/s]
2022-08-16 16:10:32,274 - BERTopic - Transformed documents to Embeddings
2022-08-16 16:14:11,137 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2022-08-16 16:14:26,162 - BERTopic - Clustered reduced embeddings


## Extracting Topics

In [8]:
freq = topic_model.get_topic_info()
freq

Topic,Count,Name
Loading... (need help?),,


In [194]:
freq.to_csv("reports/frequency.csv")

In [101]:
# for topic in freq["Topic"][1:31]:
#     print(f"Displaying representative titles for topic no {topic}: Words {freq[freq['Topic']==topic]['Name'].values[0].split('_')}")
#     print(" - " + "\n - ".join(topic_model.get_representative_docs(topic)))
#     print("-----------------------------------------------------------------------------")

In [78]:
topics_over_time = topic_model.topics_over_time(docs, topics, timeline, 
                                                datetime_format="%b%M%Y", nr_bins=20)
topics_over_time.to_csv("models/topics_over_time.csv")

20it [42:16, 126.84s/it]


In [11]:
topic_model.visualize_topics_over_time(topics_over_time)

In [11]:
topic_model.visualize_topics_over_time(topics_over_time, topics=[2, 9, 83, 102, 4])

In [12]:
def get_topic_over_time(topic_num, topics_over_time=topics_over_time):
    title = topic_model.get_topic_info(topic_num)["Name"].values[0]
    print(title)
    print()
    
    texts = topic_model.get_representative_docs(topic_num)
    print("-" + " \n\n-".join(texts))
    fig = topic_model.visualize_topics_over_time(topics_over_time, topics=[topic_num])
    fig.update_layout(
        title=title, 
        font=dict(
            size=15
            )
    )
    fig.show()

    return fig, texts

In [13]:
def save_text(filepath, texts):
    texts = "\n".join(texts)
    with open(filepath, "w") as f:
        f.write(texts)

In [14]:
def generate_topic_timeline_report(topic_num, file_title):
    fig, texts = get_topic_over_time(topic_num)
    fig.write_html(f"reports/{file_title}.html")
    save_text(f"reports/{file_title}.txt", texts)

In [15]:
generate_topic_timeline_report(9, "masks")

9_masks coronavirus_masks coronavirus ncov_coronavirus ncov masks_mask mandates

-Does Wearing Glasses Protect You From Coronavirus? | Eyes and Eyesight Eyeglasses Coronavirus nCoV Face Protective Clothing and Gear 

-For passengers in midflight on Monday, there were cheers and alarm as a U.S. mask mandate ended. | Masks AIRLINES AND AIRPLANES Coronavirus nCoV 

-What to do with all those disposable masks? Turn them into roads, researchers say. | internalessential


In [189]:
# russian - ukraine war
generate_topic_timeline_report(2, "russian_ukraine")

2_russian invasion_invasion ukraine_russian invasion ukraine_ukraine defense military

-Looking to Help Ukrainian Refugees? A ‘Voluntourism’ Guide | Russian Invasion of Ukraine  Volunteers and Community Service Refugees and Displaced Persons 

-Ukraine War Forces a Question: How Far East Does Europe Go? | Russian Invasion of Ukraine  Politics and Government Defense and Military Forces 

-A New Surge of Ukrainians at U.S. Border | Refugees and Displaced Persons Immigration and Emigration Russian Invasion of Ukraine  United States Politics and Government United States International Relations


In [190]:
generate_topic_timeline_report(topic_num = 83, file_title = "christmas")

83_christmas_christmas trees_christmas tree_culture christmas

-Quotation of the Day: In a Tiny, Jolly Spy for Santa, Watchdogs Spot Big Brother |  

-‘A Present From Norway and It’s Dead’: Christmas Tree Unites London in Dismay | Christmas Trees and Shrubs Christmas Trees 

-Silent Night: Scenes of a Pandemic Christmas | Shopping and Retail Quarantine Life and Culture Christmas ECommerce Holidays and Special Occasions


In [196]:
generate_topic_timeline_report(topic_num = 102, file_title = "india_covid")

102_india_india coronavirus_india covid_vaccination immunization india

-‘An Out-of-Body Experience’: Inside India’s Covid Crisis | Coronavirus Risks and Safety Concerns Coronavirus nCoV Quarantines Hospitals Anxiety and Stress Oxygen Shutdowns Institutional Deaths Fatalities 

-India’s Coronavirus Crisis Overwhelms Health System | Coronavirus nCoV Disease Rates Hospitals Medicine and Health Shortages 

-India’s true pandemic death toll is likely to be well over 3 million, a new study finds. | Disease Rates Deaths Fatalities Coronavirus nCoV Research


In [192]:
generate_topic_timeline_report(topic_num = 12, file_title = "vaccine")

12_vaccination immunization coronavirus_immunization coronavirus_immunization coronavirus ncov_ncov vaccination

-A New Entry in the Race for a Coronavirus Vaccine: Hope | Coronavirus nCoV Vaccination and Immunization Clinical Trials Factories and Manufacturing United States Politics and Government Immune System Research yourfeedscience 

-Vaccinated People Can Get Covid, but It’s Most Likely Very Rare | Vaccination and Immunization Coronavirus nCoV Clinical Trials Disease Rates yourfeedhealthcare 

-Fauci calls booster shots ‘likely,’ not now but in future, citing early signs that vaccine immunity may wane. | Coronavirus nCoV Medicine and Health Vaccination and Immunization


In [193]:
generate_topic_timeline_report(topic_num = 16, file_title = "california_wildfires")

16_wildfires_wildfires fires_wildfires fires firefighters_california wildfires

-California’s fires are stretching crews and stranding evacuees. | Wildfires Fires and Firefighters Forests and Forestry Evacuations and Evacuees 

-Is It Safe to Exercise if the Air Is Hazy With Wildfire Smoke? | Exercise Athletics and Sports Masks Air Pollution Wildfires Content Type Service Protective Clothing and Gear Medicine and Health Dogs 

-‘Very Unhealthy’ Air Quality Forces M.L.B. to Reschedule Games | Baseball Wildfires


In [198]:
generate_topic_timeline_report(topic_num = 30, file_title = "omicron")

30_coronavirus omicron variant_coronavirus omicron_variant coronavirus_omicron variant coronavirus

-Your Monday Briefing | Coronavirus Omicron Variant 

-The first Omicron case has been detected in the U.S. | Coronavirus Omicron Variant Coronavirus nCoV Vaccination and Immunization 

-Omicron: A Big Deal About Small ‘O’ | internalsubonlynl Coronavirus Omicron Variant Coronavirus Delta Variant Greek Language English Language


In [31]:
generate_topic_timeline_report(topic_num = 92, file_title = "elon_twitter")

92_musk_elon_elon musk_twitter

-Musk Says His Twitter Takeover Is ‘On Hold,’ Then Says He’s ‘Still Committed’ | Social Media Mergers Acquisitions and Divestitures Spam Electronic Rumors and Misinformation Regulation and Deregulation of Industry 

-What’s Next in the Elon Musk-Twitter Saga? A Court Battle | Mergers Acquisitions and Divestitures Social Media Suits and Litigation Civil 

-Twitter Counters a Musk Takeover With a Time-Tested Barrier | Social Media Mergers Acquisitions and Divestitures Shareholder Rights and Activism


In [32]:
generate_topic_timeline_report(topic_num = 630, file_title = "tesla")

630_tesla_automobiles electric hybrid_automobiles electric_tesla reports

-Why Tesla Soared as Other Automakers Struggled to Make Cars | Automobiles Electric and Hybrid Vehicles Batteries Factories and Manufacturing Supply Chain Computer Chips Shortages Shutdowns Institutional 

-Tesla Reports 87% Increase in 2021 Deliveries | Electric and Hybrid Vehicles Company Reports Automobiles Supply Chain Automobile Safety Features and Defects Factories and Manufacturing 

-Tesla’s quarterly profit nearly quintuples to $1.6 billion as car sales surge. | Electric and Hybrid Vehicles Automobiles Company Reports Factories and Manufacturing Production Driverless and Semiautonomous Vehicles Automobile Safety Features and Defects Traffic Accidents and Safety


In [33]:
generate_topic_timeline_report(topic_num = 68, file_title = "crypto")

68_virtual currency_cryptocurrency_virtual currency bitcoin_currency bitcoin currency

-Cryptocurrency Start-Up Underpaid Women and Black Employees, Data Shows | Discrimination Black People Blacks Women and Girls Wages and Salaries Hiring and Promotion Labor and Jobs Startups Virtual Currency 

-Inside a Corporate Culture War Stoked by a Crypto C.E.O. | Bitcoin Currency Virtual Currency Layoffs and Job Reductions Computers and the Internet Workplace Environment Startups 

-Crypto Crash Widens a Divide: ‘Those With Money Will End Up Being Fine’ | Virtual Currency Layoffs and Job Reductions Bitcoin Currency Labor and Jobs Computers and the Internet Currency Executive Compensation Unemployment Careers and Professions High Net Worth Individuals Startups


## Visualize Documents

In [22]:
# create reduced_embeddings with UMAP
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

f_name = 'models/reduced_embeddings.pkl'
pickle.dump(reduced_embeddings, open(f_name, 'wb'))

In [30]:
fig_doc = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, topics=[6, 2, 9, 83, 102, 16, 30, 120, 501, 378, 92])
fig_doc.write_html("reports/visualize_documents_selected_reduced_2.html")