<a href="https://colab.research.google.com/github/ainulyaqinmhd/GMD-FinalExam-GravityIsYourEnemy/blob/main/Natural_Language_Processing_BERT_immersion_topics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports used by the data-loading cells
import json
from zipfile import ZipFile

import pandas as pd
from google.colab import drive

# Mount Google Drive so the notebook can read files below /content/drive
drive.mount('/content/drive')
root_path = 'drive/My Drive/colabnotebooks/newsapi/'

Mounted at /content/drive


# BERTopic installation

In [None]:
%%capture
!pip install bertopic

In [None]:
%%capture
!pip install distributed==2021.9.0

# Data

Load the data that we want to use for topic modeling.

In [None]:
# Location of the zipped data on Google Drive; put the archive here before
# running the loading cell.
archive = (
    "/content/drive/My Drive/colabnotebooks/newsapi/"
    "newsapi.zip"
)

In [None]:
# Read every JSON file in the archive and flatten the nested records of the
# "articles" column into one DataFrame.
# The archive is opened exactly once and closed by the context manager;
# previously it was re-opened for every member and never closed.
with ZipFile(archive) as zip_file:
    frames = []
    for member in zip_file.namelist():
        if member.endswith("/"):
            # directory entries carry no JSON payload
            continue
        with zip_file.open(member) as handle:
            frames.append(pd.read_json(handle, encoding="utf-8"))

# For an archive that holds a single JSON file, reading only
# zip_file.namelist()[0] would be sufficient.
archive_df = pd.json_normalize(
    pd.concat(frames, ignore_index=True)["articles"]  # column name, if existent
)

FileNotFoundError: ignored

In [None]:
# Convert the timestamp to something that is easier to read and work with.
# The column is parsed once; date and year are both derived from that result.
published = pd.to_datetime(archive_df["publishedAt"])
dates = published.dt.date

# Use the datetime accessor instead of string-splitting the date: missing
# timestamps (NaT) then become NaN instead of raising a ValueError in int().
year = published.dt.year
archive_df["year"] = year
archive_df.head(1)

In [None]:
# summary statistics of archive_df, displayed as the cell output
archive_df.describe()

Next, we remove empty entries. (Duplicate removal is available below but currently commented out.)

In [None]:
# keep only the rows whose 'content' column is not missing
archive_df = archive_df.dropna(subset=["content"])

In [None]:
# drop duplicates
# archive_df.drop_duplicates(subset="content", inplace=True)

In [None]:
# archive_df = archive_df[(archive_df["year"] == 2020) & (archive_df["source.name"] == "The Economist")]
# archive_df = archive_df[archive_df["year"] == 2019]

In [None]:
# reset the index, otherwise everything gets messed up
# archive_df.reset_index(drop=True, inplace=True)

Check the amount of texts that we have:

In [None]:
# number of rows (texts) currently in archive_df
len(archive_df)

In [None]:
# number of rows per distinct value of the "source.name" column
archive_df["source.name"].value_counts()

# Training

In [None]:
from bertopic import BERTopic

In [None]:
# Fit a BERTopic model on the article texts.
# The documents are handed over as a plain list of strings: archive_df was
# filtered earlier without resetting its index, and a Series would carry that
# non-contiguous index into the model.
docs = archive_df["content"].tolist()

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
# topics: assigned topic per document; prob: per-document topic probabilities
# (requested through calculate_probabilities=True)
topics, prob = topic_model.fit_transform(docs)

# Extracting topics

In [None]:
# overview table of the topics; show its first five rows
freq = topic_model.get_topic_info()
freq.head(5)

In [None]:
# most important tokens for topic 0
topic_model.get_topic(0)

# Visualization

Different visualization options included in BERTopic.

## Visualizing topics

In [None]:
# plot the topics of the first (unreduced) model
topic_model.visualize_topics()

## Visualizing topic probabilities

The probabilities returned from `transform()` or `fit_transform()` (stored in `prob` in this notebook) can be used to understand how confident BERTopic is that certain topics can be found in a document.

Visualize it like this:

In [None]:
# topic probabilities of the document at position 3 of the training input
# NOTE(review): presumably topics below min_probability are left out of the plot - confirm
topic_model.visualize_distribution(prob[3], min_probability=0.015)

## Visualizing topic hierarchy

The created topics can be hierarchically reduced. Knowing how they relate to one another can help in selecting a value for `nr_topics` to reduce the number of created topics.

In [None]:
# hierarchy plot of the unreduced model, limited via top_n_topics to 300 topics
topic_model.visualize_hierarchy(top_n_topics=300)

## Visualize terms

Look at selected terms for certain topics.

In [None]:
# bar charts of selected terms, limited via top_n_topics to 5 topics
topic_model.visualize_barchart(top_n_topics=5)

## Visualize topic similarity

Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarities through those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other.

In [None]:
# topic similarity matrix drawn as a 1000x1000 heatmap
# NOTE(review): presumably n_clusters groups similar topics next to each other - confirm
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

## Visualizing topics over time

## Visualize term score decline

In [None]:
# term score decline plot for the unreduced model
topic_model.visualize_term_rank()

# Topic reduction

BERTopic offers a way to merge topics. One option is to specify the maximum amount of topics, but if this number is too low, merging occurs for topics that should not be merged.

Instead, we will use the parameter value `nr_topics="auto"` to merge topics that have a similarity of at least 0.9.

In [None]:
# Fit a second model that merges similar topics automatically (nr_topics="auto").
# The documents are passed as a plain list of strings so that the
# non-contiguous index of the filtered DataFrame is not carried into the model.
reduced_docs = archive_df["content"].tolist()

topic_model_reduced = BERTopic(
    nr_topics="auto",
    language="english",
    calculate_probabilities=True,
    verbose=True,
).fit(reduced_docs)

# topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
# topics_reduced, probs_reduced = topic_model.fit_transform(archive_df["content"])

In [None]:
# overview table of the topics of the reduced model
topic_model_reduced.get_topic_info()

In [None]:
# plot the topics of the reduced model
topic_model_reduced.visualize_topics()

In [None]:
# hierarchy plot of the reduced model, limited via top_n_topics to 98 topics
topic_model_reduced.visualize_hierarchy(top_n_topics=98)

In [None]:
# find all topics related to a given word
# TODO: figure out if/how we can combine multiple words; this call takes a
# single search string - probably chain multiple requests?
topic_model_reduced.find_topics("robotics")

In [None]:
# topic info restricted to topic 23
topic_model_reduced.get_topic_info(23)

In [None]:
# documents that are representative for topic 30
topic_model_reduced.get_representative_docs(30)

In [None]:
# "Name" column of the topic overview of the reduced model
topic_model_reduced.get_topic_info()["Name"]

In [None]:
# Print the name of topic 30. The value is read directly from the "Name"
# column instead of parsing the Series' printed representation, which depended
# on the exact spacing produced by to_string().
print(topic_model_reduced.get_topic_info(30)["Name"].iloc[0])

# Word cloud visualization

For every topic, get the representative texts and visualize them in a wordcloud.

In [None]:
import nltk

# NLTK resources required for stop word removal, lemmatization,
# tokenization and POS tagging
for resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

import string
import time

import matplotlib.pyplot as plt
# import tensorflow_text as tf_text

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

In [None]:
# Get the English stop words and a WordNet lemmatizer (no stemmer is created).
# NOTE: this rebinds the name `stopwords`, imported earlier from nltk.corpus,
# to the value returned by .words('english'); the lookup still works on re-run
# because it goes through nltk.corpus.stopwords.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the POS tag of ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    character of the tag selects adjective, noun, verb or adverb. Any other
    tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun
    return wordnet.NOUN

In [None]:
# Extra words removed in addition to the NLTK stop words.
# NOTE(review): the word-cloud cells compare single tokens against this list,
# so the multi-word entries presumably never match - confirm.
custom_stop_words = [
    'paper', 'study', 'article', 'approach', 'literature', 'data', 'analysis',
    'result', 'results', 'case study', 'case studies', 'chapter', 'findings',
    'finding', 'model', 'book', 'conference', 'say', 'will', 'Mr', 'Ms', 'Mrs',
    'year', 'one', 'headline', 'print', 'edition', 'print edition',
    'edition headline', 'li', 'ul',
]
stop_words = stopwords + custom_stop_words

In [None]:
# topic_model_reduced.get_topic_info()["Name"][0]

In [None]:
import os

# Build and save a word cloud for one topic of the reduced model.
i = 29

# Join the representative documents of the topic into one text
most_representative_texts = topic_model_reduced.get_representative_docs(i)
most_representative_texts = " ".join(most_representative_texts).replace("\r\n", " ")

# Lemmatize every token that is not punctuation, then drop the stop words
tokens = nltk.word_tokenize(most_representative_texts)
lemmatized_text = " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens if w not in string.punctuation])
stop_word_set = set(stop_words)  # set membership instead of scanning the list per token
stopworded_text = " ".join(w for w in nltk.word_tokenize(lemmatized_text) if w not in stop_word_set)

wordcloud = WordCloud(width=900, height=600,
                      background_color='white').generate(stopworded_text)

# Take the topic name directly from the "Name" column rather than parsing the
# printed representation of the Series.
topic_name = topic_model_reduced.get_topic_info(i)["Name"].iloc[0]

# filename = "/content/drive/MyDrive/colab_data/wordclouds/economist/topic_{}.png".format(i)
filename = "/content/drive/MyDrive/colab_data/wordclouds/bsh/{}.png".format(topic_name)
# savefig fails if the target directory does not exist yet
os.makedirs(os.path.dirname(filename), exist_ok=True)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig(filename, dpi=400)

In [None]:
import os

# Build and save one word cloud per topic of the reduced model.
output_dir = "/content/drive/MyDrive/colab_data/wordclouds/bbc"
# savefig fails if the target directory does not exist yet
os.makedirs(output_dir, exist_ok=True)

stop_word_set = set(stop_words)  # built once; set membership per token
topic_info = topic_model_reduced.get_topic_info()

# Iterate over the topics the model actually has instead of a hard-coded
# range, and pair each topic id with its own name so the file label cannot be
# shifted by one when the overview has no outlier row.
for topic_id, topic_name in zip(topic_info["Topic"], topic_info["Name"]):
    topic_id = int(topic_id)
    if topic_id == -1:
        # -1 is the outlier bucket, not a real topic
        continue

    most_representative_texts = topic_model_reduced.get_representative_docs(topic_id)
    most_representative_texts = " ".join(most_representative_texts).replace("\r\n", " ")

    tokens = nltk.word_tokenize(most_representative_texts)
    lemmatized_text = " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens if w not in string.punctuation])
    stopworded_text = " ".join(w for w in nltk.word_tokenize(lemmatized_text) if w not in stop_word_set)

    wordcloud = WordCloud(width=900, height=600,
                          background_color='white').generate(stopworded_text)

    # filename = "/content/drive/MyDrive/colab_data/wordclouds/economist/topic_{}.png".format(i)
    filename = "/content/drive/MyDrive/colab_data/wordclouds/bbc/{}.png".format(topic_name)

    # Clear the current figure so images do not pile up on the same axes
    plt.clf()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig(filename, dpi=400)

In [None]:
# NOTE(review): load_model and joblib are imported here but not used by any
# active line of this cell.
from keras.models import load_model
import joblib
# Save the reduced topic model to Google Drive
topic_model_reduced.save("/content/drive/MyDrive/colab_data/economist/BERT_model")
#joblib.dump()


#keras.models.save_model(topic_model_reduced, "/content/drive/MyDrive/colab_data/BERT_model.h5")

In [None]:
# my_model = BERTopic.load("my_model")	

In [None]:
# topics of the reduced model related to the search term "kitchen"
topic_model_reduced.find_topics("kitchen")

In [None]:
# topics of the reduced model related to the search term "cooking"
topic_model_reduced.find_topics("cooking")

In [None]:
# topics of the reduced model related to the search term "hidden tech"
topic_model_reduced.find_topics("hidden tech")

In [None]:
# topics of the reduced model related to the search term "cyber security"
topic_model_reduced.find_topics("cyber security")

In [None]:
# topics of the reduced model related to the search term "self sufficiency"
topic_model_reduced.find_topics("self sufficiency")

In [None]:
# topics of the reduced model related to the search term "circular economy"
topic_model_reduced.find_topics("circular economy")

In [None]:
# topics of the reduced model related to the search term "cloud computing"
topic_model_reduced.find_topics("cloud computing")

In [None]:
# topics of the reduced model related to the search term "recommendation systems"
topic_model_reduced.find_topics("recommendation systems")

In [None]:
# topic info restricted to topic 13
topic_model_reduced.get_topic_info(13)

In [None]:
# topic info restricted to topic 6
topic_model_reduced.get_topic_info(6)

In [None]:
# topic info restricted to topic 3
topic_model_reduced.get_topic_info(3)

In [None]:
# topic info restricted to topic 18
topic_model_reduced.get_topic_info(18)

In [None]:
# topic info restricted to topic 88
topic_model_reduced.get_topic_info(88)