<a href="https://colab.research.google.com/github/ayamlearning/masters_thesis_model_final/blob/main/RQ3_Topc_Modelling_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
# Uncomment the line below on a fresh runtime where BERTopic is not installed yet.
#!pip install bertopic

In [54]:
# Mount Google Drive at /content/drive; the model directory used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
import pandas as pd
import time
from matplotlib import pyplot as plt
from datetime import datetime
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
import re
import joblib

In [56]:
# Sample dataset hosted in the thesis repository; every column is read as text.
path = "https://raw.githubusercontent.com/ayamlearning/masters_thesis_model_final/main/data/sample.csv"
df = pd.read_csv(
    path,
    sep=",",
    dtype=str,
)

In [57]:
# Drive folder from which the saved topic models and pickled embeddings are loaded.
g_path = "/content/drive/MyDrive/models"

In [58]:
# Load the saved BERTopic model for each sentiment class from the Drive folder.
# The files are read in the order negative, neutral, positive.
negative_topic_model, neutral_topic_model, positive_topic_model = (
    BERTopic.load(g_path + "/model_" + sentiment + ".bin")
    for sentiment in ("negative", "neutral", "positive")
)



```
# This is formatted as code
```

## Neutral Sentiments

In [59]:
# Neutral-sentiment subset. `.copy()` yields an independent frame, so the cleaned
# text written below never targets a slice of `df` (no SettingWithCopyWarning).
df_query = df.query("vader_polarity == 'Neutral'").copy()
docs = [str(doc) for doc in df_query["prep_text"]]

# Clean the raw text with vectorised string methods instead of three row-wise
# `apply` passes. Steps, in order:
#   1. remove URLs, then lowercase;
#   2. drop whitespace-delimited tokens that start with "@";
#   3. collapse every run of characters outside a-z/A-Z into one space and trim.
# Rows with missing text stay missing instead of raising inside a lambda.
df_query["text"] = (
    df_query["text"]
    .str.replace(r"http\S+", "", regex=True)
    .str.lower()
    .str.replace(r"(?:^|\s)@\S*", " ", regex=True)
    .str.replace(r"[^a-zA-Z]+", " ", regex=True)
    .str.strip()
)
timestamps = df_query["date"].to_list()
tweets = df_query["text"].to_list()

In [60]:
# Render BERTopic's bar-chart view of the neutral model with its default settings.
neutral_topic_model.visualize_barchart()

In [61]:
# Compute topic representations over 20 time bins from the cleaned tweets and
# their dates, then plot the result limited to 8 topics.
# NOTE(review): this passes `tweets` (cleaned `text`) while `docs` holds
# `prep_text` -- confirm which of the two the model was fitted on.
topics_over_time = neutral_topic_model.topics_over_time(tweets, timestamps, nr_bins=20)
tt =neutral_topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=8)
# Bare last expression so the notebook displays the figure.
tt

20it [00:04,  4.72it/s]


In [62]:
# Load the pickled object saved next to the models and hand it to the document
# plot as precomputed reduced embeddings for `docs`.
# NOTE(review): the file is named "*_umap_model.pkl" but is used as embeddings --
# confirm it holds one row per entry of `docs`.
reduced_embeddings = joblib.load(g_path+"/neutral_umap_model.pkl")
neutral_topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [63]:
# Hierarchy view of the neutral model, restricted to 7 topics.
neutral_topic_model.visualize_hierarchy(top_n_topics=7)

In [64]:
# Calculate the topic distributions on a token-level for every neutral document.
topic_distr, topic_token_distr = neutral_topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distribution of the second document (index 1).
neutral_topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])

100%|██████████| 13/13 [00:04<00:00,  3.03it/s]


Unnamed: 0,day,left,remember,keep,best,attitude,avoid,getting,conflict,law,vote,count,visiting
13_day_remaining_till_left,0.232,0.36,0.36,0.36,0.128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87_voice_alot_task_micomyizajohn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135,0.135,0.135,0.135
91_decides_day_ol_30,0.118,0.118,0.118,0.118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97_let_working_bugger_keep,0.173,0.304,0.404,0.404,0.231,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99_law_lawyer_studying_concocted,0.0,0.0,0.0,0.0,0.0,0.0,0.113,0.246,0.383,0.514,0.401,0.268,0.13
110_day_13hrs_sevendays_snub,0.154,0.154,0.154,0.154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122_wire_ku_ka_left,0.163,0.286,0.286,0.286,0.123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159_checklist_geared_twelve_merely,0.178,0.288,0.288,0.288,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203_remember_shot_booth_throw,0.313,0.545,0.723,0.723,0.41,0.178,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215_internet_keep_switching_elearning,0.267,0.468,0.622,0.764,0.498,0.297,0.142,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Tabulate the words and scores returned for topic 0 of the neutral model.
pd.DataFrame(neutral_topic_model.get_topic(0),columns=["Word","Score"])

Unnamed: 0,Word,Score
0,registration,0.038138
1,voter,0.02962
2,register,0.025228
3,million,0.021561
4,30,0.016794
5,online,0.012248
6,period,0.011726
7,registered,0.010651
8,day,0.010298
9,2020,0.010279


## **Negative** Sentiments

In [66]:
# Negative-sentiment subset. `.copy()` yields an independent frame, so the cleaned
# text written below never targets a slice of `df` (no SettingWithCopyWarning).
df_query = df.query("vader_polarity == 'Negative'").copy()
docs = [str(doc) for doc in df_query["prep_text"]]

# Clean the raw text with vectorised string methods instead of three row-wise
# `apply` passes. Steps, in order:
#   1. remove URLs, then lowercase;
#   2. drop whitespace-delimited tokens that start with "@";
#   3. collapse every run of characters outside a-z/A-Z into one space and trim.
# Rows with missing text stay missing instead of raising inside a lambda.
df_query["text"] = (
    df_query["text"]
    .str.replace(r"http\S+", "", regex=True)
    .str.lower()
    .str.replace(r"(?:^|\s)@\S*", " ", regex=True)
    .str.replace(r"[^a-zA-Z]+", " ", regex=True)
    .str.strip()
)
timestamps = df_query["date"].to_list()
tweets = df_query["text"].to_list()

In [67]:
# Render BERTopic's bar-chart view of the negative model with its default settings.
negative_topic_model.visualize_barchart()

In [68]:
# Compute topic representations over 20 time bins from the cleaned tweets and
# their dates, then plot the result limited to 8 topics.
# NOTE(review): this passes `tweets` (cleaned `text`) while `docs` holds
# `prep_text` -- confirm which of the two the model was fitted on.
topics_over_time = negative_topic_model.topics_over_time(tweets, timestamps, nr_bins=20)
tt =negative_topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=8)
# Bare last expression so the notebook displays the figure.
tt

20it [00:00, 36.10it/s]


In [69]:
# Load the pickled object saved next to the models and hand it to the document
# plot as precomputed reduced embeddings for `docs`.
# NOTE(review): the file is named "*_umap_model.pkl" but is used as embeddings --
# confirm it holds one row per entry of `docs`.
reduced_embeddings = joblib.load(g_path+"/negative_umap_model.pkl")
negative_topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [70]:
# Hierarchy view of the negative model, restricted to 7 topics.
negative_topic_model.visualize_hierarchy(top_n_topics=7)

In [71]:
# Approximate the topic distributions of every negative document, keeping the
# token-level output as well.
topic_distr, topic_token_distr = negative_topic_model.approximate_distribution(docs, calculate_tokens=True)
# Visualize the token-level distribution of the second document (index 1).
negative_topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])

100%|██████████| 3/3 [00:00<00:00,  4.59it/s]


Unnamed: 0,disappointing,see,zambia,falling,low


In [72]:
# Tabulate the words and scores returned for topic 0 of the negative model.
pd.DataFrame(negative_topic_model.get_topic(0),columns=["Word","Score"])

Unnamed: 0,Word,Score
0,election,0.062423
1,zambia,0.056545
2,pf,0.03033
3,people,0.02968
4,vote,0.029276
5,zambian,0.025919
6,voter,0.022991
7,lungu,0.022822
8,violence,0.022798
9,2021,0.021673


## Positive Sentiments

In [73]:
# Positive-sentiment subset. `.copy()` yields an independent frame, so the cleaned
# text written below never targets a slice of `df` (no SettingWithCopyWarning).
df_query = df.query("vader_polarity == 'Positive'").copy()
docs = [str(doc) for doc in df_query["prep_text"]]

# Clean the raw text with vectorised string methods instead of three row-wise
# `apply` passes. Steps, in order:
#   1. remove URLs, then lowercase;
#   2. drop whitespace-delimited tokens that start with "@";
#   3. collapse every run of characters outside a-z/A-Z into one space and trim.
# Rows with missing text stay missing instead of raising inside a lambda.
df_query["text"] = (
    df_query["text"]
    .str.replace(r"http\S+", "", regex=True)
    .str.lower()
    .str.replace(r"(?:^|\s)@\S*", " ", regex=True)
    .str.replace(r"[^a-zA-Z]+", " ", regex=True)
    .str.strip()
)
timestamps = df_query["date"].to_list()
tweets = df_query["text"].to_list()

In [74]:
# Render BERTopic's bar-chart view of the positive model with its default settings.
positive_topic_model.visualize_barchart()

In [75]:
# Compute topic representations over 20 time bins from the cleaned tweets and
# their dates, then plot the result limited to 8 topics.
# NOTE(review): this passes `tweets` (cleaned `text`) while `docs` holds
# `prep_text` -- confirm which of the two the model was fitted on.
topics_over_time = positive_topic_model.topics_over_time(tweets, timestamps, nr_bins=20)
positive_topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=8)

20it [00:01, 12.94it/s]


In [76]:
# Load the pickled object saved next to the models and hand it to the document
# plot as precomputed reduced embeddings for `docs`.
# NOTE(review): the file is named "*_umap_model.pkl" but is used as embeddings --
# confirm it holds one row per entry of `docs`.
reduced_embeddings = joblib.load(g_path+"/positive_umap_model.pkl")
positive_topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [77]:
# Hierarchy view of the positive model, restricted to 7 topics.
positive_topic_model.visualize_hierarchy(top_n_topics=7)

In [79]:
# Approximate the topic distributions of every positive document, keeping the
# token-level output as well.
topic_distr, topic_token_distr = positive_topic_model.approximate_distribution(docs, calculate_tokens=True)
# Visualize the token-level distribution of the second document (index 1).
positive_topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])

100%|██████████| 6/6 [00:01<00:00,  4.38it/s]


Unnamed: 0,like,kkia,four,time,big,projected,traffic,zambia,airway,doe,take,like.1,seven,time.1,big.1,term,loan,china,control,come,something,like.2,15,year
13_loan_chinese_debt_china,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.538,0.835,1.143,1.108,0.8,0.504,0.195,0.0,0.0,0.0
40_unity_masuwa_600km_exon,0.0,0.0,0.0,0.0,0.0,0.0,0.106,0.106,0.106,0.106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58_hichilema_hakainde_projected_marketfriendly,0.0,0.0,0.168,0.322,0.48,0.615,0.447,0.293,0.135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86_nail_polish_thumb_remove,0.103,0.103,0.213,0.213,0.11,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
# Tabulate the words and scores returned for topic 0 of the positive model.
pd.DataFrame(positive_topic_model.get_topic(0),columns=["Word","Score"])

Unnamed: 0,Word,Score
0,fair,0.061351
1,free,0.056762
2,election,0.020133
3,zambia,0.016629
4,zambian,0.016425
5,appeal,0.015518
6,need,0.013732
7,transparent,0.013007
8,credible,0.011525
9,opposition,0.011313
