In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import tqdm
import copy
from pprint import pprint

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('vader_lexicon')

import gensim
from gensim.models import *
from gensim.models.fasttext import FastText
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.cluster import *
from sklearn.feature_extraction.text import *
from sklearn.preprocessing import *
from sklearn.decomposition import *
from sklearn.metrics import *
from sklearn.svm import *

import warnings
warnings.filterwarnings('ignore')

from bertopic import BERTopic

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arjunkhanchandani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/arjunkhanchandani/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Loading the Negative Emotion Tweets 

In [2]:
tweets_df = pd.read_csv('/Users/arjunkhanchandani/Desktop/twitter_data_analysis/v2/data/tweets_with_sentiment__version_2_1.csv')
tweets_neg_df = tweets_df[tweets_df['sentiment_flair']==-1]
tweets_neg_df.head()

Unnamed: 0,tweet_id,tweet,city,year,tweets_tokens,sentiment_flair,sentiment_nltk,sentiment_flair_glove_embed_kmeans,sentiment_fast_text_embed_kmeans,sentiment_fast_text_embed_nmf_kmeans
0,0,very bad thing say that government did not med...,Mumbai,2022,"['very', 'bad', 'thing', 'say', 'that', 'gover...",-1,-1,-1,-1,1
1,1,all netas and their family should admitted onl...,Mumbai,2022,"['all', 'netas', 'and', 'their', 'family', 'sh...",-1,-1,-1,-1,1
5,5,very hygiene government veterinary hospital as...,Mumbai,2022,"['very', 'hygiene', 'government', 'veterinary'...",-1,1,-1,-1,-1
7,7,shall throw out india let take care our health...,Mumbai,2022,"['shall', 'throw', 'out', 'india', 'let', 'tak...",-1,1,-1,-1,1
9,9,not online school wild guess going south mumba...,Mumbai,2022,"['not', 'online', 'school', 'wild', 'guess', '...",-1,-1,-1,1,-1


In [3]:
tweets_neg_df

Unnamed: 0,tweet_id,tweet,city,year,tweets_tokens,sentiment_flair,sentiment_nltk,sentiment_flair_glove_embed_kmeans,sentiment_fast_text_embed_kmeans,sentiment_fast_text_embed_nmf_kmeans
0,0,very bad thing say that government did not med...,Mumbai,2022,"['very', 'bad', 'thing', 'say', 'that', 'gover...",-1,-1,-1,-1,1
1,1,all netas and their family should admitted onl...,Mumbai,2022,"['all', 'netas', 'and', 'their', 'family', 'sh...",-1,-1,-1,-1,1
5,5,very hygiene government veterinary hospital as...,Mumbai,2022,"['very', 'hygiene', 'government', 'veterinary'...",-1,1,-1,-1,-1
7,7,shall throw out india let take care our health...,Mumbai,2022,"['shall', 'throw', 'out', 'india', 'let', 'tak...",-1,1,-1,-1,1
9,9,not online school wild guess going south mumba...,Mumbai,2022,"['not', 'online', 'school', 'wild', 'guess', '...",-1,-1,-1,1,-1
...,...,...,...,...,...,...,...,...,...,...
18047,25825,also this concept queuing very subjective here...,Hyderabad,2018,"['also', 'this', 'concept', 'queuing', 'very',...",-1,-1,-1,-1,-1
18048,25830,think this creates lakh job anim husbandary ho...,Hyderabad,2018,"['think', 'this', 'creates', 'lakh', 'job', 'a...",-1,1,1,-1,1
18052,25845,sir please take action supraja hospital nagole...,Hyderabad,2021,"['sir', 'please', 'take', 'action', 'supraja',...",-1,1,-1,-1,1
18053,25859,why meme police did not waited for his report ...,Hyderabad,2020,"['why', 'meme', 'police', 'did', 'not', 'waite...",-1,-1,-1,-1,-1


# Applying BERTopic to the Negative Emotion Tweets to extract Topics

In [4]:
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", language="multilingual", min_topic_size = 10, top_n_words = 15)

In [5]:
topics, probs = topic_model.fit_transform(tweets_neg_df["tweet"])

In [6]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4922,-1_hospital_the_not_and,"[hospital, the, not, and, are, you, doctor, fo...",[relative wild guess admitted chennai hospital...
1,0,597,0_oxygen_lack_due_dying,"[oxygen, lack, due, dying, people, cylinder, b...",[oxygen bed ventilator medicine available peop...
2,1,349,1_eye_tear_doctor_you,"[eye, tear, doctor, you, for, that, this, appo...",[why waste patient tear eye giving appointment...
3,2,281,2_poor_delhi_school_government,"[poor, delhi, school, government, corrupt, for...",[crore for coastal not understand what you mea...
4,3,231,3_unhygienic_water_condition_garbage,"[unhygienic, water, condition, garbage, toilet...",[the real condition toilet government school w...
...,...,...,...,...,...
90,89,11,89_based_gap_dos_science,"[based, gap, dos, science, self, philosophical...",[not meaning knowledge how can take sitting ho...
91,90,11,90_urgent_radevsivir_theropy_mahendra,"[urgent, radevsivir, theropy, mahendra, asha, ...",[rohit shrivastava admitted drdo hospital inma...
92,91,11,91_negligence_medical_lost_action,"[negligence, medical, lost, action, respect, l...",[aicc alagiri callously say hospital birth dea...
93,92,10,92_fever_february_syrup_symptom,"[fever, february, syrup, symptom, take, active...",[today went esi dispensary for medicine which ...


In [7]:
topic_model.get_topic(0)

[('oxygen', 0.060622704155222765),
 ('lack', 0.040654580561037976),
 ('due', 0.024534241918460785),
 ('dying', 0.01563227968935385),
 ('people', 0.014261477010536528),
 ('cylinder', 0.014103077576550674),
 ('bed', 0.013785519015888283),
 ('died', 0.013232183857544148),
 ('death', 0.012227681595852866),
 ('shortage', 0.011612854908399452),
 ('supply', 0.011312281037741194),
 ('you', 0.010575487652500081),
 ('not', 0.010371014930831582),
 ('what', 0.00978284772508501),
 ('understand', 0.009271770015279516)]

In [8]:
topic_model.get_document_info(tweets_neg_df["tweet"])

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,very bad thing say that government did not med...,-1,-1_hospital_the_not_and,"[hospital, the, not, and, are, you, doctor, fo...",[relative wild guess admitted chennai hospital...,hospital - the - not - and - are - you - docto...,0.000000,False
1,all netas and their family should admitted onl...,-1,-1_hospital_the_not_and,"[hospital, the, not, and, are, you, doctor, fo...",[relative wild guess admitted chennai hospital...,hospital - the - not - and - are - you - docto...,0.000000,False
2,very hygiene government veterinary hospital as...,19,19_dog_animal_cat_veterinary,"[dog, animal, cat, veterinary, vet, stray, cow...",[you people ask money for the treatment and am...,dog - animal - cat - veterinary - vet - stray ...,0.299678,False
3,shall throw out india let take care our health...,-1,-1_hospital_the_not_and,"[hospital, the, not, and, are, you, doctor, fo...",[relative wild guess admitted chennai hospital...,hospital - the - not - and - are - you - docto...,0.000000,False
4,not online school wild guess going south mumba...,-1,-1_hospital_the_not_and,"[hospital, the, not, and, are, you, doctor, fo...",[relative wild guess admitted chennai hospital...,hospital - the - not - and - are - you - docto...,0.000000,False
...,...,...,...,...,...,...,...,...
9892,also this concept queuing very subjective here...,10,10_your_terminal_doctor_appointment,"[your, terminal, doctor, appointment, waiting,...",[you are unbelievable now day get doctor appoi...,your - terminal - doctor - appointment - waiti...,1.000000,False
9893,think this creates lakh job anim husbandary ho...,2,2_poor_delhi_school_government,"[poor, delhi, school, government, corrupt, for...",[crore for coastal not understand what you mea...,poor - delhi - school - government - corrupt -...,1.000000,False
9894,sir please take action supraja hospital nagole...,8,8_bed_patient_room_more,"[bed, patient, room, more, you, hospital, not,...",[availability ventilator bed any hospital sear...,bed - patient - room - more - you - hospital -...,0.273535,False
9895,why meme police did not waited for his report ...,56,56_what_understand_mean_why,"[what, understand, mean, why, you, not, action...",[this system significant other corrupt that li...,what - understand - mean - why - you - not - a...,0.928856,True


In [9]:
topic_model.get_representative_docs(0)

['oxygen bed ventilator medicine available people dying due lack oxygen cylinder bed injection where are you hospital your terminal',
 'covid death due the lack oxygen',
 'about patient died due lack oxygen']

In [10]:
topics_df = pd.DataFrame({"topic": topics, "tweets": tweets_neg_df["tweet"]})

topics_df

Unnamed: 0,topic,tweets
0,-1,very bad thing say that government did not med...
1,-1,all netas and their family should admitted onl...
5,19,very hygiene government veterinary hospital as...
7,-1,shall throw out india let take care our health...
9,-1,not online school wild guess going south mumba...
...,...,...
18047,10,also this concept queuing very subjective here...
18048,2,think this creates lakh job anim husbandary ho...
18052,8,sir please take action supraja hospital nagole...
18053,56,why meme police did not waited for his report ...


# Visualizations

In [11]:
topic_model.visualize_topics()

In [12]:
topic_model.visualize_barchart()

In [13]:
topic_model.visualize_distribution(probs)

In [14]:
topic_model.visualize_heatmap()

In [15]:
topic_model.visualize_hierarchy()

In [16]:
topic_model.visualize_term_rank()

In [17]:
from umap import UMAP
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(tweets_neg_df["tweet"], show_progress_bar=False)

topic_model = BERTopic().fit(tweets_neg_df["tweet"], embeddings)

topic_model.visualize_documents(tweets_neg_df["tweet"], embeddings=embeddings)

reduced_embeddings = UMAP(n_neighbors=50, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

topic_model.visualize_documents(tweets_neg_df["tweet"], reduced_embeddings=reduced_embeddings)

KeyError: 9468

In [None]:
topic_distr, topic_token_distr = topic_model.approximate_distribution(tweets_neg_df["tweet"], calculate_tokens=True)

approx_df = topic_model.visualize_approximate_distribution(tweets_neg_df["tweet"][1], topic_token_distr[1])
approx_df

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(tweets_neg_df["tweet"])

In [None]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(tweets_neg_df["tweet"], show_progress_bar=False)

topic_model = BERTopic().fit(tweets_neg_df["tweet"], embeddings)
hierarchical_topics = topic_model.hierarchical_topics(tweets_neg_df["tweet"])

topic_model.visualize_hierarchical_documents(tweets_neg_df["tweet"], hierarchical_topics, embeddings=embeddings)

reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_hierarchical_documents(tweets_neg_df["tweet"], hierarchical_topics, reduced_embeddings=reduced_embeddings)

# Generating Topic Labels

In [None]:
tweets_neg_df = tweets_neg_df.reset_index(drop=True)

In [None]:
topic_model.generate_topic_labels()

['-1_hospital_the_not',
 '0_oxygen_lack_due',
 '1_eye_tear_doctor',
 '2_wild_guess_his',
 '3_vaccine_vaccination_vaccinated',
 '4_covid_positive_test',
 '5_poor_government_free',
 '6_money_doctor_corporate',
 '7_negligence_medical_should',
 '8_check_pradesh_uttar',
 '9_bed_patient_room',
 '10_terminal_your_doctor',
 '11_parking_parked_board',
 '12_ambulance_you_one',
 '13_ventilator_icu_bed',
 '14_protest_against_protesting',
 '15_baby_pregnant_woman',
 '16_covid_death_positive',
 '17_fire_broke_floor',
 '18_bjp_party_delhi',
 '19_corona_patient_positive',
 '20_dog_animal_cat',
 '21_hai_mein_okay',
 '22_blood_plasma_donor',
 '23_child_kota_rajasthan',
 '24_toilet_unhygienic_washroom',
 '25_mask_wearing_are',
 '26_rich_strike_help',
 '27_road_traffic_near',
 '28_water_drainage_drain',
 '29_charge_fee_consultation',
 '30_bihar_nitish_chief',
 '31_road_pothole_repair',
 '32_dengue_mosquito_breeding',
 '33_garbage_unhygienic_pollution',
 '34_hindu_muslim_religious',
 '35_food_restaurant_un

In [None]:
topic_model = BERTopic(nr_topics="auto")

In [None]:
topic_model.generate_topic_labels()

TypeError: 'NoneType' object is not iterable

In [None]:
new_topics = topic_model.reduce_outliers(tweets_neg_df["tweet"], topics)

NotFittedError: Vocabulary not fitted or provided