In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import tqdm

from flair.data import Sentence
from flair.nn import Classifier
from flair.embeddings import WordEmbeddings
# Pre-trained flair 'sentiment-fast' classifier; assign_sentiment_column() uses it to label tweets.
tagger = Classifier.load('sentiment-fast')
# GloVe word embeddings; used further down to embed every token of a tweet before averaging.
glove_embeddings = WordEmbeddings('glove')
# Alternative embedding model, currently disabled.
# twitter_embeddings = WordEmbeddings('twitter')

from sklearn.cluster import KMeans

In [3]:
# Cleaned + translated tweets.
# NOTE(review): absolute, machine-specific path — point this at your own copy of the data.
DATA_PATH = '/Users/nitanshjain/Documents/Projects/Twitter_Data_Analysis/v2/data/tweets_cleaned_translated_v2.csv'

tweets_df = pd.read_csv(DATA_PATH)
# Overwrite the original text with its translation. Bracket assignment always
# targets a column; attribute assignment (`df.tweet = ...`) would silently set a
# plain Python attribute instead if the 'tweet' column were ever missing.
tweets_df['tweet'] = tweets_df['tweet_translated']
# The translated text now lives in 'tweet', so the source column is redundant.
tweets_df = tweets_df.drop(columns=['tweet_translated'])
print(tweets_df.shape)
tweets_df.head()

(17909, 4)


Unnamed: 0,tweet_id,tweet,city,year
0,0,bad thing say government medical spare operati...,Mumbai,2022
1,1,netas family admitted government hospital priv...,Mumbai,2022
2,2,govandi one hotspot well respiratory disease f...,Mumbai,2022
3,3,till medical negligence exist government hospi...,Mumbai,2022
4,4,doctor reading also government hospital resident,Mumbai,2022


# Flair Sentiment Analysis

In [4]:
def assign_sentiment_column(df):
    """Label every tweet in ``df`` with a binary flair sentiment.

    Each entry of the ``tweet`` column is wrapped in a flair ``Sentence`` and
    passed to the module-level ``tagger``. The confidence of the first
    predicted label is signed (kept positive for ``'POSITIVE'``, negated for
    anything else) and only the sign is retained: ``1`` for a score above
    zero, ``-1`` otherwise.

    The labels are stored in a new ``sentiment_flair`` column written onto
    ``df`` itself (the frame is modified in place), and that same frame is
    returned.
    """
    def _polarity(text):
        # Run the classifier on a single tweet and reduce its top label to +1 / -1.
        sentence = Sentence(text)
        tagger.predict(sentence)
        top_label = sentence.labels[0].to_dict()
        confidence = top_label['confidence']
        signed_score = confidence if top_label['value'] == 'POSITIVE' else -confidence
        return 1 if signed_score > 0 else -1

    df['sentiment_flair'] = [_polarity(text) for text in df.loc[:, 'tweet']]
    return df

# assign_sentiment_column() writes the new column onto the frame it receives and
# returns that frame, so tweets_sentiment_df and tweets_df are the same DataFrame.
tweets_sentiment_df = assign_sentiment_column(tweets_df)
tweets_sentiment_df.head()

Unnamed: 0,tweet_id,tweet,city,year,sentiment_flair
0,0,bad thing say government medical spare operati...,Mumbai,2022,-1
1,1,netas family admitted government hospital priv...,Mumbai,2022,-1
2,2,govandi one hotspot well respiratory disease f...,Mumbai,2022,-1
3,3,till medical negligence exist government hospi...,Mumbai,2022,-1
4,4,doctor reading also government hospital resident,Mumbai,2022,1


In [5]:
# overall split between negative (-1) and positive (1) tweets
print(tweets_sentiment_df['sentiment_flair'].value_counts())
# number of tweets per city, then per year, grouped by sentiment
for group_key in ('city', 'year'):
    print(tweets_sentiment_df.groupby([group_key, 'sentiment_flair']).size())

sentiment_flair
-1    9374
 1    8535
Name: count, dtype: int64
city       sentiment_flair
Bangalore  -1                 1277
            1                 1022
Chennai    -1                  397
            1                  399
Delhi      -1                 3420
            1                 3376
Hyderabad  -1                 1202
            1                 1201
Kolkata    -1                  565
            1                  454
Mumbai     -1                 2513
            1                 2083
dtype: int64
year  sentiment_flair
2018  -1                 1029
       1                  942
2019  -1                 1338
       1                 1258
2020  -1                 2977
       1                 3025
2021  -1                 2795
       1                 2217
2022  -1                 1235
       1                 1093
dtype: int64


# Sentiment Analysis using Embeddings and Clustering


In [6]:
# Build one vector per tweet: the mean of the GloVe embeddings of its tokens.
# The width comes from the embedding object instead of a hard-coded 100, so the
# cell keeps working if a different embedding model is swapped in.
embedding_dim = glove_embeddings.embedding_length

# Collect the per-tweet vectors in a list and stack once at the end; growing a
# tensor with torch.cat on every iteration re-copies everything accumulated so far.
sentence_vectors = []
for tweets in tweets_df.tweet:
    sentence = Sentence(tweets)
    glove_embeddings.embed(sentence)

    token_vectors = [token.embedding for token in sentence]
    if token_vectors:
        sentence_vectors.append(torch.stack(token_vectors).mean(dim=0))
    else:
        # A tweet with no tokens has nothing to average: the mean of an empty
        # tensor is NaN, and NaN rows are rejected by KMeans later on. Fall
        # back to an all-zero vector instead.
        sentence_vectors.append(torch.zeros(embedding_dim))

# Shape: (number of tweets, embedding_dim).
s = torch.stack(sentence_vectors) if sentence_vectors else torch.zeros(0, embedding_dim)

In [7]:
# Convert the per-tweet embedding tensor to a NumPy array; this is the input
# KMeans is fitted on in the next cell.
tweets_flair_embeddings = s.numpy()
print(tweets_flair_embeddings.shape)

(17909, 100)


In [11]:
# Split the tweet embeddings into two clusters. The seed is written as the
# integer 1: the earlier `random_state=True` was a bool standing in for an int
# seed and is coerced to the same value, so the fitted clusters are unchanged.
model = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=tweets_flair_embeddings)
# NOTE(review): KMeans cluster indices carry no inherent meaning, so calling
# cluster 0 "positive" and cluster 1 "negative" is an assumption — confirm it,
# e.g. by comparing model.labels_ with the sentiment_flair column.
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [15]:
# Inspect the centroid of KMeans cluster 0.
positive_cluster_center

array([-4.88442071e-02,  1.40136182e-01,  1.70148194e-01, -2.51966529e-02,
       -1.66111127e-01,  1.86558262e-01, -1.47842959e-01,  1.78272113e-01,
        1.28935203e-02,  1.49864964e-02, -3.24097686e-02,  3.21295634e-02,
        2.74292320e-01,  1.02452829e-01,  6.48642182e-02, -2.32312039e-01,
        1.10063024e-01,  2.84158625e-02, -2.83991992e-01,  1.76492512e-01,
        9.99475867e-02, -3.09692584e-02, -4.65598851e-02, -7.44716078e-02,
       -4.14153077e-02,  3.53076346e-02, -5.70847690e-02, -4.07256097e-01,
        4.76390719e-02,  2.85783429e-02,  1.78822875e-02,  3.29773247e-01,
       -4.04403433e-02,  6.85736984e-02, -5.17936945e-02,  1.41405717e-01,
       -1.64617732e-01,  1.64646447e-01,  7.40966760e-04, -6.63919747e-02,
       -2.82265127e-01, -2.64665522e-02,  1.03441551e-01, -3.37885141e-01,
        2.80081593e-02,  5.67578338e-03,  5.60916886e-02, -7.43293017e-02,
       -7.86803514e-02, -4.73817438e-01,  1.29266649e-01, -1.09637462e-01,
        1.14360370e-01,  

In [16]:
# Inspect the centroid of KMeans cluster 1.
negative_cluster_center

array([-7.08849430e-02,  2.62137577e-02,  8.77078474e-02,  9.98413656e-03,
       -1.05667576e-01,  1.69522107e-01, -9.20886546e-02,  2.46776968e-01,
       -8.03269446e-04,  7.03831762e-02, -2.27087103e-02,  2.06271634e-02,
        2.68004388e-01,  1.62332058e-01,  9.40009132e-02, -1.94145828e-01,
        1.47978336e-01, -2.45837439e-02, -2.29569152e-01,  1.04130864e-01,
        8.78155008e-02, -5.16638644e-02, -4.67704199e-02,  2.11181864e-03,
       -3.74176688e-02,  5.08225337e-02,  3.54699790e-03, -2.91603804e-01,
        3.31391171e-02,  8.36247653e-02, -1.48474146e-03,  2.35747650e-01,
       -3.52732465e-03,  1.28969803e-01, -8.75963345e-02,  1.03399843e-01,
       -1.56433225e-01,  1.63220420e-01,  3.82006727e-02, -2.60678958e-02,
       -2.10792035e-01,  3.90230976e-02,  1.09582894e-01, -2.92384565e-01,
        1.23589620e-01,  6.01295829e-02,  4.43350449e-02,  1.25127919e-02,
       -2.05541663e-02, -2.08978668e-01,  7.76787475e-02, -1.11606717e-01,
        1.43246219e-01,  