# Analysing Topic Labels

In [1]:
import os
from os.path import join
import eland as ed
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
from wordcloud import WordCloud, STOPWORDS
from operator import itemgetter
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
import nltk
import string
import plotly.express as px
import plotly.graph_objects as go
import preprocessor as prep
import matplotlib.pyplot as plt

# Project folders, resolved relative to the directory the notebook is run from
# (the project root is taken to be the parent of the working directory).
project_dir = join(os.getcwd(), os.pardir)
models_dir = join(project_dir, 'models')
zstc_dir = join(project_dir, 'reports', 'figures', 'zstc-wordclouds')

# Show full cell contents (tweets, label lists) instead of truncating them.
# `None` means "no limit"; the old `-1` sentinel is deprecated since pandas 1.0
# and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

%config InlineBackend.figure_format = 'svg'

In [2]:
# Topic labels for which wordclouds and per-label statistics are produced below.
TERMS = [
    'resource availability',
    'volunteers',
    'power supply',
    'relief measures',
    'food supply',
    'infrastructure',
    'medical assistance',
    'rescue',
    'shelter',
    'utilities',
    'water supply',
    'evacuation',
    'government',
    'crime violence',
    'mobile network',
    'sympathy',
    'news updates',
    'internet',
    'grievance',
    'livelihood',
    'income',
    'ecosystem',
    'biodiversity',
    'agriculture',
]

# A topic is assigned to a tweet only when its score is strictly above this value.
THRESHOLD = 0.6

## Import Tweets from ES

In [3]:
# Eland DataFrame over the `twitter` index on localhost, restricted to the
# fields this notebook uses.
ed_df = ed.DataFrame('localhost', 'twitter', columns=['full_text', 'sentiment', 'retweet_count'])

# Query for original English tweets only:
#   is_retweet = false AND is_quote_status = false AND lang = en
#
# `filter` has to be a *list* of clauses. Written as a dict literal with two
# "term" keys, Python keeps only the last key, which silently dropped the
# is_quote_status condition and let quote tweets through.
query_unique = {
    "bool": {
        "must": {
            "term": {"is_retweet": "false"},
        },
        "filter": [
            {"term": {"is_quote_status": "false"}},
            {"term": {"lang.keyword": "en"}},
        ],
    }
}

# Run the query through Eland and pull the matching documents into pandas
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

## Basic Tweet Preprocessing
- Remove URLs, reserved words (RTs), emojis and smileys
- Strip all punctuation, including the # and @ symbols


In [4]:
# Tell tweet-preprocessor which token types to handle:
# URLs, reserved words, emojis and smileys.
prep.set_options(prep.OPT.URL, prep.OPT.RESERVED, prep.OPT.EMOJI, prep.OPT.SMILEY)


def clean_tweet(text):
    """Clean one tweet and strip its punctuation.

    The text is first passed through ``prep.clean`` (using the options set
    above); afterwards every character listed in ``string.punctuation`` is
    deleted, which also removes the ``#`` and ``@`` symbols.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text without punctuation characters.
    """
    cleaned = prep.clean(text)
    # Mapping each punctuation character to None makes translate() delete it
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return cleaned.translate(drop_punctuation)

In [5]:
# Replace each tweet's text with its cleaned version
df_tweets['full_text'] = df_tweets['full_text'].apply(clean_tweet)

In [6]:
# Preview the first rows of the cleaned tweets
df_tweets.head()

Unnamed: 0,full_text,sentiment,retweet_count,name
1262961673708675072,Live Cyclone Amphan Map Tracking the Storms Path,0.0,0,newspointpn
1262961660932894720,NYT Live Cyclone Amphan Map Tracking the Storms Path,0.0,0,Vishal Tripathi
1262961652359729152,LIVE Now news update on Super Cyclone Amphan AmphanUpdate CycloneAmphan AmphanCyclone CycloneAmphanUpdate 120 km nearly south of Paradip Odisha 200 km southsouthwest of Digha West Bengal and 360 km southsouthwest of Khepupara Bangladesh,0.5994,0,MJ News
1262937945214005248,LIVE news update on Super Cyclone Amphan AmphanUpdate CycloneAmphan AmphanCyclone CycloneAmphanUpdate 125 km nearly south of Paradip Odisha 225 km southsouthwest of Digha West Bengal and 380 km southsouthwest of Khepupara Bangladesh,0.5994,0,MJ News
1262961522994806784,Stay safe Odisha and West Bengal AmphanUpdates Amphan AmphanSuperCyclone,0.4404,0,Sourabh Mathur


## Loading the tweet labels

In [7]:
# Read the stored topic scores: one row per top-level JSON key (orient='index').
# convert_axes=False stops pandas from converting the axis labels.
labels_path = join(models_dir, 'zstc_labels.json')
df_labels = pd.read_json(labels_path, orient='index', convert_axes=False)

In [8]:
# Preview the raw label table: each cell holds a [topic, score] pair
df_labels.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
1264253979002843136,"[livelihood, 0.43]","[relief measures, 0.42]","[medical assistance, 0.31]","[grievance, 0.29]","[ecosystem, 0.28]","[evacuation, 0.14]","[sympathy, 0.1]","[resource availability, 0.08]","[shelter, 0.07]","[agriculture, 0.05]",...,"[rescue, 0.01]","[food supply, 0.01]","[mobile network, 0.0]","[power supply, 0.0]","[water supply, 0.0]","[utilities, 0.0]","[news updates, 0.0]","[volunteers, 0.0]","[government, 0.0]","[internet, 0.0]"
1264253893632016384,"[government, 0.8200000000000001]","[grievance, 0.72]","[crime violence, 0.59]","[ecosystem, 0.53]","[livelihood, 0.52]","[sympathy, 0.42]","[relief measures, 0.4]","[shelter, 0.33]","[news updates, 0.32]","[rescue, 0.28]",...,"[resource availability, 0.11]","[evacuation, 0.09]","[infrastructure, 0.08]","[mobile network, 0.08]","[power supply, 0.05]","[internet, 0.05]","[agriculture, 0.04]","[water supply, 0.02]","[food supply, 0.01]","[volunteers, 0.01]"
1264253882580045824,"[income, 0.91]","[livelihood, 0.88]","[grievance, 0.8200000000000001]","[sympathy, 0.73]","[shelter, 0.22]","[news updates, 0.13]","[evacuation, 0.1]","[infrastructure, 0.09]","[relief measures, 0.07]","[utilities, 0.05]",...,"[internet, 0.01]","[biodiversity, 0.01]","[government, 0.01]","[power supply, 0.01]","[ecosystem, 0.01]","[crime violence, 0.0]","[volunteers, 0.0]","[food supply, 0.0]","[water supply, 0.0]","[medical assistance, 0.0]"


In [9]:
def get_labels(tweet, threshold=THRESHOLD):
    """Return the topics of one tweet whose score is above ``threshold``.

    Parameters
    ----------
    tweet : iterable of (topic_name, score) pairs
        The scored topics of a single tweet, e.g. one row of the label table.
        Entries that are not pairs (such as missing values) are skipped.
    threshold : float, optional
        A topic is kept only when its score is strictly greater than this.

    Returns
    -------
    list of (str, float)
        ``(topic_name, score)`` tuples with the score rounded to 2 decimals.
        When no topic passes the threshold, the first pair of ``tweet`` is
        returned instead (presumably the best one: the label file appears to
        list topics by descending score; verify against the producer), so a
        non-empty input always yields at least one label. An input without
        any valid pair yields an empty list.
    """
    # Iterate positionally instead of indexing with tweet[0]: on a pandas
    # Series, [0] is a *label* lookup and depends on how the columns are named.
    pairs = [entry for entry in tweet
             if isinstance(entry, (list, tuple)) and len(entry) >= 2]

    topics = [(entry[0], np.round(entry[1], 2))
              for entry in pairs if entry[1] > threshold]

    if not topics and pairs:
        # Fallback: keep the first topic, rounded like every other score
        first = pairs[0]
        topics.append((first[0], np.round(first[1], 2)))
    return topics

In [10]:
# Reduce every row of scored topics to its selected labels and keep only
# that single `labels` column.
selected_topics = df_labels.apply(get_labels, axis=1, threshold=THRESHOLD)
df_labels = df_labels.assign(labels=selected_topics)[['labels']]

In [11]:
# Preview the selected (topic, score) labels per tweet
df_labels.head()

Unnamed: 0,labels
1264253979002843136,"[(livelihood, 0.43)]"
1264253893632016384,"[(government, 0.82), (grievance, 0.72)]"
1264253882580045824,"[(income, 0.91), (livelihood, 0.88), (grievance, 0.82), (sympathy, 0.73)]"
1264253658763612160,"[(grievance, 0.98), (livelihood, 0.96), (sympathy, 0.92), (government, 0.88), (relief measures, 0.83), (resource availability, 0.81), (rescue, 0.71), (shelter, 0.69)]"
1264253569525592064,"[(grievance, 0.98), (livelihood, 0.96), (sympathy, 0.91), (resource availability, 0.83), (relief measures, 0.81), (rescue, 0.69), (income, 0.69), (government, 0.68), (shelter, 0.68), (infrastructure, 0.6)]"


## Merging the Tweets with Labels

In [12]:
# Inner join on the index: only tweets present in both frames are kept
df_tweet_labels = df_tweets.merge(df_labels, how='inner', left_index=True, right_index=True)
df_tweet_labels.head(3)

Unnamed: 0,full_text,sentiment,retweet_count,name,labels
1262961673708675072,Live Cyclone Amphan Map Tracking the Storms Path,0.0,0,newspointpn,"[(livelihood, 0.29)]"
1262961660932894720,NYT Live Cyclone Amphan Map Tracking the Storms Path,0.0,0,Vishal Tripathi,"[(news updates, 0.93)]"
1262961652359729152,LIVE Now news update on Super Cyclone Amphan AmphanUpdate CycloneAmphan AmphanCyclone CycloneAmphanUpdate 120 km nearly south of Paradip Odisha 200 km southsouthwest of Digha West Bengal and 360 km southsouthwest of Khepupara Bangladesh,0.5994,0,MJ News,"[(news updates, 1.0), (internet, 0.82), (shelter, 0.6)]"


## WordClouds for each Topic Label

In [17]:
def get_label_agg(term, df=None):
    """Collect text, sentiment and total retweets of all tweets labelled ``term``.

    Parameters
    ----------
    term : str
        Topic name; a tweet matches when any entry of its ``labels`` list has
        this name as first element.
    df : pandas.DataFrame, optional
        Frame with the columns ``full_text``, ``sentiment``, ``retweet_count``
        and ``labels``. Defaults to the notebook-level ``df_tweet_labels``.

    Returns
    -------
    tuple (list, list, number)
        The texts of the matching tweets, their sentiment values (same order),
        and the sum of their retweet counts. A tweet is counted once even if
        the term occurs several times in its labels.
    """
    if df is None:
        df = df_tweet_labels

    text = []
    sentiments = []
    retweet_count = 0

    rows = zip(df['full_text'], df['sentiment'], df['retweet_count'], df['labels'])
    for tweet_text, sentiment, retweets, labels in rows:
        # Rows without a proper label list are skipped explicitly, rather
        # than hiding every possible error behind a bare `except`.
        if not isinstance(labels, (list, tuple)):
            continue
        has_term = any(
            isinstance(item, (list, tuple)) and len(item) > 0 and item[0] == term
            for item in labels
        )
        if has_term:
            text.append(tweet_text)
            sentiments.append(sentiment)
            retweet_count += retweets
    return text, sentiments, retweet_count

In [16]:
def get_wordcloud_unigrams(text):
    """Build the unigram WordCloud for ``text``.

    The text is lower-cased before generation. Collocations are switched off,
    so only single words are counted; numbers are included and the
    ``STOPWORDS`` set is excluded.

    Parameters
    ----------
    text : str
        All tweet text of one topic, joined into a single string.

    Returns
    -------
    WordCloud
        The generated 800x800 cloud on a black background.
    """
    cloud = WordCloud(
        width=800,
        height=800,
        include_numbers=True,
        background_color='black',
        max_words=200,
        collocations=False,
        max_font_size=120,
        min_font_size=8,
        stopwords=set(STOPWORDS),
    )
    return cloud.generate(text.lower())

def get_wordcloud_bigrams(text):
    """Build the bigram WordCloud for ``text``.

    The lower-cased text is tokenised with NLTK, tokens found in ``STOPWORDS``
    are dropped, and every pair of adjacent tokens is scored by its raw
    frequency. Each bigram is rendered as a single ``first_second`` word.

    Parameters
    ----------
    text : str
        All tweet text of one topic, joined into a single string.

    Returns
    -------
    WordCloud
        The cloud generated from the bigram frequencies (800x800, black
        background, at most 100 entries).
    """
    words = [tok for tok in nltk.word_tokenize(text.lower()) if tok not in STOPWORDS]

    measures = BigramAssocMeasures()
    scored_bigrams = BigramCollocationFinder.from_words(words).score_ngrams(measures.raw_freq)
    ranked = sorted(scored_bigrams, key=itemgetter(1), reverse=True)

    # Join the two words of each bigram into one key and map it to its score
    frequencies = {'_'.join(bigram): score for bigram, score in ranked}

    cloud = WordCloud(
        width=800,
        height=800,
        background_color='black',
        max_words=100,
        min_font_size=10,
    )
    return cloud.generate_from_frequencies(frequencies)

def print_wordclouds(wordcloud_unigrams, wordcloud_bigrams, topic):
    """Show the two wordclouds of a topic side by side and save them as a PNG.

    Parameters
    ----------
    wordcloud_unigrams, wordcloud_bigrams
        Images accepted by ``imshow`` (here: generated WordCloud objects);
        drawn in the left and right panel respectively.
    topic : str
        Topic name; the figure is written to ``<zstc_dir>/<topic>.png``.
    """
    # Create the output folder, including missing parents, without going
    # through the shell: works on every OS and with spaces in the path.
    os.makedirs(zstc_dir, exist_ok=True)

    fig, (ax_unigrams, ax_bigrams) = plt.subplots(1, 2, figsize=(10, 10), facecolor=None)

    # Left panel: unigram cloud
    ax_unigrams.imshow(wordcloud_unigrams, interpolation='bilinear')
    ax_unigrams.set_title('Wordcloud over Unigrams')
    ax_unigrams.axis('off')

    # Right panel: bigram cloud
    ax_bigrams.imshow(wordcloud_bigrams, interpolation='bilinear')
    ax_bigrams.set_title('Wordcloud over Bigrams')
    ax_bigrams.axis('off')

    fig.tight_layout(pad=2)
    fig.savefig(join(zstc_dir, '{}.png'.format(topic)), dpi=300)  # Save figure

    plt.show()
    # Release the figure: this function is called once per topic in a loop
    plt.close(fig)

In [18]:
# Per-label statistics used by the charts further down:
# label -> {'sentiments': [...], 'retweet_count': total}
label_data = {}
for label in TERMS:
    tweets, sentiments, retweets = get_label_agg(label)
    label_data[label] = {'sentiments': sentiments, 'retweet_count': retweets}

    # WordCloud raises ValueError when it has no words to draw, so a label
    # without tweets is reported and skipped instead of aborting the loop.
    if not tweets:
        print('\nNo tweets for Label -', label, '- skipping wordcloud')
        continue

    corpus = ' '.join(tweets)
    try:
        wordcloud_unigrams = get_wordcloud_unigrams(corpus)
        wordcloud_bigrams = get_wordcloud_bigrams(corpus)
    except ValueError as err:
        # e.g. only stopwords, or a single word (no bigram can be formed)
        print('\nSkipping wordcloud for Label -', label, '-', err)
        continue

    print('\nWordcloud for Label -', label)
    print_wordclouds(wordcloud_unigrams, wordcloud_bigrams, label)


## Sentiment Boxplot for Labels

In [26]:
# One box per label, drawn from the sentiment values of its tweets
sentiment_boxes = [
    go.Box(y=stats['sentiments'], name=label)
    for label, stats in label_data.items()
]
fig = go.Figure(data=sentiment_boxes)

fig.update_layout(
    title='Sentiment Values of Labels',
    autosize=False,
    width=1200,
    height=400,
    margin={'l': 20, 'r': 20, 't': 40, 'b': 20},
    showlegend=False,
)

fig.show()

## Number of Tweets per Label

In [33]:
# Bar height = number of tweets carrying the label (one sentiment per tweet)
label_names = list(label_data)
tweet_counts = [len(label_data[name]['sentiments']) for name in label_names]

fig = go.Figure(data=[go.Bar(x=label_names, y=tweet_counts)])

fig.update_layout(
    title='Number of Tweets per Label',
    autosize=False,
    width=1200,
    height=400,
    margin={'l': 20, 'r': 20, 't': 40, 'b': 20},
    showlegend=False,
)

fig.show()

## Retweet Count per Label

In [38]:
# Bar height = summed retweet count of all tweets carrying the label
label_names = list(label_data)
retweet_totals = [label_data[name]['retweet_count'] for name in label_names]

fig = go.Figure(data=[go.Bar(x=label_names, y=retweet_totals)])

fig.update_layout(
    title='Total Retweet Count per Label',
    autosize=False,
    width=1200,
    height=400,
    margin={'l': 20, 'r': 20, 't': 40, 'b': 20},
    showlegend=False,
)

fig.show()