# Unsupervised Tweet Summarization with Network Analysis

In [349]:
import os
from os.path import join
import pandas as pd
import numpy as np
import nltk
import networkx as nx
from collections import Counter
import eland as ed
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from numpy import dot
from numpy.linalg import norm
import matplotlib.pyplot as plt
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from networkx.algorithms import community

# Show the full tweet text in DataFrame output. `None` means "no limit"; the old
# sentinel value -1 was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)



In [2]:
# Treat the parent of the current working directory as the project root;
# the trained models are looked up in its 'models' sub-folder.
project_dir = os.path.join(os.getcwd(), os.pardir)
models_dir = os.path.join(project_dir, 'models')

In [3]:
# Topic vocabulary (presumably the candidate labels behind zstc_labels.json -- confirm).
TERMS = [
    'sympathy',
    'complaint',
    'hope',
    'job',
    'relief measures',
    'compensation',
    'evacuation',
    'income',
    'ecosystem',
    'government',
    'corruption',
    'news updates',
    'volunteers',
    'donation',
    'mobile network',
    'housing',
    'farm',
    'utilities',
    'water supply',
    'power supply',
    'food supply',
    'medical assistance',
    'coronavirus',
    'petition',
    'poverty',
]

# A topic is kept as a tweet label only when its score is strictly above this value.
THRESHOLD = 0.7

## Import data from Elasticsearch

In [4]:
# Eland DataFrame over the 'twitter' index on 'localhost', restricted to the three
# columns needed downstream.
ed_df = ed.DataFrame('localhost', 'twitter', columns=['full_text_processed', 'sentiment', 'retweet_count'])

# Bool query built from two exact-match term clauses (no full-text scoring involved):
# keep only documents with is_retweet == "false" and is_quote_status == "false",
# i.e. tweets that are neither retweets nor quote tweets.
query_unique = {
    "bool": {
        "must": {
            "term":{"is_retweet":"false"},
        },
        "filter": {
            "term":{"is_quote_status":"false"}
        },
    }
}
# Apply the query on the Elasticsearch side, then pull the matching rows into an
# in-memory pandas DataFrame.
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [5]:
# Count whitespace-separated tokens per tweet, then keep tweets with more than five tokens.
df_tweets['length'] = df_tweets['full_text_processed'].map(lambda text: len(text.split()))
df_tweets = df_tweets.loc[df_tweets['length'] > 5]

In [6]:
# (rows, columns) of the tweet table
df_tweets.shape

(103417, 4)

In [7]:
# Preview the first rows of the tweet table
df_tweets.head()

Unnamed: 0,full_text_processed,sentiment,retweet_count,length
1262961673708675072,live cyclone amphan map tracking storm’s path,0.0,0,7
1262961660932894720,nyt live cyclone amphan map tracking storm’s path,0.0,0,8
1262961652359729152,live news update super cyclone amphan amphanupdate cycloneamphan amphancyclone cycloneamphanupdate 120 km nearly south paradip odisha 200 km southsouthwest digha west bengal 360 km southsouthwest khepupara bangladesh,0.5994,0,27
1262960808742522880,cyclone ampan came closer live super cyclone amphan update pradip odisha 120 km orissa digha west bengal 200 km west bengal khepupara bangladesh 360 km bangladesh bangladesh bangla west bengal cyclone amphan,0.5994,0,32
1262937945214005248,live news update super cyclone amphan amphanupdate cycloneamphan amphancyclone cycloneamphanupdate 125 km nearly south paradip odisha 225 km southsouthwest digha west bengal 380 km southsouthwest khepupara bangladesh,0.5994,0,27


## Load the Tweet2Vec Model

In [8]:
## Tweet2Vec (Doc2Vec) model: every document tag is a tweet id with one learned vector.
## NOTE(review): later cells pair doc_tags[i] with doc_vectors[i]; this assumes the
## tag mapping iterates in the same order as the vector rows -- confirm.
model = Doc2Vec.load(os.path.join(models_dir, 'tweet2VecJared.model'))
doc_vectors = model.docvecs.vectors_docs          ## Tweet Vectors
doc_tags = [*model.docvecs.doctags.keys()]        ## Tweet Ids

In [9]:
# Number of tweet ids, then number of tweet vectors (one per line)
print(len(doc_tags), len(doc_vectors), sep='\n')

113342
113342


## Loading the tweet labels

In [10]:
# orient='index': the top-level JSON keys (tweet ids) become the row index.
# convert_axes=False stops pandas from coercing the axis labels to another dtype,
# so the tweet ids keep the form they have in the file.
df_labels = pd.read_json(join(models_dir,'zstc_labels.json'), orient='index', convert_axes=False)

In [11]:
# Preview the raw label table: each cell holds a [topic, score] pair
df_labels.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
1264253979002843136,"[relief measures, 0.67]","[complaint, 0.63]","[poverty, 0.48]","[evacuation, 0.46]","[sympathy, 0.44]","[medical assistance, 0.32]","[income, 0.30000000000000004]","[housing, 0.29]","[petition, 0.23]","[corruption, 0.22]",...,"[food supply, 0.07]","[hope, 0.07]","[utilities, 0.05]","[news updates, 0.05]","[coronavirus, 0.04]","[farm, 0.03]","[donation, 0.03]","[volunteers, 0.02]","[government, 0.02]","[job, 0.01]"
1264253959918632960,"[relief measures, 0.5]","[job, 0.06]","[farm, 0.04]","[volunteers, 0.01]","[evacuation, 0.01]","[petition, 0.0]","[complaint, 0.0]","[sympathy, 0.0]","[compensation, 0.0]","[income, 0.0]",...,"[medical assistance, 0.0]","[mobile network, 0.0]","[power supply, 0.0]","[government, 0.0]","[poverty, 0.0]","[housing, 0.0]","[corruption, 0.0]","[food supply, 0.0]","[hope, 0.0]","[water supply, 0.0]"
1264253893632016384,"[government, 0.98]","[hope, 0.97]","[power supply, 0.93]","[sympathy, 0.86]","[ecosystem, 0.77]","[medical assistance, 0.72]","[complaint, 0.6900000000000001]","[relief measures, 0.67]","[petition, 0.5700000000000001]","[income, 0.44]",...,"[evacuation, 0.24]","[housing, 0.21]","[poverty, 0.2]","[job, 0.18]","[farm, 0.16]","[food supply, 0.16]","[coronavirus, 0.12]","[donation, 0.1]","[corruption, 0.08]","[volunteers, 0.0]"


In [12]:
def get_labels(tweet, threshold=THRESHOLD):
    """Return the topics of one tweet whose confidence score exceeds ``threshold``.

    Parameters
    ----------
    tweet : iterable of (topic_name, score) pairs
        One row of the labels table (a pandas Series or a plain list).
    threshold : float, optional
        Scores must be strictly greater than this value to be kept.

    Returns
    -------
    list of (topic_name, score) tuples
        Every topic above the threshold, in input order, with the score rounded to
        two decimals. If no topic qualifies, the single highest-scoring topic is
        returned instead so that a tweet is never left without a label. An empty
        row yields an empty list.
    """
    # Materialise once so the row is accessed by position; the previous
    # `tweet[0]` was a label lookup that only worked when a column named 0 existed.
    pairs = list(tweet)

    topics = [
        (pair[0], np.round(pair[1], 2))
        for pair in pairs
        if pair[1] > threshold
    ]

    if not topics and pairs:
        # Fallback: best-scoring topic. For rows already sorted by descending score
        # this is the first pair, exactly as before. The score is rounded like all others.
        best = max(pairs, key=lambda pair: pair[1])
        topics.append((best[0], np.round(best[1], 2)))

    return topics

In [13]:
# Collapse the per-topic columns into one 'labels' column that holds, for each tweet,
# the list of (topic, score) tuples selected by get_labels.
df_labels = (
    df_labels
    .apply(get_labels, axis=1, threshold=THRESHOLD)
    .to_frame(name='labels')
)

In [14]:
# Preview the selected (topic, score) labels per tweet
df_labels.head()

Unnamed: 0,labels
1264253979002843136,"[(relief measures, 0.67)]"
1264253959918632960,"[(relief measures, 0.5)]"
1264253893632016384,"[(government, 0.98), (hope, 0.97), (power supply, 0.93), (sympathy, 0.86), (ecosystem, 0.77), (medical assistance, 0.72)]"
1264253882580045824,"[(income, 0.75)]"
1264253658763612160,"[(complaint, 0.94), (relief measures, 0.93), (petition, 0.88), (donation, 0.87), (sympathy, 0.86)]"


## Merging the Tweets with Labels

In [660]:
# Inner join on the index (tweet id): only tweets present in both tables survive.
df_tweet_labels = df_tweets.merge(
    df_labels,
    how='inner',
    left_index=True,
    right_index=True,
)
df_tweet_labels.head(3)

Unnamed: 0,full_text_processed,sentiment,retweet_count,length,labels
1262961673708675072,live cyclone amphan map tracking storm’s path,0.0,0,7,"[(sympathy, 0.11)]"
1262961660932894720,nyt live cyclone amphan map tracking storm’s path,0.0,0,8,"[(utilities, 0.25)]"
1262961652359729152,live news update super cyclone amphan amphanupdate cycloneamphan amphancyclone cycloneamphanupdate 120 km nearly south paradip odisha 200 km southsouthwest digha west bengal 360 km southsouthwest khepupara bangladesh,0.5994,0,27,"[(news updates, 1.0)]"


## Filter on Label

In [708]:
# Topic to summarise; compared against the topic names stored in the 'labels' column
LABEL = 'utilities'

In [709]:
# Helper column: the tweet's label list when it mentions LABEL, NaN otherwise.
df_tweet_labels['labels_list'] = df_tweet_labels['labels'].map(
    lambda pairs: pairs if any(pair[0] == LABEL for pair in pairs) else np.nan
)

# Keep the tweets tagged with LABEL, restricted to the columns used downstream.
df_label = df_tweet_labels.loc[
    df_tweet_labels['labels_list'].notna(),
    ['full_text_processed', 'length', 'sentiment', 'retweet_count', 'labels'],
]

In [710]:
# Preview the tweets that carry the selected label
df_label.head()

Unnamed: 0,full_text_processed,length,sentiment,retweet_count,labels
1262961660932894720,nyt live cyclone amphan map tracking storm’s path,8,0.0,0,"[(utilities, 0.25)]"
1262974855282069504,sucs amphan 123 km eastsoutheast paradip odisha 0930 ist 20th may cross west bengalbangladesh coast digh west bengal hatiya island bangladesh close sunderbans landfall process commence afternoon time8news amphan,29,0.0,0,"[(news updates, 0.99), (utilities, 0.83), (sympathy, 0.8), (ecosystem, 0.73)]"
1262972846936068096,live cyclone amfan havoc 102 km per hour wind speed cyclone,11,-0.5994,0,"[(utilities, 0.25)]"
1262980176910209024,nairhena panditsripathak nalini51purohit romy1965 divyam1079 sanjaygobind nutanjyot muralydoctrack1 dilipswatisri drsdwivedi7 rajesh201963 wetwokrishna lakshmianand96 anju1951purohit swetasamadhiya ssharmajsr latarai5 jugalkhetan ankahi vprakash68 vaidyvoice vinishind nilotpalmukher6 reinebow23 pksrivastava6 paperrose2k keyesen2000 jagannkaushik city feeder breakdown due precautionary shutting power supply due cyclone amphan since storm city charging feeder going throughout city power supply restored fully 2 hour,53,0.34,2,"[(power supply, 0.99), (utilities, 0.76), (relief measures, 0.75)]"
1262979950413590528,srcodisha honble src sir kindly give official order d2h service provider give free d2h antenna amp reinstall respective customer premise 12 affected district odishaif damaged due amphan cyclonic storm amp rain,31,0.7003,0,"[(sympathy, 0.98), (relief measures, 0.96), (mobile network, 0.85), (hope, 0.84), (compensation, 0.78), (utilities, 0.74)]"


## Creating Summaries based on Similarity & Connected Components

In [711]:
# Map each tweet id to its row in doc_vectors once. The previous `doc_tags.index(idx)`
# scanned the whole tag list for every tweet, making this cell quadratic.
tag_to_row = {tag: row for row, tag in enumerate(doc_tags)}

# One vector per selected tweet, in the same order as the rows of df_label.
label_vectors = np.array([doc_vectors[tag_to_row[idx]] for idx in df_label.index])

In [712]:
# Cosine distance between every pair of selected tweet vectors; entry [i, j] refers to
# rows i and j of label_vectors. n_jobs=-1 spreads the computation over all CPU cores.
cos_distances = pairwise_distances(label_vectors, n_jobs=-1, metric='cosine')

In [713]:
def create_summary(cos_distances, K, tweets=None, min_component_size=4):
    """Build an extractive summary of at most ``K`` tweets from pairwise distances.

    The distance threshold is raised in steps of 0.1 (0.1, 0.2, ..., 1.0). At every
    step a graph is built whose edges join tweets that are at most ``threshold``
    apart. From each sufficiently large connected component the best-scoring tweet
    is added to the summary, and every tweet of that component is excluded from the
    graphs built at later, looser thresholds.

    Parameters
    ----------
    cos_distances : 2-D array-like
        Square matrix of pairwise distances; row/column ``i`` must refer to row
        ``i`` of ``tweets``.
    K : int
        Maximum number of tweets in the summary.
    tweets : pandas.DataFrame, optional
        Tweets indexed by tweet id with the columns 'retweet_count', 'length',
        'sentiment', 'labels' and 'full_text_processed'. Defaults to the
        notebook-level ``df_label`` so existing two-argument calls keep working.
    min_component_size : int, optional
        Smallest connected component that may contribute a tweet (default 4).

    Returns
    -------
    list of (tweet_id, text, sentiment) tuples
        In the order the tweets were picked; may hold fewer than ``K`` entries.
    """
    if tweets is None:
        tweets = df_label  # notebook-level frame used by the original implementation

    distances = np.asarray(cos_distances)
    tweet_ids = tweets.index

    def add_tweet_node(graph, position):
        """Add the tweet at row ``position`` to ``graph`` together with its attributes."""
        row = tweets.iloc[position]
        graph.add_node(
            row.name,
            node_size=np.log(row['retweet_count'] + row['length'] + 1),
            sentiment=row['sentiment'],
            labels=row['labels'],
            text=row['full_text_processed'],
        )

    summary = []     ## Store the summary
    removed = set()  ## Tweet ids that must not enter later graphs

    for step in range(1, 11):
        if len(summary) >= K:
            break

        # Derive the threshold from an integer step: repeatedly adding 0.1 drifts
        # (0.30000000000000004, 0.7999999999999999, ...).
        threshold = step / 10
        print('Threshold =', threshold)

        G = nx.Graph()   ## Fresh graph for this threshold
        rows, cols = np.nonzero(distances <= threshold)

        ## Create graph edges
        for node_1, node_2 in zip(rows.tolist(), cols.tolist()):
            if node_1 == node_2:
                continue  # a tweet is trivially at distance 0 from itself

            id_1, id_2 = tweet_ids[node_1], tweet_ids[node_2]
            if id_1 in removed or id_2 in removed:
                continue

            # Row data is only fetched when a node is new; doing it per edge was
            # the dominant cost of the original loop.
            if not G.has_node(id_1):
                add_tweet_node(G, node_1)
            if not G.has_node(id_2):
                add_tweet_node(G, node_2)
            if not G.has_edge(id_1, id_2):
                G.add_edge(id_1, id_2, weight=(1 - distances[node_1, node_2]))

        ## Largest components first
        conn_components = sorted(nx.connected_components(G), key=len, reverse=True)
        print("Number of connected components =", len(conn_components))

        ## From each large component pick the tweet with the highest
        ## log(degree + 1) + node_size score; the whole component is then retired.
        for component in conn_components:
            if len(summary) >= K:
                break
            if len(component) < min_component_size:
                break  # components are sorted by size, the rest are smaller still

            best = max(
                component,
                key=lambda node: np.log(G.degree[node] + 1) + G.nodes[node]['node_size'],
            )
            summary.append((best, G.nodes[best]['text'], G.nodes[best]['sentiment']))
            removed.update(component)

    return summary

In [714]:
K = 50 ## Upper bound on the number of tweets in the summary
# Returns a list of (tweet_id, text, sentiment) tuples
summary = create_summary(cos_distances, K)

Threshold = 0.1
Number of connected components = 163
Threshold = 0.2
Number of connected components = 214


In [715]:
# One row per selected tweet, in the order the tweets were picked
df_summary = pd.DataFrame(summary, columns=['tweet_id', 'full_text', 'sentiment'])

In [716]:
# Display the summary table
df_summary

Unnamed: 0,tweet_id,full_text,sentiment
0,1264459173372350464,jdhankhar1 mamataofficial pmoindia npr budget 3941 cr delhi power corridor 20000 cr npa loan waiver 140000 cr patels statue cost 2989 cr remaking central vista 22000 cr pending amount 4 bengal 52000 cr excluding bulbul relief bengal get 4 amphan declared national disaster 1000 cr advance aid,-0.25
1,1265524767681056768,first coronathen amphan made worse mamata’s incompetence people still haven’t relief electricity food water cut many part state mamata government aarnoimamata আরনয়মমতা,-0.6597
2,1263755579576799232,npr budget 3941 cr delhi power corridor 20000 cr npa loan waiver 14 lakh cr statue unity cost 2989 cr pending amount bengal 52000 cr excluding bulbul relief bengal get amphan 1000 crore declaration national emergency pmdoesnotcare pmoindia,0.128
3,1263110118176165888,electric outage phool bagan kolkata caused due cyclone please pray one fortunate enough proper shelter challenging time cycloneamphan cycloneamphanupdate amphansupercyclone amphan,0.7964
4,1264156369201971200,west bengal seek army’s help restore essential service hit cycloneamphan,0.5994
5,1264136622242557952,amphan toll rise 85 bengal protest electricity water supply,-0.25
6,1264907290978562048,power minister review restoration power infrastructure post amphan via video conferencing,0.0
7,1266036922754887680,many area kolkata still without electricity lack water house life bedridden patient joint negligence state government cesc dealing amphan ha exposed people west bengal victim one nefarious conspiracy mamatarcescmrityudut,-0.3481
8,1263526004250873856,cyclone amphan odhisha west bengal nearly 45 lakh people affected amfan cyclone odisha power infrastructure destroyed,-0.5859
9,1267118789075836928,cyclone amphan update weather forecast today live update death toll rise 86 bengal seek army support restore essential infrastructure 86 death powerwater shortage bengal army help 44 lakh people affected odisha,-0.8402


In [417]:
 def save_graph(graph,file_name):
    #initialze Figure
    plt.figure(num=None, figsize=(20, 20), dpi=80)
    plt.axis('off')
    fig = plt.figure(1)
    pos = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph,pos)
    nx.draw_networkx_edges(graph,pos)
    nx.draw_networkx_labels(graph,pos)

    cut = 1.00
    xmax = cut * max(xx for xx, yy in pos.values())
    ymax = cut * max(yy for xx, yy in pos.values())
    plt.xlim(0, xmax)
    plt.ylim(0, ymax)

    plt.savefig(file_name,bbox_inches="tight")
    del fig