In [2]:
from collections import Counter
import string
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud
import dask.dataframe as dd
from dask.multiprocessing import get
import networkx as nx
import psycopg2
import psycopg2.extras
from tqdm.notebook import tqdm
import datetime
import os,stat

# Cluster ids used to filter ``cluster_mapping.cluster`` in the queries below.
CLUSTERS_OF_INTEREST = [36041,65124]
# Pickle locations for the mention/retweet/hashtag frames. update_MRH_PKL and
# update_MRH_TIME_PKL rebind these globals each time they write a new pickle.
MRH_FILE_PATH = "pickles/mention_retweet_hastags(trending).pkl"
MRH_TIME_FILE_PATH = "pickles/mention_retweet_hastags_timeobj(trending).pkl"
# Per-cluster link files, one for each id in CLUSTERS_OF_INTEREST.
CLUSTER0_LINK_PATH = "files/cluster36041.link"
CLUSTER1_LINK_PATH = "files/cluster65124.link"
# Per-cluster "<column>, importance" dumps. The names follow the
# "files/cluster_{cluster}({suffix}).{column}.importance" pattern used by
# write_info_to_file (here with suffix "trending").
CLUSTER0_LINK_IMPORTANCE_PATH = "files/cluster_36041(trending).urls.importance"
CLUSTER1_LINK_IMPORTANCE_PATH = "files/cluster_65124(trending).urls.importance"
CLUSTER0_TEXT_IMPORTANCE_PATH = "files/cluster_36041(trending).text.importance"
CLUSTER1_TEXT_IMPORTANCE_PATH = "files/cluster_65124(trending).text.importance"

def pg_get_conn(database="fakenews", user="fakenews", password="fnd"):
    """Open a psycopg2 connection to the Postgres server at localhost:5432.

    A failure to connect is not raised: the error text is printed and the
    function returns ``None`` instead.

    Returns:
        Connection object : the psycopg2 connection, or ``None`` on failure

    Args:
        database (str, optional): Name of database
        user (str, optional): Name of User
        password (str, optional): Password of user
    """
    settings = {
        "database": database,
        "user": user,
        "password": password,
        "host": 'localhost',
        "port": '5432',
    }
    try:
        return psycopg2.connect(**settings)
    except Exception as err:
        # Best-effort: report the problem and let the caller see None.
        print(str(err))
    return None
        
def run_query(query="""Select * from tweets_cleaned""", realDict = False, arg=None):
    """Run one SQL statement on a fresh connection and return its rows.

    The statement (or, when ``arg`` is given, the statement with the
    parameter substituted) is printed before it is executed.

    Args:
        query (str, optional): SQL text; may contain one ``%s`` placeholder.
        realDict (bool, optional): If true, use a ``RealDictCursor`` so rows
            are dict-like instead of plain tuples.
        arg (optional): Value bound to the placeholder; it is handed to
            psycopg2 as the 1-tuple ``(arg,)``. A falsy value means the
            query is executed without parameters.

    Returns:
        list or None: All fetched rows, or ``None`` when the statement
        produced no result set (``fetchall`` raised ``ProgrammingError``).
    """
    conn = pg_get_conn(database="abhishek",user="abhishek",password="vaishu")
    try:
        # ``with conn`` only wraps the transaction (commit on success,
        # rollback on error); it does NOT close the connection, so the
        # connection is closed explicitly in ``finally`` to avoid leaking
        # one server connection per call.
        with conn:
            cursor_kwargs = {"cursor_factory": psycopg2.extras.RealDictCursor} if realDict else {}
            with conn.cursor(**cursor_kwargs) as cur:
                if not arg:
                    print(query)
                    cur.execute(query)
                else:
                    print(cur.mogrify(query,(arg,)))
                    cur.execute(query,(arg,))
                try:
                    ans = cur.fetchall()
                except psycopg2.ProgrammingError:
                    # Statements such as COPY ... TO have nothing to fetch.
                    ans = None
                return ans
    finally:
        # pg_get_conn returns None when connecting failed; in that case the
        # ``with`` above has already raised and there is nothing to close.
        if conn is not None:
            conn.close()

def create_graph(ls_tup):
    """Build a weighted directed graph from (source, target) rows.

    Args:
        ls_tup (iterable): Rows to add. Each row is either a dict with the
            keys ``'tweet_from'`` and ``'retweeted_status_user_handle'``
            or a sequence whose first two items are the source and target.

    Returns:
        networkx.DiGraph: One edge per distinct (source, target) pair; the
        edge attribute ``weight`` counts how many rows had that pair.
    """
    G = nx.DiGraph()
    for row in tqdm(ls_tup):
        # The row itself decides how it is read. Testing the container
        # (as the previous code did) sent dict rows down the positional
        # branch, where row[0] fails.
        if isinstance(row, dict):
            tfrom = row['tweet_from']
            rt = row['retweeted_status_user_handle']
        else:
            tfrom = row[0]
            rt = row[1]
        if G.has_edge(tfrom, rt):
            G[tfrom][rt]['weight'] += 1
        else:
            G.add_edge(tfrom, rt, weight=1)
    return G
def __custom_words_accumulator(series,limit=None):
    """Count comma-separated tokens over every entry of ``series``.

    Falsy entries (``None``, empty string) are ignored.

    Args:
        series (iterable): Strings of comma-separated tokens.
        limit (int, optional): Keep only this many of the most frequent
            tokens; a falsy value keeps all of them.

    Returns:
        list: ``(token, count)`` pairs, most frequent first.
    """
    tally = Counter(
        token
        for entry in series
        if entry
        for token in entry.split(",")
    )
    if limit:
        return tally.most_common(limit)
    return tally.most_common()

def split_list(series,handleBool=True):
    """Flatten groups of ``(handle, value)`` pairs into a single list.

    Args:
        series (iterable): Groups, each an iterable of 2-item pairs.
        handleBool (bool, optional): If true return the first item of every
            pair (the handles), otherwise the second item (the values).

    Returns:
        list: The selected items, in group order then pair order.
    """
    pairs = [(name, amount) for group in series for name, amount in group]
    position = 0 if handleBool else 1
    return [pair[position] for pair in pairs]
        
def get_barcharts(df,column_name="retweets",top_n=50):
    """Draw one bar chart per cluster of its most frequent tokens.

    Args:
        df (pandas.DataFrame): Must have a ``cluster`` column and the column
            named by ``column_name``, holding comma-separated tokens.
        column_name (str, optional): Column whose tokens are counted.
        top_n (int, optional): Maximum number of tokens shown per cluster.

    Returns:
        None: The charts are drawn on a new matplotlib figure.
    """
    wf = df.groupby("cluster")[column_name].apply(__custom_words_accumulator,limit=top_n).reset_index()
    # A cluster may have fewer than ``top_n`` distinct tokens, so repeat each
    # cluster id by the number of pairs it actually produced instead of by a
    # fixed count (which made the three columns differ in length).
    group_sizes = [len(pairs) for pairs in wf[column_name]]
    wf2 = pd.DataFrame({
        'cluster_id': np.repeat(wf['cluster'].to_numpy(), group_sizes),
        'handle': split_list(wf[column_name]),
        'noOfX': split_list(wf[column_name],handleBool=False),
    })
    clusters = wf2.cluster_id.unique()
    if len(clusters) == 0:
        # Nothing to plot; plt.subplots cannot create a zero-row grid.
        return
    sns.set(rc={'figure.figsize': (40,10)})
    # squeeze=False keeps ``ax`` two-dimensional even for a single cluster,
    # where plt.subplots would otherwise return a bare Axes object.
    f, ax = plt.subplots(len(clusters), 1, figsize=(40, 100), squeeze=False)
    f.tight_layout(pad=6.0)
    for axis, cid in zip(ax[:, 0], clusters):
        g = sns.barplot(x="handle", y="noOfX", hue="cluster_id", data=wf2[wf2.cluster_id==cid],ax=axis)
        g.set_xticklabels(g.get_xticklabels(), rotation=50, horizontalalignment='right')

def plot_word_cloud(word_freq_dict,background_color="white", width=800, height=1000,max_words=300, 
                    figsize=(50, 50), wc_only=False,color_map="viridis"):
    """
    Build a word cloud from word frequencies and, unless ``wc_only`` is set,
    show it with Matplotlib.

    :param word_freq_dict: Dictionary of word frequencies
    :type word_freq_dict: Dict
    :param wc_only: if true, skip plotting and return the word cloud object
    :type wc_only: bool
    :return: the generated word cloud when ``wc_only`` is true, else None
    :rtype: WordCloud or None
    """
    cloud_options = {
        "background_color": background_color,
        "width": width,
        "height": height,
        "max_words": max_words,
        "colormap": color_map,
    }
    cloud = WordCloud(**cloud_options).generate_from_frequencies(frequencies=word_freq_dict)
    if not wc_only:
        plt.figure(figsize=figsize)
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        return None
    return cloud
    
def reset_everything(min_degree=1,suffix=None,tablename="tweet_articles_tweepy"):
    """Rebuild the retweet graph and regenerate every derived artefact.

    Steps: query the retweet pairs, build the weighted graph, drop nodes
    whose weighted degree is at most ``min_degree``, write the graph to
    ``graphs/G_<suffix>.gexf``, then refresh both pickles and the
    per-cluster importance files.

    Args:
        min_degree (int, optional): Nodes with weighted degree less than or
            equal to this value are removed from the graph.
        suffix (str, optional): Tag embedded in every output file name.
            Defaults to the current timestamp, taken when the function is
            called (not when it was defined).
        tablename (str, optional): Tweet table to read from. It is
            interpolated into the SQL text, so it must be a trusted name.
    """
    if suffix is None:
        suffix = str(datetime.datetime.now())
    ls = run_query("Select tweet_from,retweeted_status_user_handle from {} where retweeted_status_user_handle is not null".format(tablename))
    G = create_graph(ls)
    # Drop the raw rows as soon as the graph exists. The IPython
    # ``%reset_selective`` magics used here before are not valid Python,
    # and one of them targeted a misspelled variable name.
    del ls
    remove_nodes = [node for node, degree in G.degree(weight='weight') if degree <= min_degree]
    G.remove_nodes_from(remove_nodes)
    del remove_nodes
    nx.write_gexf(G,"graphs/G_{0}.gexf".format(suffix))
    # Keyword arguments: the first positional parameter of both helpers is
    # the table name, so passing ``suffix`` positionally queried a table
    # named after the suffix.
    update_MRH_PKL(tablename=tablename, suffix=suffix)
    update_MRH_TIME_PKL(tablename=tablename, suffix=suffix)
    write_info_to_file(tablename,suffix)
    
def write_info_to_file(tablename,suffix,text_columns=('text','urls')):
    """Dump ``<column>, importance`` rows for each cluster of interest.

    For every column in ``text_columns`` and every id in
    ``CLUSTERS_OF_INTEREST`` an empty file
    ``files/cluster_<cluster>(<suffix>).<column>.importance`` is created,
    its permissions are opened up, and a Postgres ``COPY ... TO`` statement
    writes the query result into it.

    Args:
        tablename (str): Tweet table joined against ``cluster_mapping``.
            Interpolated into the SQL text, so it must be a trusted name.
        suffix (str): Tag embedded in the output file names.
        text_columns (iterable of str, optional): Columns of ``tablename``
            to export, one file per column and cluster.
    """
    FILE_NAME="files/cluster_{}({}).{}.importance"
    for column in text_columns:
        for cluster in CLUSTERS_OF_INTEREST:
            relative_path = FILE_NAME.format(cluster,suffix,column)
            try:
                # Create (or truncate) the target; the context manager closes
                # the handle even if resolving the path fails.
                with open(relative_path,'w+') as f:
                    absolute_path = os.path.realpath(f.name)
                # rwx for user, group and others -- presumably so that the
                # Postgres server process, which performs the COPY, is
                # allowed to write the file (TODO confirm).
                os.chmod(absolute_path,stat.S_IRWXO |stat.S_IRWXG|stat.S_IRWXU)
            except PermissionError:
                print("Please delete the file {}".format(relative_path))
            else:
                query = "COPY (SELECT t.{},c.importance from {} t JOIN cluster_mapping c ON t.tweet_from = c.id where c.cluster = {}) TO '{}';".format(column,tablename,cluster,absolute_path)
                run_query(query)
                    
def update_MRH_PKL(tablename="tweet_articles_tweepy",suffix=None):
    """Rebuild and pickle the mention/retweet/hashtag frame.

    Rows of ``tablename`` are joined with ``cluster_mapping`` and limited to
    ``CLUSTERS_OF_INTEREST``. The frame is written to
    ``pickles/mention_retweet_hastags(<suffix>).pkl`` and the global
    ``MRH_FILE_PATH`` is rebound to that path.

    Args:
        tablename (str, optional): Tweet table to read. Interpolated into
            the SQL text, so it must be a trusted name.
        suffix (str, optional): Tag embedded in the pickle file name.
            Defaults to the current timestamp, taken when the function is
            called (not when it was defined).

    Returns:
        pandas.DataFrame: Columns ``handle``, ``mentions``, ``retweets``,
        ``hashtags``, ``cluster``, ``importance``.
    """
    if suffix is None:
        suffix = str(datetime.datetime.now())
    ls = run_query("SELECT t.tweet_from,t.user_mentions_name,t.retweeted_status_user_handle,t.hashtags,c.cluster,c.importance FROM {} AS t INNER JOIN cluster_mapping AS c ON t.tweet_from = c.id WHERE c.cluster in %s".format(tablename),arg=tuple(CLUSTERS_OF_INTEREST))
    df = pd.DataFrame(ls,columns=['handle','mentions','retweets','hashtags','cluster','importance'])
    for x in ['mentions','hashtags']:
        # Dict form is required: ``replace('{}', None)`` forward-fills the
        # previous row's value on older pandas instead of writing None.
        df[x]=df[x].replace({'{}': None})
        # Strip the surrounding braces of the "{a,b,c}" text form.
        df[x]=df[x].str.lstrip('{')
        df[x]=df[x].str.rstrip('}')
    global MRH_FILE_PATH
    MRH_FILE_PATH = 'pickles/mention_retweet_hastags({0}).pkl'.format(suffix) 
    df.to_pickle(MRH_FILE_PATH)
    return df

def update_MRH_TIME_PKL(tablename="tweet_articles_tweepy",suffix=None):
    """Rebuild and pickle the timestamped mention/retweet/hashtag frame.

    Rows of ``tablename`` are joined with ``cluster_mapping`` and limited to
    ``CLUSTERS_OF_INTEREST``; ``created_at`` is parsed into datetimes. The
    frame is written to
    ``pickles/mention_retweet_hastags_timeobj(<suffix>).pkl`` and the global
    ``MRH_TIME_FILE_PATH`` is rebound to that path.

    Args:
        tablename (str, optional): Tweet table to read. Interpolated into
            the SQL text, so it must be a trusted name.
        suffix (str, optional): Tag embedded in the pickle file name.
            Defaults to the current timestamp, taken when the function is
            called (not when it was defined).

    Returns:
        pandas.DataFrame: Columns ``time``, ``handle``, ``mentions``,
        ``retweets``, ``hashtags``, ``cluster``, ``importance``.
    """
    if suffix is None:
        suffix = str(datetime.datetime.now())
    ls = run_query("SELECT t.created_at,t.tweet_from,t.user_mentions_name,t.retweeted_status_user_handle,t.hashtags,c.cluster,c.importance FROM {} AS t INNER JOIN cluster_mapping AS c ON t.tweet_from = c.id WHERE c.cluster in %s".format(tablename),arg=tuple(CLUSTERS_OF_INTEREST))
    df = pd.DataFrame(ls ,columns = ["time","handle","mentions","retweets","hashtags","cluster","importance"])
    # Timestamps are expected in the form "Wed Oct 10 20:19:24 +0000 2018".
    df['time'] = pd.to_datetime(df['time'], format="%a %b %d %H:%M:%S %z %Y")
    for x in ['mentions','hashtags']:
        # Dict form is required: ``replace('{}', None)`` forward-fills the
        # previous row's value on older pandas instead of writing None.
        df[x]=df[x].replace({'{}': None})
        # Strip the surrounding braces of the "{a,b,c}" text form.
        df[x]=df[x].str.lstrip('{')
        df[x]=df[x].str.rstrip('}')
    global MRH_TIME_FILE_PATH
    MRH_TIME_FILE_PATH = 'pickles/mention_retweet_hastags_timeobj({0}).pkl'.format(suffix) 
    df.to_pickle(MRH_TIME_FILE_PATH)
    return df

In [40]:
import re

# Substitutions applied to every kept line, in this order: hashtags, links
# and mentions become placeholder tokens, literal "\n" sequences are removed,
# and finally every digit and "." is dropped.
_SCRUB_STEPS = (
    (re.compile('#[a-zA-Z0-9_]*'), " <hashtag> "),
    (re.compile('(http|https)[a-zA-Z0-9://.]*'), " <link> "),
    (re.compile('@[a-zA-Z0-9_]*'), " <mention> "),
    (re.compile(r"(\\n)*"), ""),
    (re.compile('[0-9.]*'), ""),
)

ls_text=[]
count = 0
with open(CLUSTER1_TEXT_IMPORTANCE_PATH,'r') as f:
    for line in f:
        # Lines starting with "RT" are skipped and do not count.
        if line.startswith("RT"):
            continue
        for pattern, replacement in _SCRUB_STEPS:
            line = pattern.sub(replacement, line)
        print(line)
        count += 1
        # Stop once more than 100 lines have been printed.
        if count > 100:
            break

 <hashtag>  <mention>   <mention>   <mention>   <link> 	

 <hashtag>  <mention>   <mention>   <mention>   <link> 	

 <hashtag>   <link> 	

 <hashtag>   <link> 	

 <hashtag>   <link> 	

 <hashtag>   <link> 	

When will BJP start caring? <mention>   <mention>   <mention>  <hashtag>   <link> 	

Modi ji again topped the list of LIARS!!! <hashtag>   <link> 	

Anyone remember Syria migration crisisIt's happening in Gujarat now <hashtag>   <link> 	

Flowers ain't enough  <mention>  <hashtag>  <mention>   <mention>   <mention>   <link> 	

Stop it yaar!!Stop your LIES!!!!!! <hashtag>   <link> 	

Yeh hai Iss desh ke kalakar! <hashtag>  <mention>   <mention>   <mention>   <link> 	

 <hashtag>  <mention>   <mention>   <mention>   <link> 	

 <hashtag> মোদী একটু ভাট কম বক!!  <link> 	

Tum Sirf baat karoge, kab tum kaam karoge??  <mention>  <hashtag>  <mention>   <mention>   <mention> 	

 <mention>   <mention>   <hashtag> 	

This lockdown has shown us how the government has done nothing to alleviate 