In [18]:
import tweepy
import csv
import pandas as pd
import numpy as np
import networkx as nx
import os
from Events_NER.TweetSegmenter import SEDTWikSegmenter
import pickle
from rouge import Rouge 
import re
import nltk
import json

In [2]:
wiki_titles_file = "Events_NER/data/final.txt"
segmenter = SEDTWikSegmenter(wiki_titles_file, 4, 3, False)

Initializing SEDTWik Segmenter
SEDTWik Segmenter Ready



In [54]:
class Tweet():
    
    def __init__(self, status_response):
        self.id = status_response.id
        self._get_text(status_response)
        self.user_info = status_response.user
        self.entitites = status_response.entities
        self.json = status_response._json
        self.json['text'] = self.text
        arr = []
        for users in self.json['entities']['user_mentions']: 
            arr += [users['name']]
        self.json['entities']['user_mentions'] = arr
        arr = []
        for users in self.json['entities']['hashtags']: 
            arr += [users['text']]
        self.json['entities']['hashtags'] = arr
        
    def _get_text(self, status):
        if hasattr(status, "retweeted_status"):  # Check if Retweet
            try:
                self.text = status.retweeted_status.extended_tweet["full_text"]
            except AttributeError:
                self.text = status.retweeted_status.full_text
        else:
            try:
                self.text = status.extended_tweet["full_text"]
            except AttributeError:
                self.text = status.full_text
    
    #code for generating named entities of each tweet
    def _get_named_entities(self):
        ne = segmenter.tweet_segmentation(self.json)
        #print(ne)
        return ne
    
        
    #code for generating event phrases
    def _get_event_phrases(self):
        return []
    
    def get_graph_entities(self):
        return self._get_named_entities() + self._get_event_phrases()
    
    def __hash__(self):
        return self.id
    def __eq__(self, other):
        return self.id == other.id


class TweetRetriever():

    def __init__(self):
        consumer_key = '95cMtk1vJvEEW2rlMR0kIU9lE'
        consumer_secret = 'pMQFi7LBdcudKDNZOokUJGS8mDxQanUv8spxBDdTLiwSZBuUOM'
        access_token = '1036313393114767360-BZ8Qpi02ghRvehhcITEIyl7SmGWmU6'
        access_token_secret = 'C7VAqGDhTdB424iBtEwF1CJI9YPTcvNvLjFmaCXENNv3G'
        auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
        #auth.set_access_token(access_token, access_token_secret)
        self.api = tweepy.API(auth,wait_on_rate_limit=True, wait_on_rate_limit_notify = True)
    
    def _filterDuplicates(self, tweets):
        tweet_text = set()
        filtered_tweets = []
        for tweet in tweets:
            if tweet.text not in tweet_text:
                filtered_tweets += [tweet]
                tweet_text.add(tweet.text)
        return filtered_tweets

    def getTweets(self, hashtag, count = 10):
        tweets = []
        for status in tweepy.Cursor(self.api.search, q = hashtag + " -filter:retweets", count = count, tweet_mode = 'extended',
                                    lang = 'en',).items():
            tweets.append(Tweet(status))
        return self._filterDuplicates(tweets)

    

In [5]:
class GraphNode():
    
    def __init__(self, name):
        self.name = name
        self.tweets = set()
        self.value = 0
        
    def add_tweet(self, tweet):
        self.tweets.add(tweet)
    
    def common_tweets(self, other):
        return len(self.tweets.intersection(other.tweets))
    
    def __hash__(self):
        return hash(self.name)
    def __eq__(self, other):
        return self.name == other.name
#     def __print__(self):
#         print(self.name)
    
class TweetGraph():
    
    def __init__(self, topic):
        self.topic = topic
        self.nodes = {}
        self.edge_map = {}
        
    def add_entity(self, name, tweet_ref):
        if name not in self.nodes:
            self.nodes[name] = GraphNode(name)
        self.nodes[name].add_tweet(tweet_ref)
    
    def add_edge(self, node1, node2):
        assert node1.name in self.nodes
        assert node2.name in self.nodes
        weight = node1.common_tweets(node2)
        self.edge_map.setdefault(node1.name, {}).setdefault(node2.name, weight)
        self.edge_map.setdefault(node2.name, {}).setdefault(node1.name, weight)
    
    def compute_all_edges(self):
        for node1 in self.nodes.values():
            for node2 in self.nodes.values():
                self.add_edge(node1, node2)
    
    def _get_pagerank_matrix(self):
        x = [[0 for _ in range(len(self.nodes))] for _ in range(len(self.nodes))]
        for i, node1 in enumerate(self.nodes.values()):
            wsum = 0
            for node2 in self.nodes.values():
                wsum += self.edge_map.get(node1.name, {}).get(node2.name, 0)
            for j, node2 in enumerate(self.nodes.values()):
                x[i][j] = self.edge_map.get(node1.name, {}).get(node2.name, 0)/wsum
        return np.array(x)
    
    def set_textrank_values(self, d = 0.85):
        rank_graph = nx.from_numpy_array(self._get_pagerank_matrix())
        node_scores = nx.pagerank(rank_graph, alpha = d)
        for i, node in enumerate(self.nodes.values()):
            node.value = node_scores[i]
    
    def get_weight(self, node1, node2):
        return self.edge_map.get(node1.name, {}).get(node2.name, 0)
    
    def get_topic_similarity(self, node):
        if node.name in self.topic:
            return len(node.tweets)
        return 1
    
    def get_all_node_values(self):
        arr = []
        for node in self.nodes.values():
            arr.append((node.name, node.value))
        return sorted(arr, key = lambda x: x[1])
    
    def get_avg_thres(self):
        values = self.get_all_node_values()
        return sum([i[1] for i in values])/len(values)
        
    def get_nodes_above_thres(self, thres = 1):
        nodes = []
        value_sum = 0
        for node in self.nodes.values():
            if node.value > thres:
                nodes.append(node)
                value_sum += node.value
        return nodes, value_sum
        

In [6]:
def createGraph(topic, tweets):
    tweetGraph = TweetGraph(topic)
    for tweet in tweets:
        graph_entities = tweet.get_graph_entities()
        for name in graph_entities:
            tweetGraph.add_entity(name, tweet)
    tweetGraph.compute_all_edges()
    tweetGraph.set_textrank_values()
    return tweetGraph

def partitionGraph(tweetGraph, alpha, beta, high_rank_thres = 1):
    
    #initialize highly ranked nodes and their total values sum
    high_ranked_nodes, total_value_sum = tweetGraph.get_nodes_above_thres(high_rank_thres)
    high_ranked_nodes = sorted(high_ranked_nodes, key = lambda x: x.value)
    partitions = []
    
    #partitioning loop
    while len(high_ranked_nodes):
        #entity set is the nodes in the partition
        entity_set = set()
        repr_node = high_ranked_nodes.pop()
        entity_set.add(repr_node)
        repr_node_topic_similarity = tweetGraph.get_topic_similarity(repr_node)
        value_sum = repr_node.value
        
        for node in high_ranked_nodes:
            node_edge_weight = tweetGraph.get_weight(repr_node, node)
            node_topic_similarity = tweetGraph.get_topic_similarity(node)
            
            if node_edge_weight/repr_node_topic_similarity > alpha and \
                node_topic_similarity/repr_node_topic_similarity > alpha:
                entity_set.add(node)
                value_sum += node.value
        
        if value_sum/total_value_sum > beta:
            temp = []
            for node in high_ranked_nodes:
                if node not in entity_set:
                    temp.append(node)
            high_ranked_nodes = temp
            
            partitions.append([])
            for node in list(entity_set):
                partitions[-1].append(node)
            
    return partitions
    

In [7]:
def summarization(partitions, tweet_cutoff = 1):
    summary = []
    #print(partitions)
    for part in partitions:
        tweet_set = set()
        for node in part:
            tweet_set = tweet_set.union(node.tweets)
        node_entity_count = []
        tweet_set = list(tweet_set)
        #print(tweet_set)
        for i, tweet in enumerate(tweet_set):
            count = 0
            for node in part:
                if node.name in tweet.text:
                    count += 1
            node_entity_count += [(i, count)]
        node_entity_count = sorted(node_entity_count, key = lambda x: x[1], reverse = True)
        #print(node_entity_count)
        for i in range(min(tweet_cutoff, len(tweet_set))):
            summary += [tweet_set[node_entity_count[i][0]].text]
    return summary

In [8]:
import json
import os
def createDataset(hashtags, count, json_save_file = None, pickle_loc = None):
    
    summary_dataset = {}
    ret = TweetRetriever()
    if json_save_file is not None and os.path.exists(json_save_file):
        with open(json_save_file, "r") as fp:
            summary_dataset = json.load(fp)
            
    for hasht in hashtags:
        
        if not os.path.exists(pickle_loc):
            os.mkdir(pickle_loc)
            
        tweets = ret.getTweets(hasht, count)
        summary_dataset[hasht] = {"tweets": []}
        it = 1
        for tweet in tweets:
            pickle_file_name = hasht + "_" + str(it)
            print(tweet.text)
            summary_dataset[hasht]["tweets"].append(tweet.text)
            with open(pickle_loc + "/" + pickle_file_name, "wb") as fp:
                pickle.dump(tweet, fp)
            it += 1
            print("=================")
        
        print("Enter the topic")
        topic = str(input())
        print("Please Enter User summary")
        user_summary = str(input())
        summary_dataset[hasht]["topic"] = topic
        summary_dataset[hasht]["user_summary"] = user_summary
        with open(json_save_file, "w") as fp:
            json.dump(summary_dataset, fp, indent = 4)

In [78]:
def getpickeledTweets(pickle_dir, hashtag, count):
    it = 1
    tweets = []
    while it < count:
        try:
            file = pickle_dir + "/" + hashtag + "_" + str(it)
            with open(file, 'rb') as fp:
                tweets.append(pickle.load(fp))
        except:
            raise
        it += 1
    return tweets
                
def getRouge1Score(dataset, pickle_dir, alpha, beta, tweets_summary_count, show_scores = False, rouge_thres = 0.5):
    
    with open(dataset, 'r') as fp:
        dataset = json.load(fp)
    
    score_data = []
    for hashtag, data in dataset.items():
        tweets = getpickeledTweets(pickle_dir, hashtag, len(data["tweets"]))
        topic = data['topic'].split(' ')
        try:
            graph = createGraph(topic, tweets)
        except:
            continue
        avg = graph.get_avg_thres()
        partitions = partitionGraph(graph, alpha, beta, high_rank_thres = avg)
        
        
        psummary = summarization(partitions, tweet_cutoff = tweets_summary_count)
        user_summary = data['user_summary']
        
        summary = ' '.join(psummary)
        #print("========")
        #print(summary)
        summary = re.sub(r'http\S+', '', summary)
        summary = re.sub('[^A-Za-z0-9]+', ' ', summary)
        user_summary = re.sub('[^A-Za-z0-9]+', ' ', user_summary)
        #print("=========")
        #print(summary)
        #print("=========")
        #print(user_summary)
        
        rouge = Rouge()
        ROUGEscores = rouge.get_scores(summary, user_summary)[0]['rouge-1']['f']
        
        bleu_summary = summary.split(' ')
        bleu_user_summary = user_summary.split(' ')
        BLEUscore = nltk.translate.bleu_score.sentence_bleu([bleu_summary], bleu_user_summary, weights = [1])
        f1 = 2*(BLEUscore * ROUGEscores) / ( BLEUscore + ROUGEscores)
        if show_scores:
            print(hashtag, "\n", "rouge 1 ", ROUGEscores, " bleu-score ", BLEUscore, " F1 ", f1)
        if ROUGEscores < 0.15: # and ROUGEscores < 0.5:
            for tweet in tweets:
                print(tweet.text)
                print("-------------------")
            print(psummary)
            print("=================")
            
        score_data.append(ROUGEscores)
    
    avg_score = 0
    for data in score_data:
        avg_score += data
    print(avg_score / len(score_data))
        

In [32]:
def testcode():
    for i in np.arange(0,1,0.1):
        for j in np.arange(0, 1, 0.1):
            print(round(i,2), round(j,2), end = " ")
            try:
                getRouge1Score("dataset_1.json", "Datasets_1", i, j, 2)
            except:
                print("No output")
        

In [33]:
hashtagset = ["#balasahebthackeray"]
createDataset(hashtagset, 1, "dataset_3.json", "Datasets_3")

KeyboardInterrupt: 

In [79]:
getRouge1Score("combined_data.json", "CDataset", 0.1, 0.2, 4, False)

We are Reds ❤️we are Liverpool 
Keep going to the title 🏆
 #LIVMCI
-------------------
City got comprehensively chowed today in every way. VAR included #LIVMCI
-------------------
i could jerk this glorious scene off till the end of the days #LFC #LIVMCI https://t.co/H3uqlEYNQ8
-------------------
I just hope y'all are planning to hand Fabinho the MOTM award straight up! Man's a midfield monster. #LIVMCI
-------------------
More than anything I was worried about Lovern vs Agüero but The croatian was tooo good tonight #LIVMCI
-------------------
#LIVMCI @ManCity i just wanna say that Aguero performed his worst tonight, he needs to earn game time from now on becoz dat shit waz not acceptable . @aguerosergiokun u disappointed me today man o man 💔
-------------------
*plays Anfield Rap on loop all night long*

#LIVMCI
-------------------
I'm not sure what it's called in England, but in Texas, USA... We call it an ass whoopin' #LIVMCI #LiverpoolManchesterCity  #LiverpoolManCity
------------

Just finished watching Joker for the second time. I enjoyed it as much as the first time but I noticed more little details. God I hope Joaquin Phoenix wins the Oscar. #Joker #JokerMovie #JokerFilm
-------------------
Is it just me or is it getting crazier out there?
#joker #jokermovie #jokermovie2019 #jokerart https://t.co/xLLFOAb68h
-------------------
Joker Part 5 is ready to watch via link below, this wraps up this tutorial series 😉🙏🏻 thanks again to all who have been watching. 

https://t.co/OctiIGeXvH

#airbrushasylum #airbrushart #joker #joaquinphoenix #videotutorial #learntoairbrush https://t.co/B0F7ALugzD
-------------------
Careful this is how #joker became a serial killer https://t.co/Q43SqwzAfz
-------------------
Joker Lollipop at Target #chupachups #joker #batman #dccomics https://t.co/PcqtkB1o03
-------------------
I got to go see this movie. 

#JokerMovie #JokerFilm #joker2019 #jokermemes #Joker https://t.co/90KfT1AbCv
-------------------
sketches 
#thedarkknight #joker2

0.28209525101117444


In [80]:
tweets = getpickeledTweets("CDataset", "#RamMandir", 51)

In [81]:
graph = createGraph(['Ram', 'Mandir', 'Ayodhya'], tweets)

In [82]:
graph.get_all_node_values()

[('thankful', 0.0014586058939796618),
 ('owaisi', 0.0014626757815777204),
 ('said', 0.0014626757815777204),
 ('alms', 0.0014626757815777204),
 ('we do', 0.0014626757815777208),
 ('want', 0.0014626757815777208),
 ('acres', 0.0014626757815777208),
 ('author', 0.0014689137341255363),
 ('vote', 0.0014805093788363133),
 ('eternal', 0.0014805093788363133),
 ('deity', 0.001494385224682649),
 ('stood', 0.0015145018129331812),
 ('ram mandir construction', 0.0015145018129331814),
 ('arrested', 0.0015151755229324269),
 ('inflammatory', 0.0015151755229324269),
 ('social media', 0.0015151755229324269),
 ('posts', 0.0015151755229324269),
 ('named', 0.0015521085749999298),
 ('article 370', 0.0015677977061849483),
 ('owaisi jailed nowhera', 0.0015737312190643373),
 ('the reaction', 0.0015823391231062868),
 ('supporters', 0.0015823391231062868),
 ('shows', 0.0015823391231062868),
 ('issue', 0.0015823391231062868),
 ('thanks to', 0.0015823391231062868),
 ('impact', 0.0015823391231062868),
 ('gets', 0.00