In [1]:
import tweepy
import csv
import pandas as pd
import numpy as np
import networkx as nx
import os
from Events_NER.TweetSegmenter import SEDTWikSegmenter

In [2]:
# Data file handed to the segmenter (presumably a list of Wikipedia titles,
# judging by the variable name — confirm against the Events_NER package).
wiki_titles_file = "Events_NER/data/final.txt"
# `segmenter` is a notebook-level global; Tweet._get_named_entities reads it.
# NOTE(review): the meaning of the positional arguments 4, 3 and False is not
# visible in this notebook — check SEDTWikSegmenter's signature.
segmenter = SEDTWikSegmenter(wiki_titles_file, 4, 3, False)

Initializing SEDTWik Segmenter
SEDTWik Segmenter Ready



In [3]:
class Tweet():
    """Wrapper around a tweepy status object used as the unit of analysis.

    Attributes:
        id: the status id; also used for hashing and equality.
        text: the fullest text available for the status (see ``_get_text``).
        user_info: ``status_response.user`` as delivered.
        entitites: ``status_response.entities`` as delivered. The misspelling is
            the historical attribute name and is kept for existing callers.
        entities: correctly spelled alias of ``entitites``.
        json: a copy of ``status_response._json`` in which ``'text'`` is replaced
            by ``self.text`` and ``entities['user_mentions']`` /
            ``entities['hashtags']`` are flattened to plain lists of strings
            (mention ``name`` and hashtag ``text`` respectively). This is the
            dict handed to the segmenter.
    """

    def __init__(self, status_response):
        """Build a Tweet from a status object.

        Args:
            status_response: object exposing ``id``, ``user``, ``entities`` and
                ``_json`` plus one of the text attributes read by ``_get_text``.
        """
        self.id = status_response.id
        self._get_text(status_response)
        self.user_info = status_response.user
        self.entitites = status_response.entities
        self.entities = self.entitites

        # Work on copies: flattening the entities inside the caller's own
        # ``_json`` dict would make a second Tweet(status) fail, because the
        # mentions would already be strings instead of dicts.
        self.json = dict(status_response._json)
        self.json['text'] = self.text
        flattened = dict(self.json.get('entities') or {})
        flattened['user_mentions'] = [
            mention['name'] for mention in flattened.get('user_mentions', [])
        ]
        flattened['hashtags'] = [
            hashtag['text'] for hashtag in flattened.get('hashtags', [])
        ]
        self.json['entities'] = flattened

    def _get_text(self, status):
        """Store the fullest available text of ``status`` in ``self.text``.

        For a retweet the text of the retweeted status is used. Lookup order
        on the chosen status: ``extended_tweet["full_text"]``, ``full_text``,
        then ``text``.

        Raises:
            AttributeError: if none of the text attributes is present.
        """
        source = getattr(status, "retweeted_status", None)  # Check if Retweet
        if source is None:
            source = status
        self.text = self._full_text_of(source)

    @staticmethod
    def _full_text_of(status):
        """Return the longest text representation available on ``status``."""
        try:
            return status.extended_tweet["full_text"]
        except (AttributeError, KeyError, TypeError):
            pass
        try:
            return status.full_text
        except AttributeError:
            # Last resort; raises AttributeError if ``text`` is absent as well.
            return status.text

    # code for generating named entities of each tweet
    def _get_named_entities(self):
        """Return the segments produced by the notebook-level ``segmenter``."""
        return segmenter.tweet_segmentation(self.json)

    # code for generating event phrases
    def _get_event_phrases(self):
        """Return event phrases for this tweet (currently always empty)."""
        return []

    def get_graph_entities(self):
        """Return named entities followed by event phrases as one list."""
        return self._get_named_entities() + self._get_event_phrases()

    def __hash__(self):
        return self.id

    def __eq__(self, other):
        # Defer to the other operand instead of raising AttributeError when
        # compared with something that is not a Tweet.
        if not isinstance(other, Tweet):
            return NotImplemented
        return self.id == other.id

def filterDuplicates(tweets):
    """Drop tweets whose ``text`` has already been seen.

    The first tweet carrying a given text is kept; input order is preserved.
    Returns a new list and leaves ``tweets`` untouched.
    """
    seen_texts = set()
    unique = []
    for candidate in tweets:
        if candidate.text in seen_texts:
            continue
        unique.append(candidate)
        seen_texts.add(candidate.text)
    return unique

class TweetRetriever():
    """Authenticated tweepy client that fetches English tweets for a query.

    Credentials are never stored in the notebook. Each one is taken from the
    matching constructor argument or, when that is omitted, from the
    environment variable listed in ``_ENV_VARS``. Any key that has ever been
    committed in a notebook should be considered leaked and rotated.
    """

    # constructor argument -> environment variable used as fallback
    _ENV_VARS = {
        'consumer_key': 'TWITTER_CONSUMER_KEY',
        'consumer_secret': 'TWITTER_CONSUMER_SECRET',
        'access_token': 'TWITTER_ACCESS_TOKEN',
        'access_token_secret': 'TWITTER_ACCESS_TOKEN_SECRET',
    }

    def __init__(self, consumer_key=None, consumer_secret=None,
                 access_token=None, access_token_secret=None):
        """Create the API client.

        Args:
            consumer_key, consumer_secret, access_token, access_token_secret:
                optional explicit credentials; each missing one is read from
                the environment.

        Raises:
            RuntimeError: if a credential is neither passed nor present in the
                environment. The message names the missing variables.
        """
        credentials = {
            'consumer_key': consumer_key,
            'consumer_secret': consumer_secret,
            'access_token': access_token,
            'access_token_secret': access_token_secret,
        }
        for field, env_name in self._ENV_VARS.items():
            if not credentials[field]:
                credentials[field] = os.environ.get(env_name)

        missing = [env_name for field, env_name in self._ENV_VARS.items()
                   if not credentials[field]]
        if missing:
            raise RuntimeError(
                "Missing Twitter API credentials; set the environment variable(s): "
                + ", ".join(missing)
            )

        auth = tweepy.OAuthHandler(credentials['consumer_key'],
                                   credentials['consumer_secret'])
        auth.set_access_token(credentials['access_token'],
                              credentials['access_token_secret'])
        self.api = tweepy.API(auth, wait_on_rate_limit=True)

    def getTweets(self, hashtag, count = 10, max_tweets = None):
        """Search for ``hashtag`` and return the de-duplicated tweets.

        Args:
            hashtag: search query passed as ``q``.
            count: forwarded as the ``count`` keyword of the search request.
                It does NOT cap the number of tweets returned.
            max_tweets: optional cap on the number of statuses pulled from the
                cursor. ``None`` (the default) keeps the historical behaviour
                of iterating the cursor without a limit.

        Returns:
            list of ``Tweet`` objects after ``filterDuplicates``.
        """
        cursor = tweepy.Cursor(self.api.search, q = hashtag, count = count,
                               tweet_mode = 'extended', lang = 'en')
        statuses = cursor.items(max_tweets) if max_tweets else cursor.items()
        tweets = [Tweet(status) for status in statuses]
        return filterDuplicates(tweets)

    

In [18]:
class GraphNode():
    """A graph vertex for one entity name.

    Attributes:
        name: the entity string; identity (hash and equality) is based on it.
        tweets: set of tweet objects in which the entity occurred.
        value: node score, 0 until a ranking step assigns one.
    """

    def __init__(self, name):
        self.name = name
        self.tweets = set()
        self.value = 0

    def add_tweet(self, tweet):
        """Record that ``tweet`` mentions this entity."""
        self.tweets.add(tweet)

    def common_tweets(self, other):
        """Return how many tweets this node shares with ``other``."""
        return len(self.tweets.intersection(other.tweets))

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        # Comparing with a non-node must yield False rather than raise
        # AttributeError on the missing ``name``.
        if not isinstance(other, GraphNode):
            return NotImplemented
        return self.name == other.name

    def __repr__(self):
        return "GraphNode({!r})".format(self.name)
    
class TweetGraph():
    """Entity co-occurrence graph built from a collection of tweets.

    Attributes:
        topic: the topic container given at construction; only used by
            ``get_topic_similarity`` through an ``in`` test.
        nodes: dict mapping entity name -> ``GraphNode``.
        edge_map: nested dict ``edge_map[a][b]`` holding the number of tweets
            shared by the nodes named ``a`` and ``b`` (stored in both
            directions, self-pairs included).
    """

    def __init__(self, topic):
        self.topic = topic
        self.nodes = {}
        self.edge_map = {}

    def add_entity(self, name, tweet_ref):
        """Attach ``tweet_ref`` to the node called ``name``, creating it if needed."""
        if name not in self.nodes:
            self.nodes[name] = GraphNode(name)
        self.nodes[name].add_tweet(tweet_ref)

    def add_edge(self, node1, node2):
        """Store the shared-tweet count of two registered nodes in both directions.

        The weight is always overwritten, so calling this again after more
        tweets were added refreshes the stored value.

        Raises:
            AssertionError: if either node's name is not in ``self.nodes``.
        """
        assert node1.name in self.nodes
        assert node2.name in self.nodes
        weight = node1.common_tweets(node2)
        self.edge_map.setdefault(node1.name, {})[node2.name] = weight
        self.edge_map.setdefault(node2.name, {})[node1.name] = weight

    def compute_all_edges(self):
        """(Re)compute the weight of every node pair, self-pairs included."""
        all_nodes = list(self.nodes.values())
        # add_edge writes both directions, so each unordered pair is visited once.
        for index, node1 in enumerate(all_nodes):
            for node2 in all_nodes[index:]:
                self.add_edge(node1, node2)

    def _get_pagerank_matrix(self):
        """Return the edge weights as a row-normalised square float array.

        Rows and columns follow the iteration order of ``self.nodes``. A node
        without any recorded weight gets an all-zero row instead of triggering
        a division by zero.
        """
        names = [node.name for node in self.nodes.values()]
        matrix = np.zeros((len(names), len(names)))
        for i, source in enumerate(names):
            row = self.edge_map.get(source, {})
            weights = [row.get(target, 0) for target in names]
            total = sum(weights)
            if total:
                matrix[i] = [weight / total for weight in weights]
        return matrix

    def set_textrank_values(self, d = 0.85):
        """Run PageRank with damping ``d`` and store each score in ``node.value``.

        Does nothing for a graph without nodes.
        """
        if not self.nodes:
            return
        # NOTE(review): the row-normalised matrix is generally asymmetric while
        # from_numpy_array is called without create_using — confirm whether a
        # directed graph (create_using=nx.DiGraph) was intended.
        rank_graph = nx.from_numpy_array(self._get_pagerank_matrix())
        node_scores = nx.pagerank(rank_graph, alpha = d)
        for i, node in enumerate(self.nodes.values()):
            node.value = node_scores[i]

    def get_weight(self, node1, node2):
        """Return the stored weight between two nodes, 0 if none is recorded."""
        return self.edge_map.get(node1.name, {}).get(node2.name, 0)

    def get_topic_similarity(self, node):
        """Return the node's tweet count when its name is ``in`` the topic, else 1."""
        if node.name in self.topic:
            return len(node.tweets)
        return 1

    def get_all_node_values(self):
        """Return ``(name, value)`` pairs sorted by ascending value."""
        pairs = [(node.name, node.value) for node in self.nodes.values()]
        return sorted(pairs, key = lambda pair: pair[1])

    def get_avg_thres(self):
        """Return the mean node value, or 0 for a graph without nodes."""
        values = self.get_all_node_values()
        if not values:
            return 0
        return sum([pair[1] for pair in values])/len(values)

    def get_nodes_above_thres(self, thres = 1):
        """Return ``(nodes, value_sum)`` for nodes whose value exceeds ``thres``."""
        nodes = []
        value_sum = 0
        for node in self.nodes.values():
            if node.value > thres:
                nodes.append(node)
                value_sum += node.value
        return nodes, value_sum
        

In [5]:
def createGraph(topic, tweets):
    """Build a ranked ``TweetGraph`` for ``topic`` out of ``tweets``.

    Every entity reported by ``tweet.get_graph_entities()`` becomes (or
    extends) a node referencing that tweet; afterwards all edges are computed
    and the textrank values are assigned. Returns the finished graph.
    """
    graph = TweetGraph(topic)
    for tweet in tweets:
        for entity_name in tweet.get_graph_entities():
            graph.add_entity(entity_name, tweet)
    graph.compute_all_edges()
    graph.set_textrank_values()
    return graph

def partitionGraph(tweetGraph, alpha, beta, high_rank_thres = 1):
    """Greedily group the highly ranked nodes of ``tweetGraph`` into partitions.

    Nodes whose value exceeds ``high_rank_thres`` are processed from the
    highest value downwards. The current top node acts as representative; a
    remaining node joins it when both its edge weight to the representative
    and its own topic similarity, each divided by the representative's topic
    similarity, exceed ``alpha``. The group is kept (and its members removed
    from further consideration) only when its share of the total value of all
    highly ranked nodes exceeds ``beta``; otherwise only the representative
    is discarded.

    Returns a list of partitions, each a list of nodes.
    """
    candidates, total_value_sum = tweetGraph.get_nodes_above_thres(high_rank_thres)
    remaining = list(candidates)
    remaining.sort(key = lambda candidate: candidate.value)
    partitions = []

    while remaining:
        # the best remaining node seeds the next group
        seed = remaining.pop()
        seed_similarity = tweetGraph.get_topic_similarity(seed)
        members = {seed}
        group_value = seed.value

        for other in remaining:
            edge_weight = tweetGraph.get_weight(seed, other)
            other_similarity = tweetGraph.get_topic_similarity(other)
            if edge_weight/seed_similarity <= alpha:
                continue
            if other_similarity/seed_similarity <= alpha:
                continue
            members.add(other)
            group_value += other.value

        if not (group_value/total_value_sum > beta):
            continue

        remaining = [node for node in remaining if node not in members]
        partitions.append(list(members))

    return partitions
    

In [6]:
# Create the retriever; its constructor sets up the authenticated tweepy API client.
ret = TweetRetriever()

In [194]:
# Fetch tweets matching '#gravity'. The second argument is forwarded as the
# `count` keyword of the search request; the cursor itself is iterated without
# an item limit, so this is a network-bound call that can run long or fail.
tweets = ret.getTweets('#gravity', 1)

TweepError: Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /1.1/search/tweets.json?max_id=1186334810874077184&q=%23gravity&count=1&tweet_mode=extended&lang=en (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x7f64499c1e10>, 'Connection to api.twitter.com timed out. (connect timeout=60)'))

In [192]:
# Inspect the graph entities (named entities + event phrases) of the first tweet.
tweets[0].get_graph_entities()

['do you', 'think', 'day6', 'superpower', 'day', 'day jae', 'mydays', 'sungjin', 'wonpil', 'dowoon', 'young k', 'jae', 'gravity', 'entropy', 'the book of us', 'day gravity', 'dayworldtour', 'day', '데이식스', 'sweet chaos', 'day sweet chaos']


['do you',
 'think',
 'day6',
 'superpower',
 'day',
 'day jae',
 'mydays',
 'sungjin',
 'wonpil',
 'dowoon',
 'young k',
 'jae',
 'gravity',
 'entropy',
 'the book of us',
 'day gravity',
 'dayworldtour',
 'day',
 '데이식스',
 'sweet chaos',
 'day sweet chaos']

In [193]:
# Number of tweets left after getTweets removed duplicate texts.
len(tweets)

9

In [183]:
# Build the entity graph for the topic list ['gravity'] and rank its nodes.
graph = createGraph(['gravity'], tweets)

['interstellar', 'space travel', 'time travel', 'tech', 'astronomy', 'astrophysics', 'space', 'science', 'physics']
['interstellar']
['introducing', 'next-gen', 'trickshot', 'solar', 'interstellar', 'out of this world']
['soundtrack', 'amazing', 'listen', 'do you', 'listen', 'interstellar', 'writers', 'writing', 'writing community']
['major major', 'thanks to', 'dropping', 'alison wonderland', 'sxmelectro', 'in rotation', 'insomniac records', 'interstellar', 'radio wonderland']
['rocket', 'science.', 'dog', 'science', 'physics', 'biology', 'cosmos', 'space', 'chemistry', 'blackhomeschoolers', 'astronomy', 'nasa', 'universe', 'spaceexploration', 'nature', 'interstellar', 'mars', 'solarsystem', 'art', 'study', 'science', 'school']
['hours', 'to go', 'neck', 'going to', 'don’t', 'let it happen', 'folks', 'votes', 'help', 'rob', 'ross', 'gotyoucovered', 'bond', 'interstellar', 'movie', 'moviechat', 'filmtwitter', 'movie poster', 'cinema']
['bunch', 'dancing', 'people', 'tight', 'clothes', 

In [184]:
# (entity name, node value) pairs, sorted by ascending value.
graph.get_all_node_values()

[('space travel', 0.007132641058254577),
 ('time travel', 0.007132641058254577),
 ('tech', 0.007132641058254577),
 ('astrophysics', 0.007735622579121715),
 ('for the first time', 0.00866752878287172),
 ('comets', 0.008738474102461341),
 ('sky', 0.008738474102461341),
 ('telescope', 0.008738474102461341),
 ('maria lorca', 0.00898215702361635),
 ('the avengers', 0.00898215702361635),
 ('groundbreaking', 0.008982157023616351),
 ('sciencefiction', 0.008982157023616351),
 ('borisov', 0.00905087105960878),
 ('solar system', 0.00905087105960878),
 ('introducing', 0.009078096450958245),
 ('next-gen', 0.009078096450958245),
 ('trickshot', 0.009078096450958245),
 ('solar', 0.009078096450958245),
 ('out of this world', 0.009078096450958245),
 ('police unit', 0.009083684364101941),
 ('at the time', 0.009083684364101941),
 ('hard beat', 0.009083684364101941),
 ('trusty', 0.009083684364101943),
 ('the only', 0.009083684364101943),
 ('alistair mcknight', 0.009083684364101943),
 ('ghettos', 0.00908368

In [185]:
# Mean node value; used below as the cut-off for "highly ranked" nodes.
avg = graph.get_avg_thres()

In [186]:
# Partition the above-average nodes with alpha = 0.05 and beta = 0.01.
partitions = partitionGraph(graph, 0.05, 0.01, high_rank_thres=avg)

In [187]:
def summarization(partitions, tweet_cutoff = 1):
    """Pick the most entity-dense tweets of every partition.

    For each partition, all tweets attached to its nodes are collected and
    ranked by how many of the partition's node names occur in the tweet text
    (plain, case-sensitive substring test). The texts of the top
    ``tweet_cutoff`` tweets are appended to the result.

    Args:
        partitions: iterable of partitions, each an iterable of nodes exposing
            ``name`` and ``tweets``.
        tweet_cutoff: maximum number of tweets taken per partition. A
            partition holding fewer tweets contributes all it has; values
            below 1 contribute nothing.

    Returns:
        list of tweet texts, partition by partition.
    """
    # NOTE(review): node names shown in this notebook are lower-case while
    # tweet text keeps its original casing — confirm whether the substring
    # match should be case-insensitive.
    summary = []
    limit = max(tweet_cutoff, 0)
    for part in partitions:
        tweet_set = set()
        for node in part:
            tweet_set = tweet_set.union(node.tweets)
        candidates = list(tweet_set)
        # Stable sort: tweets with equal counts keep their order in `candidates`.
        ranked = sorted(
            candidates,
            key = lambda tweet: sum(1 for node in part if node.name in tweet.text),
            reverse = True,
        )
        # Slicing (instead of indexing 0..tweet_cutoff-1) cannot run past the end.
        summary.extend(tweet.text for tweet in ranked[:limit])
    return summary

In [188]:
# Dump every partition as "<entity name> <tweet count>" lines, closing each
# partition with a separator line.
for partition in partitions:
    for member in partition:
        tweet_total = len(member.tweets)
        print(member.name, tweet_total)
    print("====\n")

astronomy 2
space 3
science 3
interstellar 13
comet 2
physics 2
====



In [149]:
# Print up to the first three summary tweets separated by "=====" lines.
# Slicing instead of indexing s[0], s[1], s[2] keeps the cell from raising
# IndexError when fewer than three partitions were produced.
s = summarization(partitions)
print("\n=====\n".join(s[:3]))


Renting is a far better and wiser option to choose when compared to buying furniture for your start up office. #entrepreneur #startupindia #startups #india #startupslist https://t.co/cAAXtLeM3F
=====
Within a span of less than a week, #OPEC held high-level meetings with the two fastest-growing oil consuming nations in the world – #India and #China – to further strengthen cooperation in support of oil market stability and the world economy. https://t.co/kNajp5ildg
=====
Funeral of soldier Zahid Farooqi, martyred yesterday, in his native Kalsan village of #Haveli district with full military honours.
In #IHK, #Kashmiris fight against #India &amp; in #AJK they fight for #Pakistan. That's what many #Indians either fail to understand or tend to ignore. https://t.co/qzM615KshC


In [160]:
# Text of one tweet attached to the first node of the first partition.
# `tweets` is a set, so which tweet ends up at index 0 is not a meaningful order.
list(partitions[0][0].tweets)[0].json['text']

'We need not to visualize a black hole now as the image is available. Powehi, a  black hole whose image was issued b… https://t.co/06Y5pQTmmU'