In [1]:
import sys, logging, datetime, warnings

from pymongo import MongoClient
import pandas as pd  
import numpy as np
from utils import tokenize_and_stem
from nltk.corpus import stopwords

# Connect to the local MongoDB server and select the `twitter` database.
client = MongoClient('localhost', 27017)
db = client['twitter']

# Project only the fields that the rest of the notebook reads.
pipeline = [
    {
        "$project": {
            "id_user": 1,
            "replyTo_userId": 1,
            "created_at": 1,
            "hashtag": 1,
            "mention": 1,
        }
    }
]

cursor_list = list(db['tweets'].aggregate(pipeline))
tweets = pd.DataFrame(cursor_list)

# Source field -> column label used by the rest of the notebook
# (insertion order defines the column order of `data`).
column_labels = {
    'id_user': 'Utente',
    'replyTo_userId': 'Retweettato da',
    'created_at': 'timestamp',
    'hashtag': 'hashtag',
    'mention': 'mention',
}
data = tweets[list(column_labels)].rename(columns=column_labels)

In [None]:
from birdy.twitter import UserClient, TwitterApiError, TwitterRateLimitError

import os

# Twitter API credentials are read from the environment so that secrets never
# have to be typed into (and saved with) the notebook. A variable that is not
# set falls back to the empty string, i.e. the previous placeholder value.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', '')
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')

# NOTE: this rebinds `client`, which referred to the MongoDB connection in the
# first cell; from here on `client` is the Twitter API client.
client = UserClient(CONSUMER_KEY,
                    CONSUMER_SECRET,
                    ACCESS_TOKEN,
                    ACCESS_SECRET)

In [None]:
import time

# Seconds to pause when the API reports that the rate limit has been reached.
RATE_LIMIT_WAIT_SECONDS = 15 * 60

users = set(data['Utente'])

# Maps the name returned by the API to the user id it was requested for.
# NOTE(review): the name is the dictionary key, so two accounts that report the
# same name overwrite each other — confirm this is acceptable.
names = {}

for user in users:

    # Keep asking for the same user until the lookup either succeeds or fails
    # with something other than a rate limit. The retry goes through the same
    # try/except, so a failure after the pause no longer aborts the whole loop.
    while True:
        try:
            name = client.api.users.show.get(id=user).data.name

        except TwitterRateLimitError as err:
            # Must be listed before TwitterApiError so the rate-limit case is
            # not swallowed by the more general handler.
            print(err, 'aspetto 15')
            time.sleep(RATE_LIMIT_WAIT_SECONDS)

        except TwitterApiError as err:
            # The user cannot be resolved: report it and drop all of its rows.
            print(err)
            data = data.drop(data[data.Utente == user].index)
            break

        else:
            names[name] = user
            break


In [None]:
# Number of distinct names collected in `names`.
len(names)

In [None]:
import pickle

# Save the `names` dictionary to disk in pickle format.
with open('names.pickle', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(names)

In [2]:
import pickle

# Restore the `names` dictionary from disk.
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
with open('names.pickle', 'rb') as pickle_file:
    names = pickle.Unpickler(pickle_file).load()

In [3]:
import networkx as nx

# Directed graph with one node per user id stored in `names`;
# every node carries its own id as the `label` attribute.
G = nx.DiGraph()
G.add_nodes_from((user_id, {'label': user_id}) for user_id in names.values())

![Mention edges](img/mention.png)

In [4]:
# Keep only the rows that have a value in the `mention` column.
mention = data.dropna(subset=['mention'])

In [5]:
# For each user, gather the `mention` values of all of that user's rows into one list.
# The result is a Series indexed by `Utente`.
mention_df = mention.groupby('Utente')['mention'].apply(list)

In [None]:
# Display the per-user mention lists.
mention_df

In [6]:
# Add an edge user -> mentioned account for every mention that matches a key of `names`.
# The loop variables are deliberately not called `mention`: that name holds the
# filtered DataFrame, and overwriting it would break re-running the groupby cell.
# NOTE(review): each `mention` value is matched as a whole against `names`; if a
# value can hold several comma-separated mentions (as `hashtag` does), it would
# need splitting first — confirm against the data.
for user, mentioned_names in mention_df.items():
    for mentioned_name in mentioned_names:
        if mentioned_name in names:
            G.add_edge(user, names[mentioned_name])

![Retweet edges](img/retweet.png)

In [7]:
# Keep the rows whose "Retweettato da" value differs from -1 (rows with a missing
# value also pass this comparison).
# NOTE(review): -1 presumably marks "no target user" — confirm against the data source.
# reset_index() without drop=True keeps the previous index as an extra `index` column.
retweet = data[data["Retweettato da"] != -1].reset_index()

In [8]:
# For each user, gather the "Retweettato da" values of all of that user's rows into one list.
# The result is a Series indexed by `Utente`.
retweet_df = retweet.groupby('Utente')['Retweettato da'].apply(list)

In [None]:
# Display the per-user lists of "Retweettato da" values.
retweet_df

In [9]:
# Build the lookup set once: `x in names.values()` rescans every value on each test,
# whereas set membership is constant time.
known_users = set(names.values())

# Add an edge target -> user for every "Retweettato da" value that is a known user id.
# The loop variables are deliberately not called `retweet`: that name holds the
# filtered DataFrame, and overwriting it would break re-running the groupby cell.
for user, target_users in retweet_df.items():
    for target_user in target_users:
        if target_user in known_users:
            G.add_edge(target_user, user)

![Hashtag edges](img/hashtag.png)

In [10]:
# Keep only the rows that have a value in the `hashtag` column.
hashtag = data.dropna(subset=['hashtag'])

In [11]:
# Renumber the rows 0..n-1 (`drop` expects a boolean; the old index is discarded).
hashtag = hashtag.reset_index(drop=True)
# Discard the columns that the hashtag analysis does not use. `errors='ignore'`
# keeps the cell re-runnable: on a second run the columns are already gone.
hashtag = hashtag.drop(['mention', 'Retweettato da'], axis=1, errors='ignore')

In [12]:
def splitDataFrameList(df,target_column,separator):
    ''' Split a delimited string column so that every element gets its own row.

    df = dataframe to split,
    target_column = the column containing the values to split
        (every value must be a string; missing values have to be removed first)
    separator = the symbol used to perform the split
    returns: a new dataframe, indexed 0..n-1, with each entry for the target
    column separated and each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.
    An empty input yields an empty dataframe that keeps the input's columns.
    '''
    new_rows = []
    # Iterate explicitly rather than calling `df.apply` only for its side effects.
    for _, row in df.iterrows():
        base_row = row.to_dict()
        for piece in row[target_column].split(separator):
            new_row = dict(base_row)
            new_row[target_column] = piece
            new_rows.append(new_row)

    if not new_rows:
        # Without this the result would have no columns at all and callers that
        # select a column (e.g. `result['Utente']`) would raise KeyError.
        return pd.DataFrame(columns=df.columns)

    # `pd` is the alias imported in the first cell; the function no longer relies
    # on the bare `import pandas` that only appears in a later cell.
    return pd.DataFrame(new_rows)

In [13]:
import pandas

# One row per (user, hashtag): split the comma-separated hashtag strings,
# keep only users present in `names`, then order the rows by timestamp
# and renumber them 0..n-1.
hashtag_split = splitDataFrameList(hashtag, 'hashtag', ',')
known_user_mask = hashtag_split['Utente'].isin(names.values())
hashtag_final = (
    hashtag_split[known_user_mask]
    .sort_values('timestamp')
    .reset_index(drop=True)
)

In [None]:
# hashtag_grouped = hashtag_final.drop(['timestamp'], axis=1)
# hashtag_grouped = hashtag_grouped[hashtag_grouped['Utente'].isin(names.values())]
# hashtag_grouped = hashtag_grouped.groupby('hashtag')['Utente'].apply(list).reset_index()

In [None]:
from itertools import combinations_with_replacement

# Link users that used the same hashtag. `hashtag_final` is ordered by timestamp,
# and groupby keeps that order inside each group, so every edge points from the
# user of an earlier row to the user of a later row with the same hashtag.
# Pairing a row with itself (a self-loop) is kept, exactly as before.
# Grouping by hashtag avoids comparing every row with every later row.
for _, tag_users in hashtag_final.groupby('hashtag', sort=False)['Utente']:
    G.add_edges_from(combinations_with_replacement(tag_users.tolist(), 2))

In [None]:
# Remove self-loops.
# The `G.selfloop_edges()` method is not available in recent NetworkX releases,
# so the loops are collected directly from the edge list. Building the list first
# also avoids removing edges while the graph is being iterated.
self_loops = [(source, target) for source, target in G.edges() if source == target]
G.remove_edges_from(self_loops)

In [None]:
# Report the size of the graph. The label belongs to print(), not len():
# len() accepts exactly one argument and raised TypeError before.
print('number of nodes:', len(G.nodes()))
print('number of edges:', len(G.edges()))

In [None]:
# Export the graph in GEXF format to the path "relGraph" (the path is used as given,
# without a file extension).
nx.write_gexf(G, "relGraph")