# Use this on a Twitter account to data about the activity of a network

## Setup

### Import these packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tweepy
import datetime
import json
import os

### Choose your parameters

In [2]:
# Screen name of network's center
center = 'erdosinstitute'
# Name to save files to
name = 'Test'
try: os.mkdir('./'+name)
except: pass
# Number of mutuals to find
breadth = 2
# Number of times to find them
depth = 3

### Choose your senitment analyzer. For example here we use Vader

In [3]:
# Necessary parts of the analyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# The analyzer itself
def analyzer(tweet):
    return SentimentIntensityAnalyzer().polarity_scores(tweet)['compound']

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ysgard\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
## We could also use this
#from joblib import load
#import nltk
#from nltk.corpus import stopwords
#nltk.download('stopwords')
#import string
#from nltk.stem import PorterStemmer
#import re
#
#stop_words=stopwords.words('english')
#punct=string.punctuation
#stemmer=PorterStemmer()
#
#vectord = load('vectord.joblib') 
#model = load('model.joblib') 
#
#def analyzer(tweet):
#    #clean the text
#    #this removes mentions
#    twittre = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
#    #this removes urls
#    twittre = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', twittre)        
#    #this removes everything but letters from the str
#    twittre = re.sub('[^a-zA-Z]',' ',twittre)
#    #this makes everything lowercase, then turns everything that is separated by a space into its own str
#    twittre = twittre.lower().split()
#    #this removes stop words and stems everything else
#    twittre = [stemmer.stem(word) for word in twittre if (word not in stop_words)]
#    # this puts it all back together with spaces inbetween
#    twittre = ' '.join(twittre)
#    activity.['Cleaned'] = cleets
#
#    probs = model.predict_proba(vectord.transform(tweet).toarray())
#    return probs[:,1]-probs[:,0]


### You'll need a Twitter developer account

In [5]:
# Enter your credentials
API_key = ''
API_secret_key = ''
access_token = ''
access_token_secret = ''

# Connect to the API
authenticator = tweepy.OAuthHandler(API_key, API_secret_key)
authenticator.set_access_token(access_token, access_token_secret)
API = tweepy.API(authenticator, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

## Functions

### Definitions

In [6]:
def build_network(user, breadth=10, depth=2, network=[], save=False):
    #type user as a screen name so this works
    howrite = 'a'
    
    # Make sure we're finding the right user
    if type(user)==str:
        user = API.get_user(screen_name=user).id
        howrite = 'w'
    elif type(user)!=int:
        raise Exception("Bad user.")
    
    #make sure we have good data
    try:
        friends = API.friends_ids(user_id=user)
        friends[0]
    except: 
        return False
    
    #one list for user's mutuals, one list for everyone
    mutuals = []
    network.append(user)
    if save:
        try:
            with open(save+'.txt', howrite) as save_file:
                save_file.write(str(user)+'\n')
        except: 
            raise Exception('Bad save file')
    
    #we won't delve too deep or too greedily
    if depth:
        #find random friends of user who follow user back (mutuals)
        for friend in np.random.permutation(friends):
            friend = int(friend)
            if friend not in network:
                if API.show_friendship(source_id=friend, target_id=user)[0].following:
                    #find their mutuals, but only add them to the list if it's good data
                    if build_network(friend, breadth=breadth, depth=depth-1, network=network, save=save):
                        #add them to the list
                        mutuals.append(friend)
                    #but only some of them
                    if len(mutuals)==breadth:
                        print(friend,'has mutuals',mutuals)
                        break
    
    #and here's the list
    return network

In [7]:
def get_activity(user, count=200,
                 streak_lengths=[datetime.timedelta(minutes=30),datetime.timedelta(hours=6)], 
                 per_periods=[datetime.timedelta(hours=1),datetime.timedelta(days=1)], save=False):
    # Make sure we're finding the right user
    if type(user)==str:
        apitivity = API.user_timeline(screen_name=user, count=count)
    elif type(user)==int:
        apitivity = API.user_timeline(user_id=user, count=count)
    else:
        raise Exception("Bad user.") 
        
    # Make a list of the statuses
    activiray = []
    #and add the relevant data
    for tweet in apitivity:
        try: 
            tweet.retweeted_status
            is_retweet = True
        except: 
            is_retweet = False
        activiray.append([tweet.text,
                         tweet.id,
                         is_retweet,
                         tweet.is_quote_status,
                         type(tweet.in_reply_to_status_id)==int,
                         tweet.retweet_count,
                         tweet.favorite_count,
                         tweet.created_at])
        
    # turn it into a dataframe
    activity = pd.DataFrame(activiray, 
                            columns=['Text', 'Id', 'Is_retweet', 'Is_quote', 'Is_reply', 'Retweets', 'Favorites', 'Created'])
    
    # record the time since the last status update
    tsl = [activity.Created[n].to_pydatetime() - activity.Created[n+1].to_pydatetime()
                                   for n in range(len(activity)-1)] + [np.nan]
    activity['Time_since_last'] = tsl.copy()
    
    #record how many times in a row the last activity was less than length 
    for length in streak_lengths:
        #record when it was less than length
        streak_break = [time>length for time in tsl[:-1]]+[True]
        
        #intialize beginning of a streak
        start = 0
        streak = []
        for end in range(len(streak_break)):
            #if this is the end of a streak,  
            if streak_break[end]:
                #record it,
                streak += [end+1-start for indx in range(end+1-start)]
                #and begin the next
                start = end+1
        
        #finally, add it to the data frame
        activity['Streak of '+str(length)] = streak
        
    #record how much activity occured within a given period 
    for period in per_periods:
        per = []
        for n in range(len(tsl)):
            elapsed = tsl[n]
            count, m = 0, 1
            #first we look at earlier tweets
            while (m+n<len(tsl)) and (elapsed<=period/2):
                count += 1
                try: elapsed += tsl[m+n]
                except: pass
                m += 1
            m = 0
            elapsed = datetime.timedelta(0)
            #then later tweets
            while n>=m and elapsed<=period/2:
                count += 1
                m += 1
                try: elapsed += tsl[n-m]
                except: pass
            per += [count]
        activity['Per '+str(period)] = per
        
    if save:
        activity.to_json(save, index=False)
        
    return activity

In [19]:
def assemble_activities(network, activities=pd.DataFrame(), save=False):    
    #concatenate each 
    for n, user in enumerate(network):
        try:
            activity = get_activity(user, count=1000)
            activity['Predicted_polarity'] = [analyzer(text) for text in activity.Text]
            activity['User'] = [user for n in range(len(activity))]
            activities = pd.concat([activities, activity])
        except: continue
        if not n%10: print('Finished', n)
    
    #reindex because concatenation screwed it up
    activities.index = range(len(activities.index))
    
    if save:
        activities.to_json(save+'_full.json')
        activities[['Is_retweet','Is_quote','Is_reply','Retweets','Favorites','Created','Time_since_last','Streak of 0:30:00','Streak of 6:00:00','Per 1:00:00',"Per 1 day, 0:00:00",'Predicted_polarity','User']].to_csv(save+'.csv', index=False)
        
    return activities

In [53]:
def frame_data(core, crust, activities, data=[], save=False, restart=0):
    #we assign core members as features
    columns = ['user']+['user_'+str(n) for n in range(len(core))]
    if save and not restart: 
        with open(save+'.csv', 'r') as save_file:
            text = ['user'] + save_file.readlines()
        for column in columns:
            text[0] += ',' + str(column)
        text[0] += '\n'
        with open(save+'.csv', 'a') as save_file:
            save_file.writelines(text)
        
    #now we see which crust members are following which core members
    for n, crust_member in [x for x in enumerate(crust) if x[0]>=restart]:
        #this will hold what we find
        next_line = [crust_member]
        #look up who the crust member is following
        following, pages = API.friends_ids(user_id=crust_member, cursor=-1)
        
        for core_member in core:
            #page through if necessary
            while pages[1] and (core_member not in following):
                more_following, pages = API.friends_ids(user_id=crust_member, cursor=pages[1])
                following += more_following
            next_line.append(core_member in following)
        
        #We'll say people follow themselves
        if n < len(core):
            next_line[n]==True
        
        #now we store it
        data.append(next_line)
        if save: 
            with open(save+'.csv', 'r') as save_file:
                text = save_file.readlines()
            text[n+1] = text[n+1][:-1]
            for datum in next_line:
                text[n+1] += ',' + str(datum)
            text[n+1] += '\n'
            with open(save+'.csv', 'w') as save_file:
                save_file.writelines(text)
        print('Stored data for member', n+1, 'of', len(crust))
    
    #and put it all together
    df = pd.DataFrame(data, index=crust, columns=columns)
    
    #attach each user's average predicted polarity
    averages = []
    for user in network:
        polarities = activities.Predicted_polarity[[i for i in activities.index if activities.User[i]==user]]
        mean = (0 if len(polarities)==0 else np.mean(polarities))
        averages.append(mean)
    df['Average_polarity'] = averages
    if save:
        df.to_csv(save+'.csv', index=False)
    
    return df

### Implementation

In [10]:
network = []
network = build_network(center, breadth, depth, network, './'+name+'/network')

Rate limit reached. Sleeping for: 141


925904609511690240 has mutuals [1175725425949073409, 925904609511690240]
92508756 has mutuals [1151938716438552577, 92508756]
834453600730484736 has mutuals [1149011056716500997, 834453600730484736]
100554366 has mutuals [541774909, 100554366]


Rate limit reached. Sleeping for: 881


196290445 has mutuals [24183453, 196290445]
252677295 has mutuals [18162640, 252677295]
38720479 has mutuals [23627253, 38720479]


In [20]:
activities=pd.DataFrame()
activities=assemble_activities(network, activities, './'+name+'/activity')

Finished 0
Finished 10


This next one takes a while. Go play with Activity_investigator while you wait.

In [54]:
with open('./'+name+'/network.csv', 'w+')as writer:
    writer.writelines('\n'.join([str(user) for user in network]))
data = []
df = frame_data(network, network, activities, data, './'+name+'/network')

Stored data for member 1 of 15
Stored data for member 2 of 15
Stored data for member 3 of 15
Stored data for member 4 of 15
Stored data for member 5 of 15
Stored data for member 6 of 15
Stored data for member 7 of 15
Stored data for member 8 of 15
Stored data for member 9 of 15
Stored data for member 10 of 15
Stored data for member 11 of 15
Stored data for member 12 of 15
Stored data for member 13 of 15


Rate limit reached. Sleeping for: 651


Stored data for member 14 of 15
Stored data for member 15 of 15


Now you can play with the rest in Network_investigator!