# Use this on a Twitter account to find bad eggs.

## Setup

### Import these packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tweepy
import datetime
import json

### Choose your parameters

In [21]:
# Screen name of the account at the network's center
center = 'erdosinstitute'
# Base name used for the files that get saved
name = 'Test'
# Number of mutuals to find per account
# (presumably passed as build_network's `breadth`; the call is not in this section)
breadth = 2
# Number of times to repeat the search on the mutuals that were found
# (presumably passed as build_network's `depth`; the call is not in this section)
depth = 3

### Choose your sentiment analyzer. For example, here we use VADER

In [5]:
# Necessary parts of the analyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# The analyzer itself
def analyzer(tweet):
    """Return VADER's compound polarity score for one piece of text.

    Args:
        tweet: The text to score.

    Returns:
        The 'compound' entry of the analyzer's polarity scores.
    """
    # Build the analyzer once and reuse it: constructing it for every tweet
    # repeats the same setup work on each call.
    if not hasattr(analyzer, '_vader'):
        analyzer._vader = SentimentIntensityAnalyzer()
    return analyzer._vader.polarity_scores(tweet)['compound']

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Ysgard\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
## We could also use this
#from joblib import load
#import nltk
#from nltk.corpus import stopwords
#nltk.download('stopwords')
#import string
#from nltk.stem import PorterStemmer
#import re
#
#stop_words=stopwords.words('english')
#punct=string.punctuation
#stemmer=PorterStemmer()
#
#vectord = load('vectord.joblib') 
#model = load('model.joblib') 
#
##tweets must be cleaned first
#def analyzer(tweets, analogue=False):
#    #clean the text
#    cleets = []
#    for twittre in activity.Text:
#        #this removes mentions
#        twittre = re.sub(r'@[A-Za-z0-9_]+', '', twittre)
#        #this removes urls
#        twittre = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', twittre)        
#        #this removes everything but letters from the str
#        twittre = re.sub('[^a-zA-Z]',' ',twittre)
#        #this makes everything lowercase, then turns everything that is separated by a space into its own str
#        twittre = twittre.lower().split()
#        #this removes stop words and stems everything else
#        twittre = [stemmer.stem(word) for word in twittre if (word not in stop_words)]
#        # this puts it all back together with spaces inbetween
#        twittre = ' '.join(twittre)
#        cleets.append(twittre)
#    activity['Cleaned'] = cleets
#
#    probs = model.predict_proba(vectord.transform(tweets).toarray())
#    return probs[:,1]-probs[:,0]


### You'll need a Twitter developer account

In [17]:
# Enter your Twitter developer credentials (all four are required by the
# two calls below; keep real values out of shared copies of this notebook)
API_key = ''
API_secret_key = ''
access_token = ''
access_token_secret = ''

# Connect to the API. `API` is the module-level client that the functions
# below call; it is asked to wait when a rate limit is hit.
# NOTE(review): `wait_on_rate_limit_notify`, together with the `friends_ids`
# and `show_friendship` calls used later, looks like the tweepy 3.x
# interface -- confirm against the installed tweepy version.
authenticator = tweepy.OAuthHandler(API_key, API_secret_key)
authenticator.set_access_token(access_token, access_token_secret)
API = tweepy.API(authenticator, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

## Functions

In [None]:
def build_network(user, breadth=10, depth=2, network=None, save=False):
    """Recursively collect a user and a random sample of their mutuals.

    A "mutual" is a friend of `user` (someone `user` follows) who follows
    `user` back.

    Args:
        user: Screen name (str) or numeric user id (int). A screen name is
            resolved to its id and starts a new save file (mode 'w');
            a numeric id appends to the save file.
        breadth: Stop scanning a user's friends once this many mutuals have
            been added.
        depth: Number of further levels of mutuals to explore. With 0 (or
            less) only `user` itself is recorded.
        network: List of user ids found so far. It is extended in place;
            a new list is created when omitted.
        save: Path of a text file that receives one user id per line, or a
            falsy value to skip saving.

    Returns:
        The `network` list (including `user`), or False when the user's
        friend list could not be fetched or is empty.

    Raises:
        Exception: "Bad user." if `user` is neither a str nor an int;
            "Bad save file" if writing to `save` fails.
    """
    # Use a fresh list per top-level call. A shared default list would carry
    # ids from one network over into the next one.
    if network is None:
        network = []

    # type user as a screen name so this works
    write_mode = 'a'

    # Make sure we're finding the right user
    if isinstance(user, str):
        user = API.get_user(screen_name=user).id
        write_mode = 'w'
    elif not isinstance(user, int):
        raise Exception("Bad user.")

    # make sure we have good data
    try:
        friends = API.friends_ids(user_id=user)
        # an empty friend list raises IndexError here and counts as bad data
        friends[0]
    except Exception:
        return False

    # one list for user's mutuals, one list for everyone
    mutuals = []
    network.append(user)
    if save:
        try:
            with open(save, write_mode) as save_file:
                save_file.write(str(user) + '\n')
        except Exception as err:
            raise Exception('Bad save file') from err

    # we won't delve too deep or too greedily
    if depth > 0:
        # find random friends of user who follow user back (mutuals)
        for friend in np.random.permutation(friends):
            friend = int(friend)
            if friend in network:
                continue
            if not API.show_friendship(source_id=friend, target_id=user)[0].following:
                continue
            # find their mutuals, but only add them to the list if it's good data
            if build_network(friend, breadth=breadth, depth=depth - 1,
                             network=network, save=save):
                mutuals.append(friend)
            # but only some of them
            if len(mutuals) == breadth:
                print(user, 'has mutuals', mutuals)
                break

    # and here's the list
    return network

In [None]:
def _time_gaps(created):
    """Return each status's time gap to the next row, ending with NaN."""
    stamps = [stamp.to_pydatetime() for stamp in created]
    if not stamps:
        return []
    return [here - after for here, after in zip(stamps, stamps[1:])] + [np.nan]


def _streak_sizes(gaps, length):
    """Return, per row, the size of the run of gaps <= `length` it belongs to."""
    if not gaps:
        return []
    # a gap longer than `length` ends a streak; the last row always ends one
    streak_break = [gap > length for gap in gaps[:-1]] + [True]

    sizes = []
    start = 0
    for end, broken in enumerate(streak_break):
        if broken:
            run = end + 1 - start
            # every row of the streak is labelled with the streak's size
            sizes += [run] * run
            start = end + 1
    return sizes


def _nearby_counts(gaps, period):
    """Return, per row, how many rows lie within `period`/2 on either side."""
    half = period / 2
    counts = []
    for n in range(len(gaps)):
        nearby = 0

        # first walk down the frame (towards row n+1, n+2, ...)
        elapsed, m = gaps[n], 1
        while (m + n < len(gaps)) and (elapsed <= half):
            nearby += 1
            try:
                elapsed += gaps[m + n]
            except TypeError:
                # the final gap is NaN and cannot be added to a timedelta
                pass
            m += 1

        # then walk up the frame; the first pass counts row n itself
        elapsed, m = datetime.timedelta(0), 0
        while n >= m and elapsed <= half:
            nearby += 1
            m += 1
            try:
                elapsed += gaps[n - m]
            except TypeError:
                # index -1 wraps around to the trailing NaN
                pass
        counts.append(nearby)
    return counts


def get_activity(user, count=200,
                 streak_lengths=(datetime.timedelta(minutes=30), datetime.timedelta(hours=6)),
                 per_periods=(datetime.timedelta(hours=1), datetime.timedelta(days=1)),
                 save=False):
    """Fetch a user's timeline and describe its posting rhythm.

    Args:
        user: Screen name (str) or numeric user id (int).
        count: Number of statuses requested from `API.user_timeline`.
        streak_lengths: Timedeltas; each adds a 'Streak of <length>' column
            holding the size of the run of consecutive statuses (gaps no
            longer than <length>) that the row belongs to.
        per_periods: Timedeltas; each adds a 'Per <period>' column holding
            the number of statuses (the row itself included) found within
            half of <period> on either side of the row.
        save: Path to write the frame to as JSON, or a falsy value to skip.

    Returns:
        A DataFrame with one row per status and the columns 'Text', 'Id',
        'Is_retweet', 'Is_quote', 'Is_reply', 'Retweets', 'Favorites',
        'Created', 'Time_since_last', plus the streak and per-period
        columns. A timeline with no statuses gives an empty frame with the
        same columns.

    Raises:
        Exception: "Bad user." if `user` is neither a str nor an int.
    """
    # Make sure we're finding the right user
    if isinstance(user, str):
        statuses = API.user_timeline(screen_name=user, count=count)
    elif isinstance(user, int):
        statuses = API.user_timeline(user_id=user, count=count)
    else:
        raise Exception("Bad user.")

    # Make a list of the statuses and add the relevant data
    rows = [[tweet.text,
             tweet.id,
             # only retweets carry a `retweeted_status` attribute
             hasattr(tweet, 'retweeted_status'),
             tweet.is_quote_status,
             isinstance(tweet.in_reply_to_status_id, int),
             tweet.retweet_count,
             tweet.favorite_count,
             tweet.created_at]
            for tweet in statuses]

    # turn it into a dataframe
    activity = pd.DataFrame(rows,
                            columns=['Text', 'Id', 'Is_retweet', 'Is_quote', 'Is_reply',
                                     'Retweets', 'Favorites', 'Created'])

    # record the time between each status and the one in the next row
    gaps = _time_gaps(activity.Created)
    activity['Time_since_last'] = list(gaps)

    # record how many statuses in a row were posted less than `length` apart
    for length in streak_lengths:
        activity['Streak of ' + str(length)] = _streak_sizes(gaps, length)

    # record how much activity occurred within a given period
    for period in per_periods:
        activity['Per ' + str(period)] = _nearby_counts(gaps, period)

    if save:
        # index=False is only accepted with orient 'split' or 'table';
        # with the default orient pandas raises ValueError.
        activity.to_json(save, orient='split', index=False)

    return activity

In [None]:
def assemble_activities(network , save=False):
    """Collect the scored activity of every user in `network` into one frame.

    For each user this fetches the activity with `get_activity`, runs
    `prep_activity` on it, stores `sentiment(activity.Cleaned, True)` in a
    'Predicted_polarity' column and the user in a 'User' column.
    NOTE(review): `prep_activity` and `sentiment` are not defined in this
    section of the notebook -- confirm they exist before running.

    Args:
        network: Iterable of users (screen names or ids) to process.
        save: Path to write the combined frame to as JSON, or a falsy value
            to skip saving.

    Returns:
        A DataFrame with the rows of every user that could be processed;
        empty if there were none. Users whose processing raises are
        reported and skipped.
    """
    frames = []

    for n, user in enumerate(network):
        try:
            # NOTE(review): get_activity defaults to count=200; check whether
            # the timeline endpoint actually honours a larger count.
            activity = get_activity(user, count=1000)
            prep_activity(activity)
            activity['Predicted_polarity'] = sentiment(activity.Cleaned, True)
            activity['User'] = user
            frames.append(activity)
        except Exception as err:
            # best effort: keep going, but say who was dropped and why
            print('Skipped user', user, '-', repr(err))
            continue
        if not n%10: print(n)

    # concatenate once at the end instead of re-copying the frame per user
    activities = pd.concat(frames) if frames else pd.DataFrame()

    if save:
        # index=False needs orient 'split' or 'table' in DataFrame.to_json
        activities.to_json(save, orient='split', index=False)

    return activities
    
    

In [None]:
def frame_data(core, crust, data=None, save=False, restart=0):
    """Tabulate which crust members follow which core members.

    Args:
        core: Sequence of user ids; member k becomes the column 'core_k'.
        crust: Sequence of user ids; each becomes one row of the result.
        data: List of rows collected so far. Rows are appended to it in
            place, so a caller keeps partial results if a run is
            interrupted. A new list is created when omitted. When resuming
            with `restart`, pass the rows gathered by the earlier run so
            that there is one row per crust member at the end.
        save: Path of an existing text file with one line per crust member,
            or a falsy value to skip saving. On a fresh run (`restart` of 0)
            a header line is inserted at the top; after each member that
            member's line gets the row's values appended, comma separated.
        restart: Position in `crust` to resume from; earlier members are
            skipped.

    Returns:
        A DataFrame of booleans indexed by `crust` with one column per core
        member.
    """
    # A fresh list per call; a shared default would accumulate rows from
    # earlier calls and no longer match the length of `crust`.
    if data is None:
        data = []

    # we assign core members as features
    columns = ['core_'+str(n) for n in range(len(core))]
    if save and not restart:
        with open(save, 'r') as save_file:
            member_lines = save_file.readlines()
        header = ''.join(',' + str(column) for column in columns) + '\n'
        with open(save, 'w') as save_file:
            save_file.writelines([header] + member_lines)

    # now we see which crust members are following which core members
    for n, crust_member in enumerate(crust):
        if n < restart:
            continue

        # look up who the crust member is following
        ids, pages = API.friends_ids(user_id=crust_member, cursor=-1)
        following = set(ids)

        # this will hold what we find
        next_line = []
        for core_member in core:
            # page through if necessary (pages[1] is the next cursor)
            while pages[1] and (core_member not in following):
                ids, pages = API.friends_ids(user_id=crust_member, cursor=pages[1])
                following.update(ids)
            next_line.append(core_member in following)

        # We'll say people follow themselves
        # (assumes the first len(core) crust members are the core members,
        # in the same order -- TODO confirm with the caller)
        if n < len(core):
            next_line[n] = True

        # now we store it
        data.append(next_line)
        if save:
            with open(save, 'r') as save_file:
                text = save_file.readlines()
            # line 0 is the header, so member n lives on line n+1
            text[n+1] = (text[n+1].rstrip('\n')
                         + ''.join(',' + str(datum) for datum in next_line)
                         + '\n')
            with open(save, 'w') as save_file:
                save_file.writelines(text)
        print('Stored data for member', n+1, 'of', len(crust))

    # and finally put it all together
    return pd.DataFrame(data, index=crust, columns=columns)