In [1]:
#Jared Balkman
#DS710
#Final Project - Python - Data Gathering and Processing
#16 December, 2020

In [2]:
#import libraries

#for the Twitter session and collecting tweet data
import tweepy

#for data processing
import numpy as np
import pandas as pd

#to save the raw data before processing
import pickle

#for polarity and subjectivity scores, respectively
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

#for removing URLs and username mentions in the tweet text
#I added comments where I used regular expressions (two lines of code) to show my understanding of what they're doing
import re

In [3]:
#run the script containing my Twitter credentials
%run C:/Users/jared/OneDrive/Desktop/DS710/twitter_credentials.py
    
#authenticate
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)

#Connect to the Twitter API using the authentication
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [4]:
#Next we'll define all functions

In [5]:
#Note: For some reason I couldn't pass the date in as an argument and get a successful search, so I kindly ask that 
#for your purposes (assuming 2020-12-14 is more than 7 days ago) you adjust the date below in the api.search function

def get_x_tweets(num_needed):
    """Take in the number of tweets needed, perform a REST API search using the 10 most common words on Twitter,
       and return the search results in a list"""
    
    tweet_list = []
    last_id = -1 # id of last tweet seen
    while len(tweet_list) < num_needed:
        
        try:
            new_tweets = api.search(q = 'the OR i OR to OR a OR and OR is OR in OR it OR you OR of until:2020-12-14',
                                    
                                    #restrict to English language because sentiment dictionaries are only in English
                                    lang='en',
                                    
                                    #restrict the count to the rate limit per 15 minute window
                                    count = 180,
                                    max_id = str(last_id - 1),
                                    
                                    #extended tweet mode to make sure we get the full text of each tweet
                                    tweet_mode='extended',
                                    wait_on_rate_limit=True,
                                    wait_on_rate_limit_notify=True)
            
        except tweepy.TweepError as e:
            print("Error", e)
            break
            
        else:
            if not new_tweets:
                print("Could not find any more tweets!")
                break
            tweet_list.extend(new_tweets)
            last_id = new_tweets[-1].id
            
    return tweet_list
            

In [6]:
#mostly for archival purposes but also in case I goofed something up and had to start over

def pickle_raw_data(filename, data):
    """Takes a data object and pickles it with the given filename"""
    
    with open('tweet_list.pickle', 'wb') as raw_tweets:
        pickle.dump(tweet_list, raw_tweets, protocol=pickle.HIGHEST_PROTOCOL)
    
    return None

In [7]:
def df_from_tweet_json(tweet_list):
    """Gets the _json dictionaries each Tweet Status object in a list and puts them all in a pandas dataframe"""
    
    tweet_list_json = [tweet_list[x]._json for x in range(len(tweet_list))]
    df = pd.DataFrame(tweet_list_json)
    
    return df

In [8]:
def classify_retweets(df):
    """Adds a logical column to the dataframe with value True if the tweet is a retweet and False if an original tweet"""
    
    #pre-allocate the column
    df.loc[:, 'is_retweet'] = 0
    
    #change 'retweeted_status' attribute of original tweets from NaN to 0
    df['retweeted_status'] = df['retweeted_status'].fillna(0)
    
    #fill in the values for the 'is_retweet' column
    df.loc[df['retweeted_status'] != 0, 'is_retweet'] = True
    df.loc[df['retweeted_status'] == 0, 'is_retweet'] = False
    
    return df

In [9]:
def subset_for_analysis(df, list_of_attributes):
    """This takes a list of attributes from the _json dataframe and creates a new dataframe of those attributes. Some of
       these attributes - namely 'user' and 'retweeted_status' are dictionaries themselves, and we'll use this function
       again on those dictionaries to get the attributes we want out of there"""
    
    df_subset = df[list_of_attributes]
    
    return df_subset

In [10]:
def concat_dfs(frames):
    """Concatenates a list of dataframes horizontally along the x-axis"""
    
    df_flattened = pd.concat(frames, axis=1)
    
    return df_flattened

In [11]:
def combine_columns(df, list_of_column_lists):
    """For a given dataframe, combines pairs of columns from that dataframe where the first column's NaN values are 
       replace with the second column's values for corresponding rows. Then, deletes the second column and returns
       the modified dataframe"""
    
    for column_pair in list_of_column_lists:
        column_pair[0].fillna(column_pair[1], inplace=True)
        del column_pair[1]
        
    return df

In [12]:
def clean_tweet_text(df, text_column):
    """Makes a list of strings from a dataframe column, removes URLs, user mentions, and newline characters from each
       string, and replaces the old dataframe column with a cleaned version. Returns the updated dataframe"""
    
    tweettext = [text_column[i] for i in range(len(text_column))]
    
    #Here are my regular expression substitutions with comments for what's going on:
    
    #Remove URLs:
    #'r' - Python will not interpret backslash sequences. Needed because of the backslashes in URLs
    #'http' - the beginning of the string to search for
    #'\' - indicates a special sequence
    #'S' - that sequence being all following non-whitespace characters. In other words, the entirety of the URL
    #'+' - matches repititions in case there are multiple URLs
    
    tweettext = [re.sub(r'http\S+', '', tweettext[i]) for i in range(len(tweettext))]
    
    #same as above, except we don't worry about backslashes in the @username sequence
    tweettext = [re.sub('@\S+', '', tweettext[i]) for i in range(len(tweettext))]
    
    #Finally, remove '\n' characters
    tweettext_clean = [tweettext[i].replace('\n', '') for i in range(len(tweettext))]
    
    #And replace the text column with the cleaned version:
    df['text_column'] = tweettext_clean
    
    return df
                       

In [13]:
def pol_and_subj_scores(df, list_of_strings):
    """Computes TextBlob subjectivity score and vaderSentiment compound polarity score for each string in a list;
       adds those scores as attributes in a dataframe and returns the updated dataframe"""
    #TextBlob polarity, a score from -1 (negative) to 1 (positive). Ultimately not used for analysis
    df['textblob_polarity'] = [TextBlob(list_of_strings[i]).polarity for i in range(len(df))]
    #TextBlob subjectivity, a score from 0 (objective) to 1 (most subjective)
    df['textblob_subjectivity'] = [TextBlob(list_of_strings[i]).subjectivity for i in range(len(df))]

    #vaderSentiment polarity. vaderSentiment calculates a positive, negative, and neutral score for a string, and then
    #averages them all into one 'compound' score that will be used for this analysis. But all four values were collected
    analyzer = SentimentIntensityAnalyzer()
    vs = [analyzer.polarity_scores(sentence) for sentence in list_of_strings]
    
    #add the scores to the df and then only keep the compound score. Probably a more efficient way to do this
    df['vaderSentiment_polarity_scores'] = vs
    
    df = pd.concat([df.drop(['vaderSentiment_polarity_scores'], axis=1),
                         pd.DataFrame(df['vaderSentiment_polarity_scores'].tolist())], axis=1)
    
    return df

In [14]:
def remove_duplicates(df, attribute):
    """Removes rows of a dataframe based on duplicate values of an attribute in that dataframe,
       and returns the dataframe."""
    
    df.drop_duplicates([attribute], inplace=True, ignore_index=True)
        
    return df

In [15]:
#Somehow even with the wait_on_rate_limit arguments passed when connecting to the API,
#I was still getting rate limit errors for a bit. I used this line to check when I could
#do another search

api.rate_limit_status('search')

{'rate_limit_context': {'access_token': '1075678699-KyvUfsgguposFgmoluF28hksrqyGNxTdFwmCO4N'},
 'resources': {'search': {'/search/tweets': {'limit': 180,
    'remaining': 180,
    'reset': 1608186805}}}}

In [16]:
#Run the code!

In [17]:
#Get the tweets. I got back 10037 using num_needed = 10000. The number of tweets obviously can be adjusted to preference
tweet_list = get_x_tweets(1000)

In [18]:
#Extract the _json and create a pandas dataframe
df = pd.DataFrame(df_from_tweet_json(tweet_list))

#Make the boolean column 'is_retweet'
df = classify_retweets(df)

#Get the subset of the dataframe we're interested in
df_tweets = subset_for_analysis(df, ['created_at',
                                     'id',
                                     'full_text',
                                     'user',
                                     'retweeted_status',
                                     'retweet_count',
                                     'favorite_count',
                                     'is_retweet'])

#Now we need to make new dataframes from the 'user' and 'retweeted_status' attributes, because they contain the user
#info and, for retweets, info on the original tweet that we want for analysis
df_tweets_userinfo = df_tweets['user'].apply(pd.Series)
df_tweets_retweetinfo = df_tweets['retweeted_status'].apply(pd.Series)

#Next, get the subsets of those dataframes that we're interested in
df_tweets_userinfo = subset_for_analysis(df_tweets_userinfo, ['id',
                                                              'name',
                                                              'screen_name',
                                                              'followers_count',
                                                              'verified'])

df_tweets_retweetinfo = subset_for_analysis(df_tweets_retweetinfo, ['created_at',
                                                                    'id',
                                                                    'full_text',
                                                                    'user',
                                                                    'retweet_count',
                                                                    'favorite_count'])

#We have to do this one more time, for the 'user' attribute in df_tweets_retweetinfo, to get the user info for the
#author of the original tweet
df_tweets_retweetinfo_userinfo = df_tweets_retweetinfo['user'].apply(pd.Series)
df_tweets_retweetinfo_userinfo = subset_for_analysis(df_tweets_retweetinfo_userinfo, ['id',
                                                                                      'name',
                                                                                      'screen_name',
                                                                                      'followers_count',
                                                                                      'verified'])

#Now we have all the attributes we want in four dataframes. Time to concatenate:
frames = [df_tweets, df_tweets_userinfo, df_tweets_retweetinfo, df_tweets_retweetinfo_userinfo]
df_flattened = concat_dfs(frames)

#The next line drops the original 'user' and 'retweeted_status' attributes from the dataframe since we have what we
#want out of there. This is not the best way to do this.
df_flattened.drop(df_flattened.iloc[:,[3,4]], axis=1, inplace=True)

#Now I need to rename some attributes so they're all unique:
df_flattened.columns = ['created_at',
                        'id', 
                        'full_text',
                        'retweet_count',
                        'favorite_count',
                        'is_retweet',
                        'user_id',
                        'user_name',
                        'user_screen_name',
                        'followers_count',
                        'verified',
                        'orig_created_at',
                        'orig_id',
                        'orig_full_text',
                        'orig_retweet_count',
                        'orig_favorite_count',
                        'orig_user_id',
                        'orig_user_name',
                        'orig_screen_name',
                        'orig_followers_count',
                        'orig_verified']

#All of the 'orig' columns pertain to retweet data, so these have NaN values in the case of original tweets. We'll
#replace the NaN values with the values from their corresponding columns, to ultimately have one list of all, and only,
#original tweet data

df_flattened = combine_columns(df_flattened, [[df_flattened.orig_full_text, df_flattened.full_text],
                                              [df_flattened.orig_created_at, df_flattened.created_at],
                                              [df_flattened.orig_id, df_flattened.id],
                                              [df_flattened.orig_retweet_count, df_flattened.retweet_count],
                                              [df_flattened.orig_favorite_count, df_flattened.favorite_count],
                                              [df_flattened.orig_user_id, df_flattened.user_id],
                                              [df_flattened.orig_user_name, df_flattened.user_name],
                                              [df_flattened.orig_screen_name, df_flattened.user_screen_name],
                                              [df_flattened.orig_followers_count, df_flattened.followers_count],
                                              [df_flattened.orig_verified, df_flattened.verified]])

#Next, clean the tweet text:
df_flattened = clean_tweet_text(df_flattened, list(df_flattened['orig_full_text']))



#Assign polarity and subjectivity scores
df_flattened = pol_and_subj_scores(df_flattened, list(df_flattened.orig_full_text))

#Remove duplicates based on original tweet id (My results gave 888 duplicates removed to give remaining dataset of 
#9419 rows x 17 columns)
df_flattened = remove_duplicates(df_flattened, 'orig_id')

In [19]:
#Save the whole dataframe as a CSV (n=9149 for me)
df_flattened.to_csv('twitter_data.csv')
#Subset the first 100 rows for submission
df_flattened.iloc[:100, :].to_csv('twitter_data_100_tweets.csv')

#Subset the data to be used for R
twitter_data_for_r = subset_for_analysis(df_flattened, ['orig_retweet_count',
                                                        'orig_favorite_count',
                                                        'orig_followers_count',
                                                        'textblob_subjectivity',
                                                        'compound'])

#Rename the columns appropriately
twitter_data_for_r.columns = ['num_retweets',
                              'num_likes',
                              'num_followers',
                              'subjectivity',
                              'polarity']

#Originally I let Python figure out the dtypes and convert them, but they all showed up as doubles in R anyway
twitter_data_for_r = twitter_data_for_r.convert_dtypes()
twitter_data_for_r.dtypes

#Write to CSV
twitter_data_for_r.to_csv('twitter_data_for_r.csv')

#Subset the first 1000 rows for submission
twitter_data_for_r.iloc[:1000, :].to_csv('twitter_data_for_r_1000_rows.csv')

twitter_data_for_r

Unnamed: 0,num_retweets,num_likes,num_followers,subjectivity,polarity
0,90665,542299,88626936,0.750000,0.0000
1,128,868,1682,0.000000,0.3400
2,392,2307,966,0.000000,0.0000
3,569,3462,23954,0.000000,0.6476
4,24415,131209,6733,0.000000,0.0000
...,...,...,...,...,...
1032,455,542,9,0.000000,0.7125
1033,57253,195563,16068,0.000000,-0.5423
1034,0,1,8,0.300000,-0.2960
1035,46,401,25791362,0.633333,0.7579


In [20]:
#Miscellaneous

In [21]:
#Number of retweets in the dataset (mine was roughly 40%)
twitter_data_for_r.groupby(df_flattened.is_retweet).count()

Unnamed: 0_level_0,num_retweets,num_likes,num_followers,subjectivity,polarity
is_retweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,596,596,596,596,596
True,441,441,441,441,441


In [22]:
#Min/max scores for polarity and subjectivity
print(twitter_data_for_r.polarity.min(), twitter_data_for_r.polarity.max())
print(twitter_data_for_r.subjectivity.min(), twitter_data_for_r.subjectivity.max())

-0.9772 0.9891
0.0 1.0
