# Twitter API Data Gathering

### This file will gather data from twitter using the twint api
##### -  Grab 50 popular tweets using the movie title as a hashtag from our movies dataset
##### - For each movie, count up the number of likes, retweets, and replies across the tweets
##### - Consolidate info into a dataframe and export to a csv for EDA in a separate notebook


In [1]:
# import libraries that are needed

import twint
import nest_asyncio
import pandas as pd
import numpy as np
import datetime as dt
from datetime import timedelta
import json
import string


%matplotlib inline
nest_asyncio.apply() # Will get runtime errors without this. Essentially, 
                     # if we make one twint object, it will keep running, preventing us from running again

In [2]:
# Load the Box Office Mojo domestic chart and parse its 'Date' column
# (strings in YYYY-MM-DD form) into pandas datetimes.
# NOTE(review): an earlier comment described this as a 2020-only subset, but no
# year filter is applied here -- confirm whether the CSV is already restricted.

movies_df = pd.read_csv(
    '../data/Box_Office_Mojo_Domestic_Box_Office_Chart.csv'
).assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'))


In [3]:
# Run one twint search per movie and keep every twint.Config object in
# `c_values` (same order as movies_df). Each search also writes its results to
# '<Title>.json' in the working directory; later cells read those files back
# through the stored config's `Output` attribute. A movie whose search fails is
# recorded as None so the list stays aligned with movies_df.
# Note this takes several hours.

#%%capture
c_values = []
for title, release_date in zip(movies_df['Title'], movies_df['Date']):

    try:

        # Search window: from the movie's date in the dataset to 60 days later.
        # NOTE(review): a title containing spaces does not form a single
        # hashtag once '#' is prefixed -- confirm this search term is intended.
        search = '#' + title
        since = str(release_date)[:10]  # keep only the YYYY-MM-DD part
        window_end = dt.datetime.strptime(since, '%Y-%m-%d') + timedelta(days=60)
        until = window_end.strftime('%Y-%m-%d')

        # Configure the search, run it, and keep the config object so that its
        # Output path can be looked up again when the JSON is parsed.
        c = twint.Config()
        c.Search = search
        c.Since = since
        c.Until = until
        c.Limit = 50
        c.Popular_tweets = True
        c.Hide_output = True
        c.Store_json = True
        c.Output = '{}.json'.format(title)
        twint.run.Search(c)
        c_values.append(c)

    except Exception:
        # Best effort: one bad row (missing title/date, failed search) must not
        # abort the whole run. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt through, so this multi-hour loop can
        # still be stopped by hand.
        c_values.append(None)
    

CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 64.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 125.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 216.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 343.0 secs
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for

sleeping for 125.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 216.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 343.0 secs
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 64.0 secs
CRITICAL:root:tw

In [4]:
# Tweets are full of emojis and other characters that carry no value for this
# data gathering and, per the notebook authors, got in the way of json.loads().
# This helper reduces a tweet dump to plain printable text.
def remove_non_printables(s):
    """Return `s` with every character that is not in string.printable removed.

    string.printable covers ASCII digits, letters, punctuation and whitespace,
    so newlines survive while emojis and accented letters are dropped.
    """
    return ''.join(char for char in s if char in string.printable)

In [5]:
# Build `movie_tweets`: one entry per item in c_values, each entry being the
# list of tweets (parsed JSON objects) read from that item's output file, which
# is treated as one JSON document per line.

movie_tweets = []
bug_is = []  # indexes whose output file could not be read; inspecting these is
             # how we found out that remove_non_printables was needed
for i, config in enumerate(c_values):

    # Read the file as one long string, remove the bad characters, and split on
    # newlines so that each element is one serialized tweet.
    try:
        # NOTE(review): utf-8 is assumed because JSON text is normally UTF-8;
        # confirm that this matches how the files were written.
        with open(config.Output, encoding='utf-8') as tweet_file:
            tweet_text = remove_non_printables(tweet_file.read())
    except (AttributeError, OSError, UnicodeError):
        # AttributeError: the entry is None (no config was stored for it).
        # OSError: the file is missing or unreadable.
        # UnicodeError: the file contents could not be decoded.
        bug_is.append(i)
        # Start from an empty list so a failed movie never inherits the
        # tweets parsed for the previous movie.
        tweet_list = []
    else:
        tweet_list = tweet_text.split('\n')

    # Parse each line on its own; lines that are not valid JSON (for example
    # the empty string after the final newline) are skipped.
    jsons = []
    for tweet in tweet_list:
        try:
            jsons.append(json.loads(tweet))
        except json.JSONDecodeError:
            continue

    movie_tweets.append(jsons)


In [6]:
# Every element of movie_tweets is the list of tweet JSON objects collected for
# one movie. Turn each of those lists into its own dataframe, keeping the order.
dfs = [pd.DataFrame(tweets) for tweets in movie_tweets]

In [7]:
# From inspection, we noticed duplicate tweets. We drop them in place based on
# the 'tweet' column.
for df in dfs:
    # A dataframe built from an empty list of tweets has no columns at all, and
    # drop_duplicates raises KeyError for a missing subset column -- skip those.
    if 'tweet' in df.columns:
        df.drop_duplicates(subset='tweet', inplace=True)

In [8]:
# Spot check on a single movie (position 5): print its summed likes, retweets
# and replies, then the title stored at the same position in movies_df.
sample_df = dfs[5]
for count_column in ('likes_count', 'retweets_count', 'replies_count'):
    print(sample_df[count_column].sum())
print(movies_df.iloc[5]['Title'])

430
288
51
[REC] 4: Apocalypse


In [9]:
# Print the number of movies and the number of tweet dataframes, one per line;
# the two values should be equal.
print(len(movies_df), len(dfs), sep='\n')

4035
4035


In [10]:
# Accumulator for the per-movie Twitter summaries: starts out empty and is
# meant to hold one dictionary per movie.
twitter_data = []

In [11]:
# Loop through the dataframes, creating one dictionary of Twitter totals per
# movie. twitter_data is rebuilt from scratch here, so running this cell more
# than once does not duplicate rows.

def _column_total(frame, column):
    """Return the sum of `column` in `frame`, or 0 if `frame` lacks that column.

    A dataframe built from an empty list of tweets has no columns, so plain
    indexing would raise KeyError for it.
    """
    return frame[column].sum() if column in frame.columns else 0


# Select the title column once instead of materializing a full row per movie.
movie_titles = movies_df['Title']
twitter_data = [
    {
        'Movie': movie_titles.iloc[i],
        'Likes': _column_total(df, 'likes_count'),
        'Retweets': _column_total(df, 'retweets_count'),
        'Replies': _column_total(df, 'replies_count'),
    }
    for i, df in enumerate(dfs)
]
    

In [12]:
# Assemble the per-movie totals into one final dataframe and write it to
# 'twitter_movies.csv' so it can be analyzed in a separate notebook.
twitter_df = pd.DataFrame(data=twitter_data)
twitter_df.to_csv(path_or_buf='twitter_movies.csv')