# Downloading data
In this notebook we download the Twitter data we want to use in our prediction tasks. We use the `twint` library to get around the typical Twitter rate limits, but we won't be abusing anything.

The process goes:

1. Pick some keywords that we want Tweets about
2. The script searches for Tweets on those topics from a certain time period
3. From these Tweets, the script then grabs the profile information of the accounts that posted them

Idea: maybe we should instead find tweets that received a certain amount of attention, grab the users who posted them, and then scrape those users' tweets over the following X days. That way we would not have as much leakage.

In [1]:
from datetime import datetime
import time

import pandas as pd
import twint
import nest_asyncio
nest_asyncio.apply()


# Date stamps (YYYY-MM-DD strings) shared by the cells below.
# Prefix of the '<date>-handles-data.csv' file that later cells load.
users_downloaded_at = "2021-08-11"
# Tweet download window, passed to twint as Since / Until and embedded in
# the name of the tweets CSV.
download_tweets_from = "2021-08-12"
download_tweets_to = "2021-08-19"

In [2]:
# Load the list of Twitter handles into a single 'username' column.
# header=0 combined with names= makes pandas treat the file's first line as a
# header row and replace it with 'username'.
# NOTE(review): assumes handles.txt starts with a header line; if it does not,
# the first handle is dropped -- confirm against the file.
usernames = pd.read_csv(
    'handles.txt',
    names=['username'],
    header=0,
)

In [3]:
# Look up the profile of every unique handle with twint, collect the per-user
# DataFrames, and write the combined table to '<today>-handles-data.csv'.
# NOTE(review): the file name uses today's date, while later cells read
# f'{users_downloaded_at}-handles-data.csv'; keep `users_downloaded_at` in sync
# with the day this cell was run.
results = []            # whatever twint left in storage.panda.User_df after each lookup
unfound_accounts = []   # handles whose lookup raised 'Cannot find twitter account'
not_worked = []         # handles that hit a RefreshTokenException on the first attempt
for i, username in enumerate(usernames.username.unique()):
    # Lightweight progress indicator every 100 handles.
    if i % 100 == 0:
        print(f'Done with {i}')

    c = twint.Config()
    c.Username = username
    c.Pandas = True
    c.Hide_output = True

    try:
        twint.run.Lookup(c)

        results.append(twint.storage.panda.User_df)
    except ValueError as e:
        # Only the "account not found" case is expected; anything else is a
        # genuine error and must surface.
        if 'Cannot find twitter account' in str(e):
            unfound_accounts.append(username)
        else:
            raise
    except twint.token.RefreshTokenException:
        # Record the handle, wait a minute, then retry exactly once.
        not_worked.append(username)
        time.sleep(60)

        try:
            twint.run.Lookup(c)
            results.append(twint.storage.panda.User_df)
        except Exception as retry_error:
            # Best effort: skip this handle, but say which one failed and why.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the loop.
            print('Getting refresh errors!', username, repr(retry_error))

# Stack the per-user frames into one table and persist it.
results = pd.concat(results)
results.to_csv(f'{datetime.today().strftime("%Y-%m-%d")}-handles-data.csv', index=False)

print(unfound_accounts)
print(not_worked)

Done with 0


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 100


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 200


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo0MTUwMDM5Mjg2', 'rest_id': '4150039286', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
Done with 300


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo3MDcwNzc0NDA=', 'rest_id': '707077440', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjoxMzY4ODM1ODg4ODkzOTg4ODY0', 'rest_id': '1368835888893988864', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjoxMzY4ODM1MDE3ODE2Njk0Nzg2', 'rest_id': '1368835017816694786', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


Done with 400


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjoxMjk0MDU4OTAxNjg4NzIxNDEz', 'rest_id': '1294058901688721413', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


Done with 500


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 600


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo2MTAwMzgwNA==', 'rest_id': '61003804', 'legacy_extended_profile': {'birthdate': {'day': 14, 'month': 7, 'visibility': 'Public', 'year_visibility': 'Self'}}, 'is_profile_translatable': False}
Done with 700
{'id': 'VXNlcjoxNzMwMTQ0MDI0', 'rest_id': '1730144024', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjo5NjA2MDQ3MTM2NjEyMjI5MTM=', 'rest_id': '960604713661222913', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 800
{'id': 'VXNlcjoyMzY3Mzg2MDA=', 'rest_id': '236738600', 'legacy_extended_profile': {'birthdate': {'day': 15, 'month': 7, 'year': 1990, 'visibility': 'Public', 'year_visibility': 'Public'}}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 900


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo0MTMxNTc4MDU3', 'rest_id': '4131578057', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjoyOTkwMjYwOTM5', 'rest_id': '2990260939', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
Done with 1000
{'id': 'VXNlcjoxMjU0MTAzOTkzMTk1OTEzMjE2', 'rest_id': '1254103993195913216', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo5MDQyNzYzOTc5OTYyMDQwMzQ=', 'rest_id': '904276397996204034', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
Done with 1100
{'id': 'VXNlcjo3MjU2NjUyNDgyODAyMzYwMzI=', 'rest_id': '725665248280236032', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjozNzIzNTUxNjY=', 'rest_id': '372355166', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


Done with 1200
{'id': 'VXNlcjoxNTcxNjAzOQ==', 'rest_id': '15716039', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjoyNTA3Mzg3Nw==', 'rest_id': '25073877', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
['SenKamalaHarris', 'SenBennetCO', 'SenateMajLdr', 'EmWatson', 'aamir_khan', 'kelly_clarkson', 'ollyofficial', 'officialjaden', 'DrBassemYoussef', 'Ahlam_Alshamsi', 'msleamichele', 'Jenna_Marbles', 'scooterbraun', 'samsmithworld', 'laliespos', 'missA_suzy', 'MPOFFICIAL', 'S_C_', 'RealTracyMorgan', 'rickyrozay', 'mark_wahlberg', 'PointlessBlog', 'Elovera22', 'carocruzosorio', 'fucktyler', 'JackJackJohnson', 'MannyPacquiaoTR', 'S1dharthM', 'superstarrajini', 'msnbc_breaking', 'cnnlive']


In [2]:
# Load the previously saved user profiles, keeping one row per user id.
user_details = (
    pd.read_csv(f'{users_downloaded_at}-handles-data.csv')
    .drop_duplicates(subset='id')
)

In [3]:
# Download each known user's tweets from the configured date window with
# twint, collect the per-user DataFrames, and write the combined table to a
# CSV named after the three configuration dates.
results = []            # whatever twint left in storage.panda.Tweets_df after each run
not_worked = []         # usernames that hit a RefreshTokenException on the first attempt
unfound_accounts = []   # usernames whose download raised 'Cannot find twitter account'
for i, username in enumerate(user_details.username.dropna().unique()):
    # Lightweight progress indicator every 100 users.
    if i % 100 == 0:
        print(f'Done with {i}')

    c = twint.Config()
    c.Username = username
    c.Since = download_tweets_from
    c.Until = download_tweets_to
    c.Pandas = True
    c.Hide_output = True

    try:
        twint.run.Profile(c)

        results.append(twint.storage.panda.Tweets_df)
    except ValueError as e:
        # Only the "account not found" case is expected; anything else is a
        # genuine error and must surface.
        if 'Cannot find twitter account' in str(e):
            unfound_accounts.append(username)
        else:
            raise
    except twint.token.RefreshTokenException:
        # Record the username, wait a minute, then retry exactly once.
        not_worked.append(username)
        time.sleep(60)

        try:
            twint.run.Profile(c)
            results.append(twint.storage.panda.Tweets_df)
        except Exception as retry_error:
            # Best effort: skip this user, but say which one failed and why.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit stop the loop.
            print('Getting refresh errors!', username, repr(retry_error))


# Stack the per-user frames into one table and persist it.
results = pd.concat(results)
results.to_csv(f'{users_downloaded_at}-{download_tweets_from}-{download_tweets_to}-tweets-data.csv', index=False)

Done with 0
[!] No more data! Scraping will stop now.
found 3 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 14 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 21 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 409 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 45 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 58 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 76 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 4 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 3 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 3 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 6 deleted tweets in this search.
[!] No mo

CRITICAL:root:twint.get:User:'user'


[!] No more data! Scraping will stop now.
found 28 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 42 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 1 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 1 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 18 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 21 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 30 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 31 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 4 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 39 deleted tweets in this search.
[!] No more data! Scraping will stop now.
found 1 deleted tweets in this search.
[!] No more data! Scr

## Combining the data
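We will join the two data frames: each tweet is matched to the details of the user who posted it, using the tweet's `user_id` and the user's `id`.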

In [2]:
# Reload the saved user profiles for the join, keeping one row per user id.
user_details = (
    pd.read_csv(f'{users_downloaded_at}-handles-data.csv')
    .drop_duplicates(subset='id')
)

In [3]:
# Load the downloaded tweets. The path is built from the same configuration
# values used when the tweets CSV was written, so changing the dates in the
# configuration cell cannot leave this cell pointing at a stale file.
tweet_data = pd.read_csv(
    f'{users_downloaded_at}-{download_tweets_from}-{download_tweets_to}-tweets-data.csv'
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first five user profiles.
user_details.head(n=5)

Unnamed: 0,id,name,username,bio,url,join_datetime,join_date,join_time,tweets,location,following,followers,likes,media,private,verified,avatar,background_image
0,409486555,Michelle Obama,MichelleObama,Girl from the South Side and former First Lady...,https://t.co/HPYP9ad13Y,2011-11-10 20:13:01 UTC,2011-11-10,20:13:01 UTC,1770,"Washington, DC",16,20854298,184,461,False,True,https://pbs.twimg.com/profile_images/136674780...,https://pbs.twimg.com/profile_banners/40948655...
1,19397785,Oprah Winfrey,Oprah,,https://t.co/IKOnfAha8E,2009-01-23 15:18:34 UTC,2009-01-23,15:18:34 UTC,13388,,323,43197469,224,951,False,True,https://pbs.twimg.com/profile_images/112335936...,https://pbs.twimg.com/profile_banners/19397785...
2,21447363,KATY PERRY,katyperry,Love. Light.,https://t.co/Rrwt0Kj2Q7,2009-02-20 23:45:56 UTC,2009-02-20,23:45:56 UTC,11420,,235,108819032,7995,2170,False,True,https://pbs.twimg.com/profile_images/139246535...,https://pbs.twimg.com/profile_banners/21447363...
3,14230524,Lady Gaga,ladygaga,“Chromatica” ⚔️💓 OUT NOW https://t.co/dgVb2x2V...,https://t.co/r5yRJxteLd,2008-03-26 22:37:48 UTC,2008-03-26,22:37:48 UTC,9519,,119314,83675119,2310,1795,False,True,https://pbs.twimg.com/profile_images/142258922...,https://pbs.twimg.com/profile_banners/14230524...
4,16409683,Britney Spears,britneyspears,,https://t.co/v9ZAAXrNtg,2008-09-22 20:47:35 UTC,2008-09-22,20:47:35 UTC,5716,"Los Angeles, CA",367095,55569235,2539,1362,False,True,https://pbs.twimg.com/profile_images/132341880...,https://pbs.twimg.com/profile_banners/16409683...


In [5]:
# Sanity check: the row count should equal the number of distinct user ids.
(user_details.shape, user_details['id'].nunique())

((1226, 18), 1226)

In [6]:
# Preview the first five downloaded tweets.
tweet_data.head(n=5)

Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,language,hashtags,cashtags,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1425590913959612419,1425590913959612419,1628722000000.0,2021-08-12 00:52:14,200,,RT @girlsalliance: We're so proud of the four ...,en,[],[],...,,,1.047165e+18,We're so proud of the four organizations in ou...,1.42559e+18,[],2021-08-12 00:46:58 CEST,,,
1,1427736867739299841,1427736867739299841,1629234000000.0,2021-08-17 22:59:29,200,,Some casual suggestions to 😏SLIDE😏 into when u...,en,"['shoesdaytuesday', 'afterskewlslide']",[],...,,,,,,[],,,,
2,1427667300488937476,1427667300488937476,1629217000000.0,2021-08-17 18:23:03,200,,RT @ValaAfshar: You are not your job.,en,[],[],...,,,259725200.0,You are not your job.,1.427648e+18,[],2021-08-17 17:05:53 CEST,,,
3,1427667012105371652,1427667012105371652,1629217000000.0,2021-08-17 18:21:55,200,,What have we become 😔😂 Toddler Cites Freedom ...,en,[],[],...,,,,,,[],,,,
4,1427497703596990467,1427497703596990467,1629177000000.0,2021-08-17 07:09:08,200,,The tech giants that refuse to massively addre...,en,[],[],...,,,,,,[],,,,


In [7]:
# Sanity check: the row count should equal the number of distinct tweet ids.
(tweet_data.shape, tweet_data['id'].nunique())

((40581, 38), 40581)

In [8]:
# Attach the author's profile columns to every tweet (left join, so tweets
# without a matching profile are kept). Profile columns whose names clash with
# tweet columns get the '_user' suffix.
# validate='many_to_one' makes pandas raise a MergeError if a user id appears
# more than once in `user_details`, instead of silently duplicating tweet rows.
combined_data = tweet_data.merge(
    user_details,
    left_on='user_id',
    right_on='id',
    how='left',
    suffixes=['', '_user'],
    validate='many_to_one',
)

In [9]:
# Sanity check: the join must not add rows, so the row count should still
# equal the number of distinct tweet ids.
(combined_data.shape, combined_data['id'].nunique())

((40581, 56), 40581)

In [10]:
# Persist the joined tweet + user table, without the DataFrame index.
combined_data.to_csv(
    path_or_buf='tweet_and_user_data.csv',
    index=False,
)