# Downloading data
In this notebook we download the Twitter data we want to use in our prediction tasks. We use the `twint` library to get around the typical Twitter rate limits, but we won't be abusing anything.

The process goes:

1. Pick some keywords that we want Tweets about
2. The script searches for Tweets on those topics from a certain time period
3. From these Tweets, the script then grabs information on the accounts that posted them

**TODO:** Maybe we should instead find tweets that have received a certain amount of attention, grab the users who posted them, and then scrape those users' tweets over the following X days. That way we would have less leakage.

In [1]:
from datetime import datetime
import time

import nest_asyncio
import pandas as pd
import twint
from twint.token import RefreshTokenException

# Allow twint's asyncio event loop to run inside the notebook's already-running loop.
nest_asyncio.apply()


# Date prefix of the '<date>-handles-data.csv' file that the tweet-download
# step loads its user details from.
users_downloaded_at = '2021-08-11'
# Date window handed to twint as Since / Until when downloading tweets.
download_tweets_from = '2021-08-12'
download_tweets_to = '2021-08-19'

In [2]:
# Read the account handles into a single 'username' column. Passing header=0
# together with explicit names replaces the file's first row (treated as a
# header) with the given column name.
usernames = pd.read_csv(
    'handles.txt',
    names=['username'],
    header=0,
)

In [3]:
def _lookup_user(username):
    """Run a twint user lookup for ``username`` and return the user DataFrame.

    A fresh ``twint.Config`` is built on every call, so a retry never reuses a
    config object left over from a failed attempt. Any exception raised by
    twint propagates to the caller.
    """
    config = twint.Config()
    config.Username = username
    config.Pandas = True
    config.Hide_output = True

    twint.run.Lookup(config)
    return twint.storage.panda.User_df


results = []
unfound_accounts = []  # handles for which twint reported "Cannot find twitter account"
not_worked = []        # handles that hit a RefreshTokenException on the first attempt
for i, username in enumerate(usernames.username.unique()):
    if i % 100 == 0:
        print(f'Done with {i}')

    try:
        results.append(_lookup_user(username))
    except ValueError as e:
        # Only the "account not found" ValueError is expected; anything else is a real error.
        if 'Cannot find twitter account' in str(e):
            unfound_accounts.append(username)
        else:
            raise
    except RefreshTokenException:
        not_worked.append(username)
        # Wait a minute, then retry this handle once.
        time.sleep(60)

        try:
            results.append(_lookup_user(username))
        except Exception as retry_error:
            # Best-effort retry: report which handle failed and why, then move on.
            # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
            print(f'Getting refresh errors! ({username}: {retry_error!r})')

if results:
    results = pd.concat(results)
    # The file is stamped with today's date; `users_downloaded_at` has to be set
    # to this date for the later cell to load it.
    results.to_csv(f'{datetime.today().strftime("%Y-%m-%d")}-handles-data.csv', index=False)
else:
    # pd.concat raises ValueError on an empty list, so skip saving instead.
    results = pd.DataFrame()
    print('No user data was downloaded; nothing saved.')

print(unfound_accounts)
print(not_worked)

Done with 0


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 100


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 200


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo0MTUwMDM5Mjg2', 'rest_id': '4150039286', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
Done with 300


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo3MDcwNzc0NDA=', 'rest_id': '707077440', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjoxMzY4ODM1ODg4ODkzOTg4ODY0', 'rest_id': '1368835888893988864', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjoxMzY4ODM1MDE3ODE2Njk0Nzg2', 'rest_id': '1368835017816694786', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


Done with 400


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjoxMjk0MDU4OTAxNjg4NzIxNDEz', 'rest_id': '1294058901688721413', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


Done with 500


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 600


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo2MTAwMzgwNA==', 'rest_id': '61003804', 'legacy_extended_profile': {'birthdate': {'day': 14, 'month': 7, 'visibility': 'Public', 'year_visibility': 'Self'}}, 'is_profile_translatable': False}
Done with 700
{'id': 'VXNlcjoxNzMwMTQ0MDI0', 'rest_id': '1730144024', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjo5NjA2MDQ3MTM2NjEyMjI5MTM=', 'rest_id': '960604713661222913', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 800
{'id': 'VXNlcjoyMzY3Mzg2MDA=', 'rest_id': '236738600', 'legacy_extended_profile': {'birthdate': {'day': 15, 'month': 7, 'year': 1990, 'visibility': 'Public', 'year_visibility': 'Public'}}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


Done with 900


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo0MTMxNTc4MDU3', 'rest_id': '4131578057', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjoyOTkwMjYwOTM5', 'rest_id': '2990260939', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
Done with 1000
{'id': 'VXNlcjoxMjU0MTAzOTkzMTk1OTEzMjE2', 'rest_id': '1254103993195913216', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'
CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjo5MDQyNzYzOTc5OTYyMDQwMzQ=', 'rest_id': '904276397996204034', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
Done with 1100
{'id': 'VXNlcjo3MjU2NjUyNDgyODAyMzYwMzI=', 'rest_id': '725665248280236032', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
{'id': 'VXNlcjozNzIzNTUxNjY=', 'rest_id': '372355166', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


Done with 1200
{'id': 'VXNlcjoxNTcxNjAzOQ==', 'rest_id': '15716039', 'legacy_extended_profile': {}, 'is_profile_translatable': False}


CRITICAL:root:twint.get:User:'user'


{'id': 'VXNlcjoyNTA3Mzg3Nw==', 'rest_id': '25073877', 'legacy_extended_profile': {}, 'is_profile_translatable': False}
['SenKamalaHarris', 'SenBennetCO', 'SenateMajLdr', 'EmWatson', 'aamir_khan', 'kelly_clarkson', 'ollyofficial', 'officialjaden', 'DrBassemYoussef', 'Ahlam_Alshamsi', 'msleamichele', 'Jenna_Marbles', 'scooterbraun', 'samsmithworld', 'laliespos', 'missA_suzy', 'MPOFFICIAL', 'S_C_', 'RealTracyMorgan', 'rickyrozay', 'mark_wahlberg', 'PointlessBlog', 'Elovera22', 'carocruzosorio', 'fucktyler', 'JackJackJohnson', 'MannyPacquiaoTR', 'S1dharthM', 'superstarrajini', 'msnbc_breaking', 'cnnlive']


In [None]:
# Load the user details CSV whose filename carries the `users_downloaded_at` date prefix.
handles_data_path = f'{users_downloaded_at}-handles-data.csv'
user_details = pd.read_csv(handles_data_path)

In [None]:
results = []
failed_downloads = []  # (username, error repr) for profiles whose download raised
for i, username in enumerate(user_details.username.unique()):
    if i % 100 == 0:
        print(f'Done with {i}')

    config = twint.Config()
    config.Username = username
    # Restrict the download to the configured date window.
    config.Since = download_tweets_from
    config.Until = download_tweets_to
    config.Pandas = True
    config.Hide_output = True

    try:
        twint.run.Profile(config)
    except Exception as error:
        # Record the failure and continue, so one bad profile does not discard
        # every tweet collected so far. Failures are printed after the loop.
        failed_downloads.append((username, repr(error)))
        continue

    user_tweets = twint.storage.panda.Tweets_df
    # pd.concat silently drops None entries anyway; skipping them here keeps the
    # emptiness check below meaningful.
    if user_tweets is not None:
        results.append(user_tweets)

if results:
    results = pd.concat(results)
    results.to_csv(f'{users_downloaded_at}-{download_tweets_from}-{download_tweets_to}-tweets-data.csv', index=False)
else:
    # pd.concat raises ValueError on an empty list, so skip saving instead.
    results = pd.DataFrame()
    print('No tweets were downloaded; nothing saved.')

print(failed_downloads)