## Setup

In [105]:
%pip install tweepy requests

Defaulting to user installation because normal site-packages is not writeable


In [106]:
import os
import time

import pandas as pd
import requests

In [221]:
# Read the Twitter API bearer token from the TWITTER_BEARER_TOKEN environment
# variable instead of embedding it in the notebook: a token saved in a cell is
# readable by anyone who can open the file or its version history.
# NOTE: the token that used to be hardcoded here should be treated as leaked
# and revoked.
BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not BEARER_TOKEN:
    # Surface the problem here rather than as an opaque API error later on.
    print('Warning: TWITTER_BEARER_TOKEN is not set; Twitter API requests will not be authorized.')

In [108]:
# Load the athlete accounts table that the rest of the notebook works on.
accounts_path = '../data/usernames/accounts_final.csv'
athletes = pd.read_csv(accounts_path)

## Get follower and tweet counts

In [109]:
def get_user_metrics_batch(usernames, bearer_token):
    """Fetch follower and tweet counts for a batch of usernames.

    Sends a single GET request to the Twitter API v2 ``/2/users/by`` endpoint,
    asking for each user's ``public_metrics``.

    Parameters
    ----------
    usernames : iterable
        Candidate usernames. Entries that are not ``str`` (for example NaN
        values coming from a pandas column) are skipped.
    bearer_token : str
        Token sent in the ``Authorization: Bearer ...`` header.

    Returns
    -------
    dict
        ``{username: {'followers_count': ..., 'tweet_count': ...}}`` keyed by
        the ``username`` field of each user in the response. The dict is empty
        when no valid usernames were given, when the response status is not
        200 (the status and body are printed), or when the response carries no
        ``data`` member.

    Raises
    ------
    Whatever ``requests.get`` raises for transport problems, including a
    timeout after 30 seconds.
    """
    # Filter out any non-string usernames
    usernames = [username for username in usernames if isinstance(username, str)]

    if not usernames:  # Nothing left to look up after filtering
        return {}

    response = requests.get(
        "https://api.twitter.com/2/users/by",
        headers={"Authorization": f"Bearer {bearer_token}"},
        # Let requests build and escape the query string.
        params={
            "usernames": ",".join(usernames),
            "user.fields": "public_metrics",
        },
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=30,
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return {}

    # A 200 response may have no 'data' member (e.g. when none of the requested
    # usernames could be resolved); treat that as "no users" instead of raising
    # KeyError.
    users = response.json().get('data') or []
    return {
        user['username']: {
            'followers_count': user['public_metrics']['followers_count'],
            'tweet_count': user['public_metrics']['tweet_count'],
        }
        for user in users
    }

In [110]:
# Cut the username column into consecutive chunks of at most `batch_size`
# entries; each chunk is sent as one lookup request.
batch_size = 100
batches = [
    athletes['username'].iloc[start:start + batch_size]
    for start in range(0, len(athletes), batch_size)
]

In [111]:
# Look up every batch in turn and merge the per-batch results into one dict
# keyed by username.
all_metrics = {}

for username_batch in batches:
    all_metrics.update(get_user_metrics_batch(username_batch, BEARER_TOKEN))
    # Pause for one second between requests to respect rate limits.
    time.sleep(1)

In [137]:
# Attach the fetched metrics to the athletes table.
# Twitter handles are case-insensitive and the API reports each handle in its
# own casing, so match on lower-cased handles; an exact-case lookup would leave
# the metrics empty for any handle whose casing differs in the CSV.
metrics_by_handle = {handle.lower(): counts for handle, counts in all_metrics.items()}
# Missing or non-string usernames become NaN here and simply find no match below.
handles = athletes['username'].str.lower()

athletes['followers_count'] = handles.map(lambda h: metrics_by_handle.get(h, {}).get('followers_count'))
athletes['tweet_count'] = handles.map(lambda h: metrics_by_handle.get(h, {}).get('tweet_count'))
athletes

Unnamed: 0,name,username,sport,followers_count,tweet_count
0,Cristiano Ronaldo,Cristiano,soccer,112479386,4126
1,LeBron James,KingJames,basketball,52978276,9343
2,Andrés Iniesta,andresiniesta8,soccer,25156086,3785
3,Ronaldinho,10Ronaldinho,soccer,21525346,6977
4,Karim Benzema,Benzema,soccer,21175235,3997
...,...,...,...,...,...
6563,Will Rackley,WillRackley,football,3,2
6564,Demaryius Thomas,DemaryiusT,football,2,1
6565,Christian Arroyo,arroyo_c,baseball,2,17
6566,Jack Williams,G5Jack,football,1,2


In [163]:
# Overwrite the accounts file with the table that now includes the metrics.
# The row index is written as an unnamed first column (index=True is the
# default), which is why the reload cell drops 'Unnamed: 0' afterwards.
athletes.to_csv('../data/usernames/accounts_final.csv', index=True)

In [164]:
# Reload the saved table and discard the 'Unnamed: 0' column, which holds the
# row index that was written out by the save step.
athletes = (
    pd.read_csv('../data/usernames/accounts_final.csv')
    .drop(columns=['Unnamed: 0'])
)
athletes.head()

Unnamed: 0,name,username,sport,followers_count,tweet_count
0,Cristiano Ronaldo,Cristiano,soccer,112479386,4126
1,LeBron James,KingJames,basketball,52978276,9343
2,Andrés Iniesta,andresiniesta8,soccer,25156086,3785
3,Ronaldinho,10Ronaldinho,soccer,21525346,6977
4,Karim Benzema,Benzema,soccer,21175235,3997


## Handle missing values and duplicates

In [165]:
# Discard every row that has a missing value in any column, then show the
# per-column count of remaining missing values as a check.
athletes = athletes.dropna()
athletes.isna().sum()

name               0
username           0
sport              0
followers_count    0
tweet_count        0
dtype: int64

In [166]:
# Keep only accounts that have at least one follower and at least one tweet,
# ordered from most to fewest followers.
has_followers = athletes['followers_count'].ne(0)
has_tweets = athletes['tweet_count'].ne(0)

athletes = athletes.loc[has_followers & has_tweets].sort_values(
    by='followers_count', ascending=False
)
athletes

Unnamed: 0,name,username,sport,followers_count,tweet_count
0,Cristiano Ronaldo,Cristiano,soccer,112479386,4126
1,LeBron James,KingJames,basketball,52978276,9343
2,Andrés Iniesta,andresiniesta8,soccer,25156086,3785
3,Ronaldinho,10Ronaldinho,soccer,21525346,6977
4,Karim Benzema,Benzema,soccer,21175235,3997
...,...,...,...,...,...
6550,Will Rackley,WillRackley,football,3,2
6551,Demaryius Thomas,DemaryiusT,football,2,1
6552,Christian Arroyo,arroyo_c,baseball,2,17
6553,Jack Williams,G5Jack,football,1,2


In [167]:
def view_duplicate_name(name):
    """Print every row of the global `athletes` table whose 'name' equals `name`."""
    matching_rows = athletes.loc[athletes['name'] == name]
    print(matching_rows)

In [168]:
# Show every athlete whose (name, sport) pair occurs more than once;
# keep=False marks all members of each group, not just the repeats.
same_name_and_sport = athletes.duplicated(subset=['name', 'sport'], keep=False)
athletes.loc[same_name_and_sport]

Unnamed: 0,name,username,sport,followers_count,tweet_count
1301,Marvin Jones,MarvinJonesJr,football,64113,5965
1398,Mike James,TheNatural_05,basketball,58127,25443
1605,Jaylin Williams,Jay_MWilliams_,basketball,46978,1912
1609,D.J. Williams,dj45williams,football,46838,8572
2476,Marvin Jones,MarvinJonesJets,football,22912,22757
2478,Mike James,mikejames7,basketball,22902,9476
3423,D.J. Williams,DjWilliam55,football,11334,12088
4704,Jaylin Williams,iso__jaywill,basketball,4351,40


In [172]:
# Index labels of `athletes` rows to remove: entries that duplicate another
# athlete but carry the wrong username.
duplicates_to_drop = [
    1886, 2630, 5498, 5826,
    519, 2564, 4603, 5151,
    4499, 5168, 5288, 5493,
    6097,
]

In [170]:
# Remove the hand-picked rows by index label (`index=` already selects the
# row axis, so no separate axis argument is needed).
athletes = athletes.drop(index=duplicates_to_drop)

In [171]:
# List the (name, sport) pairs that still occur more than once after the drop.
still_duplicated = athletes.duplicated(subset=['name', 'sport'], keep=False)
athletes.loc[still_duplicated]

Unnamed: 0,name,username,sport,followers_count,tweet_count
1301,Marvin Jones,MarvinJonesJr,football,64113,5965
1398,Mike James,TheNatural_05,basketball,58127,25443
1605,Jaylin Williams,Jay_MWilliams_,basketball,46978,1912
1609,D.J. Williams,dj45williams,football,46838,8572
2476,Marvin Jones,MarvinJonesJets,football,22912,22757
2478,Mike James,mikejames7,basketball,22902,9476
3423,D.J. Williams,DjWilliam55,football,11334,12088
4704,Jaylin Williams,iso__jaywill,basketball,4351,40
