# Dataset Share Orangetheory  & Peloton

This notebook includes the code that scrapes Twitter data from two fitness companies.

It first scrapes both accounts for follower information and then, scrapes for tweets using hashtags.

### Imports

In [1]:
import datetime
import tweepy
import pandas as pd
import numpy as np
from collections import Counter
from pprint import pprint

### Authenticate

In [None]:
# API keys in a .py file called API_keys.py
from API_keys import api_key, api_key_secret, access_token, access_token_secret

In [None]:
# Authenticate the Tweepy API
auth = tweepy.OAuthHandler(api_key,api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True)

## Grab Follower Information

In [None]:
# Make a list of handles
handles = ['orangetheory','onepeloton']


# This will iterate through each Twitter handle that we're collecting from
for screen_name in handles:
    
    # Tells Tweepy we want information on the handle we're collecting from
    # The next line specifies which information we want, which in this case is the number of followers 
    user = api.get_user(screen_name=screen_name) 
    followers_count = user.followers_count

    # Let's see roughly how long it will take to grab all the follower IDs. 
    print(f'''
    @{screen_name} has {followers_count} followers. 
    That will take roughly {followers_count/(5000*60):.0f} hours and {followers_count/(5000):.2f} minutes
    ''')
    

In [None]:
# This creates a dictionary containing a list for each Twitter handle we'll be grabbing follower IDs from
id_dict = {'orangetheory' : [],
           'onepeloton' : []}

# Grabs the time when we start making requests to the API
start_time = datetime.datetime.now()

# .keys() allows us to iterate through each key in the dictionary
for handle in id_dict.keys():
    
    # Each page contains 5,000 records, so since we know there are much more than 5,000 followers for both
    # we must iterate through each of the pages in order to get all follower IDs
    # To grab the follower IDs, we will be using followers_ids
    for page in tweepy.Cursor(api.get_follower_ids,
                              # This is how we will get around the issue of not being able to grab all ids at once
                              # Once the rate limit is hit, we will be notified that we must wait 15 mins (900 secs)
                              wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True,
                              screen_name=handle).pages():

        # The page variable comes back as a list, so we have to use .extend rather than .append
        id_dict[handle].extend(page)
        

# Let's see how long it took to grab all follower IDs
end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
print(elapsed_time)

In [None]:
# Check out user IDs
id_dict['orangetheory'][:10]

In [None]:
# Check out user IDs
id_dict['onepeloton'][:10]

@orangetheory

In [None]:
#ID's are numbers and are are different from handles. To see the twitter handles we gathered, we'll have to use the screen_name feature.

users = id_dict['orangetheory'][:10]
for name in users:
    
    user = api.get_user(user_id=name)
    print(user.screen_name)
    

@onepeloton

In [None]:
users = id_dict['onepeloton'][:10]

for name in users:
    
    user = api.get_user(user_id=name)
    print(user.screen_name)
    

#### Grab descriptions based on the followers IDs
The code inside this function comes from Brenden Connors and has been updated to refelct API changes 10/2021


In [None]:
# This will quickly grab information about each follower.
def get_screen_names(list_of_ids, list_for_screen_names):
    start=0 #we have feed the API 100 ID's at a time, this will iterate through them
    end=0
    followers=[]
    while end-1 <= len(list_of_ids): #quit when we get past the end of our list
        end += 100 #update the end of our slice
        if end <= len(list_of_ids): #split into if else statement to slice correctly
            try:
                followers_temp = api.lookup_users(user_id = list_of_ids[start:end])    # Added in user_id
            except TweepError as err: 
                if err.code == 103: #if we get a rate limit error, go to sleep
                    print('sleeping, 900 seconds')
                    time.sleep(900)
        else:
            try:
                followers_temp = api.lookup_users(user_id = list_of_ids[start:])     #Added in user_id  object
            except tweepy.TweepError as err:
                if err.code == 103:
                    print('sleeping, 900 seconds')
                    time.sleep(900)
        followers.extend(followers_temp)
        start = end #update our starting slice index for next loop
    list_for_screen_names.extend(followers)

In [None]:
# Let's put the function to use and make a new dictionary holding all user information
user_dict = {'orangetheory': [],
                    'onepeloton' : []}

for handle in user_dict.keys():
    
    
    get_screen_names(id_dict[handle],user_dict[handle])

The function grabs all objects tied to a public Twitter account. If we take a look at the first follower, it'll look ugly. It is helpful to look through the output and see what objects you want, however. Let's just grab the screen_name and description for now, and write it to a .txt file. Since we have all the data stored in a dictionary, there won't be a wait time.

In [None]:
#user_dict['orangetheory'][0]

In [None]:
#user_dict['onepeloton'][0]

In [None]:
headers = ['screen_name','description']

for team in user_dict.keys():

    # Descriptions with emoji or non-Roman letters can cause trouble. Encoding your .txt file in utf-8 will help
    with open(f'{team}_followers.txt','w', encoding='utf-8') as out_file:
        out_file.write('\t'.join(headers) + '\n')

        for idx, user in enumerate(user_dict[team]):
            
            # For accounts set to private, we won't be able to get the description unless we follow them
            # Putting in a try/except statement, we can get around this issue.
            description = str(user.description).replace('\t',' ').replace('\n',' ')
            outline = [user.screen_name, description]

            out_file.write('\t'.join([str(item) for item in outline]) + '\n')
              
            #if idx == 100:
               # break

## Grabbing Tweets with #Orangetheory
Tweets were pulled 11/6/21

#### orangetheory hashtag

In [None]:
# Note: the search API only goes back 7 days
date_start = datetime.date.today()
date_end = date_start - datetime.timedelta(days=2)

search_words = f'#orangetheory since:{date_end} until:{date_start} -filter:retweets'

# Notice the differences between searching tweets and users. 
for idx, item in enumerate(tweepy.Cursor(api.search_tweets, 
                                         q = search_words,
                                         since= date_end,
                                         tweet_mode='extended',# tweet_mode is defaulted to short, which only holds the first 140 characters of a Tweet.
                                         lang='en').items()):
    
    # There's all sort of information you can get from Tweets
    # Find more tweet objects here: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/overview/tweet-object
    print(item.user.screen_name)
    print(item.created_at)
    print(item.full_text)
    print('-'*40)
    
    #if idx == 1000:
       # break
    

In [None]:
search_words = '#orangetheory -filter:retweets'


tweets_all = tweepy.Cursor(api.search_tweets,
                   tweet_mode='extended',
                   q=search_words,
                   lang='en').items()

# Put all the Tweet objects for a single Tweet into a tuple, and put all those into a list
tweets = [(tweet.full_text,tweet.created_at,tweet.user.screen_name) for tweet in tweets_all]


In [None]:
#tweets[:2000]

In [None]:
df = pd.DataFrame(tweets) 
    
# saving the dataframe 
df.to_csv('orangetheory_hashtag.csv')

#### OTF hashtag

In [None]:
# Note: the search API only goes back 7 days
date_start = datetime.date.today()
date_end = date_start - datetime.timedelta(days=2)

search_words = f'#otf since:{date_end} until:{date_start} -filter:retweets'

# Notice the differences between searching tweets and users. 
for idx, item in enumerate(tweepy.Cursor(api.search_tweets, 
                                         q = search_words,
                                         since= date_end,
                                         tweet_mode='extended',# tweet_mode is defaulted to short, which only holds the first 140 characters of a Tweet.
                                         lang='en').items()):
    
    # There's all sort of information you can get from Tweets
    # Find more tweet objects here: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/overview/tweet-object
    print(item.user.screen_name)
    print(item.created_at)
    print(item.full_text)
    print('-'*40)
    
    #if idx == 1000:
       # break
    

In [None]:
search_words = '#otf -filter:retweets'


tweets_all = tweepy.Cursor(api.search_tweets,
                   tweet_mode='extended',
                   q=search_words,
                   lang='en').items()

# Put all the Tweet objects for a single Tweet into a tuple, and put all those into a list
tweets = [(tweet.full_text,tweet.created_at,tweet.user.screen_name) for tweet in tweets_all]

In [None]:
df = pd.DataFrame(tweets) 
    
# saving the dataframe 
df.to_csv('OTF_hashtag.csv')

## Grabbing Tweets with #peleton
Tweets were pulled 11/6/21

In [None]:
# Note: the search API only goes back 7 days
date_start = datetime.date.today()
date_end = date_start - datetime.timedelta(days=2)

search_words = f'#peloton since:{date_end} until:{date_start} -filter:retweets'

# Notice the differences between searching tweets and users. 
for idx, item in enumerate(tweepy.Cursor(api.search_tweets, 
                                         q = search_words,
                                         since= date_end,
                                         tweet_mode='extended',# tweet_mode is defaulted to short, which only holds the first 140 characters of a Tweet.
                                         lang='en').items()):
    
    print(item.user.screen_name)
    print(item.created_at)
    print(item.full_text)
    print('-'*40)
    
    #if idx == 1000:
       # break

In [None]:
search_words = '#peloton -filter:retweets'


tweets_all = tweepy.Cursor(api.search_tweets,
                   tweet_mode='extended',
                   q=search_words,
                   lang='en').items()

# Put all the Tweet objects for a single Tweet into a tuple, and put all those into a list
tweets = [(tweet.full_text,tweet.created_at,tweet.user.screen_name) for tweet in tweets_all]

In [None]:
df = pd.DataFrame(tweets) 
    
# saving the dataframe 
df.to_csv('peloton_hashtag.csv')

# Follower & Hashtag Information

### Orangetheory Followers

In [3]:
# Make df of OTF followers
columns = ["screen_name", "description"]
otf_followers = pd.read_csv('orangetheory_followers.txt', names=columns,sep='\t', lineterminator='\n')
otf_followers = otf_followers.iloc[1: , :]

In [4]:
# check df shape/size
print(otf_followers.shape)

(58260, 2)


In [6]:
#check df head
otf_followers.head()

Unnamed: 0,screen_name,description
1,hingtox,tattoo collector\r
2,alejandrthkmil1,Hades\r
3,beverlgiyodet,I'm pretty boring and not really social lol I ...
4,omcojrex,Soundcloud rapper\r
5,AarinRmorroy,Follow my Twitter @AynniaG\r


### Orangetheory Hashtags
Since text pulled from #orangetheory is small, I also included #otf

#oragngetheory dataframe & stats

In [9]:
columns = ["tweet", "datetime", "screenname"]
orangetheory_hash = pd.read_csv('orangetheory_hashtag.csv', names=columns, sep=',')
orangetheory_hash = orangetheory_hash.iloc[1: , :]

In [11]:
# check df size / shape
print(orangetheory_hash.shape)


(21, 3)


In [12]:
#check df head
orangetheory_hash.head()

Unnamed: 0,tweet,datetime,screenname
0.0,Orangetheory takes over the Komets game! #oran...,2021-11-06 01:08:16+00:00,KAILEYMSHERMAN
1.0,We are excited to announce the promotion of Fr...,2021-11-05 14:09:09+00:00,KianCapital
2.0,The best Monday of 2021 is right around the \r...,2021-11-05 13:17:26+00:00,OTFKempsCorner
3.0,One month (only 9 workouts) with @orangetheory...,2021-11-05 02:12:11+00:00,taterbennett
4.0,When you can literally WORK towards your fitne...,2021-11-04 18:21:33+00:00,CB_Radio82


#otf dataframe & stats

In [15]:
columns = ["tweet", "datetime", "screenname"]
otf_hash = pd.read_csv('otf_hashtag.csv', names=columns, sep=',')
otf_hash = otf_hash.iloc[1: , :]

In [16]:
# check size / shape
print(otf_hash.shape)

(230, 3)


In [17]:
#check df head
orangetheory_hash.head()

Unnamed: 0,tweet,datetime,screenname
0.0,Orangetheory takes over the Komets game! #oran...,2021-11-06 01:08:16+00:00,KAILEYMSHERMAN
1.0,We are excited to announce the promotion of Fr...,2021-11-05 14:09:09+00:00,KianCapital
2.0,The best Monday of 2021 is right around the \r...,2021-11-05 13:17:26+00:00,OTFKempsCorner
3.0,One month (only 9 workouts) with @orangetheory...,2021-11-05 02:12:11+00:00,taterbennett
4.0,When you can literally WORK towards your fitne...,2021-11-04 18:21:33+00:00,CB_Radio82


## @onepeloton Followers

In [18]:
# Make df of Peloton followers
columns = ["screen_name", "description"]
peloton_followers = pd.read_csv('onepeloton_followers.txt', names=columns, lineterminator='\n')
peloton_followers = peloton_followers.iloc[1: , :]

In [19]:
# check df shape/size
print(peloton_followers.shape)

(167340, 2)


In [20]:
#check df head
peloton_followers.head()

Unnamed: 0,screen_name,description
1,JoseRod45214209\tMucha diversión\r,
2,krista_walter\tEarly-stage CFO/IR Consultant,Analyst; Mentor @UNSWFounders & Energylab; As...
3,kdurkin11\t\r,
4,ChazeAbaq\tmuniman\r,
5,Drez2018\tRetired ; Loving Life With Goms !!\r,


## #onepeloton 

In [26]:
# Make df of peloton hashtags
columns = ["tweet", "datetime", "screenname"]
peloton_hash = pd.read_csv('peloton_hashtag.csv', names=columns, sep=',')
peloton_hash = peloton_hash.iloc[1: , :]

In [27]:
# check df shape/size
print(peloton_hash.shape)

(756, 3)


In [28]:
# Check df
peloton_hash.head()

Unnamed: 0,tweet,datetime,screenname
0.0,"#Peloton stock plunged up to 34% yesterday, it...",2021-11-06 15:32:00+00:00,TCollege
1.0,Wasn’t into working out at all. @tune2tunde ha...,2021-11-06 15:29:50+00:00,StrengthInTime
2.0,I’ve been slacking this week. Haven’t worked o...,2021-11-06 15:14:52+00:00,the17thman
3.0,New #Peloton item in the Apparel Store: Peloto...,2021-11-06 15:03:17+00:00,PelotonAlerts
4.0,Even on baecation we got to put in the work. #...,2021-11-06 15:03:01+00:00,idelle4life


### stats

In [36]:
orange_followers = open("orangetheory_followers.txt",'r',  encoding="UTF-8").read()
pelo_followers = open("onepeloton_followers.txt",'r',  encoding="UTF-8").read()

orange_hash = open("orangetheory_hashtag.csv",'r',  encoding="UTF-8").read()
orange_hash_otf = open("otf_hashtag.csv",'r',  encoding="UTF-8").read()
pelo_hash = open("peloton_hashtag.csv",'r',  encoding="UTF-8").read()

orangetheory followers stats

In [50]:
print(f"Orangetheory followers is {len(orange_followers)} tokens long.")
print(f"Orangetheory followers has {len(set(orange_followers))} unique tokens.")
print(f"Orangetheory followers lexical diversity is {len(set(orange_followers))/len(orange_followers):.3f}.")
# Build a vector of token length
orange_followers_len = [len(w) for w in orange_followers]
print(f"Orangetheory followers average token length is {np.mean(orange_followers_len):.2f}.")
pprint(sorted(Counter(orange_followers_len).items()))


print("")
print("All statistics are calculated BEFORE normalization and tokenization.")

Orangetheory followers is 3817792 tokens long.
Orangetheory followers has 3575 unique tokens.
Orangetheory followers lexical diversity is 0.001.
Orangetheory followers average token length is 1.00.
[(1, 3817792)]

All statistics are calculated BEFORE normalization and tokenization.


onepeloton stats

In [44]:
print(f"Onepeloton followers is {len(pelo_followers)} tokens long.")
print(f"Onepeloton followers has {len(set(pelo_followers))} unique tokens.")
print(f"Onepeloton followers lexical diversity is {len(set(pelo_followers))/len(pelo_followers):.3f}.")
# Build a vector of token length
pelo_followers_len = [len(w) for w in pelo_followers]
print(f"Onepeloton followers average token length is {np.mean(pelo_followers_len):.2f}.")
pprint(sorted(Counter(pelo_followers_len).items()))

print("")
print("All statistics are calculated BEFORE normalization and tokenization.")

Onepeloton followers is 10490485 tokens long.
Onepeloton followers has 4561 unique tokens.
Onepeloton followers lexical diversity is 0.000.
Onepeloton followers average token length is 1.00.
[(1, 10490485)]

All statistics are calculated BEFORE normalization and tokenization.


#orangethoery stats

In [46]:
print(f"#orangetheory is {len(orange_hash)} tokens long.")
print(f"#orangetheory has {len(set(orange_hash))} unique tokens.")
print(f"#orangetheory lexical diversity is {len(set(orange_hash))/len(orange_hash):.3f}.")
# Build a vector of token length
orange_hash_len = [len(w) for w in orange_hash]
print(f"#orangetheory average token length is {np.mean(orange_hash_len):.2f}.")
pprint(sorted(Counter(orange_hash_len).items()))

print("")
print("All statistics are calculated BEFORE normalization and tokenization.")

#orangetheory is 4828 tokens long.
#orangetheory has 100 unique tokens.
#orangetheory lexical diversity is 0.021.
#orangetheory average token length is 1.00.
[(1, 4828)]

All statistics are calculated BEFORE normalization and tokenization.


#otf stats

In [48]:
print(f"#otf is {len(orange_hash_otf)} tokens long.")
print(f"#otf has {len(set(orange_hash_otf))} unique tokens.")
print(f"#otf lexical diversity is {len(set(orange_hash_otf))/len(orange_hash_otf):.3f}.")
# Build a vector of token length
orange_hash_otf_len = [len(w) for w in orange_hash_otf]
print(f"#otf average token length is {np.mean(orange_hash_otf_len):.2f}.")
pprint(sorted(Counter(orange_hash_otf_len).items()))

print("")
print("All statistics are calculated BEFORE normalization and tokenization.")

#otf is 47994 tokens long.
#otf has 176 unique tokens.
#otf lexical diversity is 0.004.
#otf average token length is 1.00.
[(1, 47994)]

All statistics are calculated BEFORE normalization and tokenization.


#peloton 

In [49]:
print(f"#peloton is {len(pelo_hash)} tokens long.")
print(f"#peloton has {len(set(pelo_hash))} unique tokens.")
print(f"#peloton lexical diversity is {len(set(pelo_hash))/len(pelo_hash):.3f}.")
# Build a vector of token length
pelo_hash_len = [len(w) for w in pelo_hash]
print(f"#peloton average token length is {np.mean(pelo_hash_len):.2f}.")
pprint(sorted(Counter(pelo_hash_len).items()))

print("")
print("All statistics are calculated BEFORE normalization and tokenization.")

#peloton is 157475 tokens long.
#peloton has 226 unique tokens.
#peloton lexical diversity is 0.001.
#peloton average token length is 1.00.
[(1, 157475)]

All statistics are calculated BEFORE normalization and tokenization.
