In [91]:
import pandas as pd
import numpy as np

# modules for data collection
import GetOldTweets3 as got3
import tweepy
# developer account keys -- create twitter_credentials.py
import twitter_credentials as tc

### Extract tweets on specific search phrase

We are using the GetOldTweets3 package (https://pypi.org/project/GetOldTweets3/) as it can bypass the limits imposed by Twitter for scraping data older than 7 days old. Particular useful for niche topics.

In [92]:
# use the GetOldTweets3 package to bypass Twitter API date restrictions
search_phrase = 'climate emergency'

# start date inclusive but end date is not - recommend 7 day periods at a time.
start_date = '2020-01-01'
until = '2020-01-02'
maxtweets = 10

# run tweets based on criteria
tweetCriteria = got3.manager.TweetCriteria().setQuerySearch(search_phrase)\
                                           .setSince(start_date)\
                                           .setUntil(until)\
                                           .setMaxTweets(maxtweets)

# results
df = pd.DataFrame(columns =['date','username','to','replies','retweets',
                            'favorites','text','geo','mentions','hashtags','id'])

# add objects into results
for i in range(0,maxtweets):
    tweet = got3.manager.TweetManager.getTweets(tweetCriteria)[i]
    rows = {'date':tweet.date,'username':tweet.username,'to':tweet.to,
            'replies':tweet.replies,'retweets':tweet.retweets,'favourites':tweet.favorites,
            'text':tweet.text,'geo':tweet.geo,'mentions':tweet.mentions,'hashtags':tweet.hashtags,
            'id':tweet.id}
    df = df.append(rows,ignore_index=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        10 non-null     object 
 1   username    10 non-null     object 
 2   to          7 non-null      object 
 3   replies     10 non-null     object 
 4   retweets    10 non-null     object 
 5   favorites   0 non-null      object 
 6   text        10 non-null     object 
 7   geo         10 non-null     object 
 8   mentions    10 non-null     object 
 9   hashtags    10 non-null     object 
 10  id          10 non-null     object 
 11  favourites  10 non-null     float64
dtypes: float64(1), object(11)
memory usage: 1.1+ KB


This forms the base of our data collection as we will utilise other methods to learn more about the users who have contributed to our chosen topic. We can then model this to classify our users and filter the original dataset of tweets into the groups estimated.

### Extracting account_metadata for Twitter's developer API

Twitter's API allows us to extract user information by providing an account id or username. You need to apply for a developer account (https://developer.twitter.com/en/apply-for-access) and then create a file with your keys to load in as credentials or you can also just write them in as  the 4 variables.

In [93]:
# create a function to authenticate our access to API
def authenticate(api_key, secret_key, access_token, secret_token):

    auth = tweepy.OAuthHandler(api_key, secret_key)
    auth.set_access_token(access_token, secret_token)
    api = tweepy.API(auth, 
                     wait_on_rate_limit=True, 
                     wait_on_rate_limit_notify=True)
  
    return api


api = authenticate(tc.api_key, tc.secret_key, tc.access_token, tc.secret_token)

In [94]:
# create a list of users from tweets
users = df.username.unique().tolist()
users

['RealFakeLiberal',
 'magdalewis',
 'UrsulaHogben',
 'BlackGroodle',
 'jones_kayemary1',
 'kiwirip',
 'rajatraihanda',
 'jakobnaumann',
 'LindaConno',
 'ApilG']

In [95]:
# create lists to store our API results
id_,username,screen_name_,location,url,description,verified,followers_count,friends_count,friends_count,favourites_count,statuses_count,created_at,default_profile, default_profile_image = ([] for i in range(15))

In [96]:
# connect to the API and retrieve user features for each user
for i in users:
    try:
        user = api.get_user(screen_name = i) 
        id_.append(user.id)
        username.append(user.name)
        screen_name_.append(user.screen_name)
        location.append(user.location)
        url.append(user.url)
        description.append(user.description)
        verified.append(user.verified)
        followers_count.append(user.followers_count)
        friends_count.append(user.friends_count)
        favourites_count.append(user.favourites_count)
        statuses_count.append(user.statuses_count)
        created_at.append(user.created_at)
        default_profile.append(user.default_profile)
        default_profile_image.append(user.default_profile_image)

    except tweepy.error.TweepError as t:
        if t.api_code == 50: # The code corresponding to the user not found error
            print("username that failed=",  screen_name)
            # accounts_L.remove(screen_name)
        elif t.api_code == 88: # The code for the rate limit error
            time.sleep(15*60) # Sleep for 15 minutes
    else:# if no error
        continue
        
# we will store them in accounts
accounts = pd.DataFrame({'id':id_,
                            'name': username,
                            'username': screen_name_,
                            'location': location,
                            'url': url,
                            'description': description,
                            'verified': verified,
                            'followers': followers_count,
                            'friends': friends_count,
                            'favourites_count': favourites_count,
                            'statuses_count': statuses_count,
                            'created_at': created_at,
                            'default_profile': default_profile,
                            'default_profile_image' : default_profile_image})

In [97]:
# take a look at the features for each users account
accounts.head()

Unnamed: 0,id,name,username,location,url,description,verified,followers,friends,favourites_count,statuses_count,created_at,default_profile,default_profile_image
0,744840375492214786,Primordial,RealFakeLiberal,,,,False,273,0,14277,25652,2016-06-20 10:32:38,True,False
1,44748523,lenalew,magdalewis,,,,False,20,408,435,983,2009-06-04 22:39:28,False,True
2,1282190791,Ursula Hogben,UrsulaHogben,"Sydney, Australia",https://t.co/e8QbmyDYSl,"Founder, lawyer, advisor & investor for sustai...",False,1743,2653,11211,4334,2013-03-20 03:22:34,False,False
3,85908137,Ranger,BlackGroodle,Australia,,"Soccer ⚽️ & basketball 🏀 nut, engineer, family...",False,35,270,2757,1916,2009-10-28 20:43:24,True,False
4,4260135373,Kaye Jones,jones_kayemary1,nagambie,,Strong advocate for justice. Enjoy the antics ...,False,1018,1890,38678,30644,2015-11-16 20:05:09,True,False


### Extracting tweet activity from these users

We can learn alot from account features such as creation date and location, but we can also extract information from what each user tweets about. We will now use the API to return the most recent 200 tweets from each user.

In [98]:
# create a function to call last 200 tweets
# some of the returns are dictionary objects and so requires a little more 
def get_tweets(screen_name,tweet_count):
    
    #make request for 200 tweets most recent
    new_tweets = api.user_timeline(screen_name = screen_name,tweet_mode='extended',count=tweet_count)

    #2D array
    tweets = [[tweet.id_str, 
               tweet.created_at, 
               tweet.full_text, 
               tweet.in_reply_to_screen_name, 
               tweet.in_reply_to_status_id,
               tweet.in_reply_to_user_id,
               tweet.retweet_count,
               tweet.favorite_count, 
               tweet.lang, 
               tweet.entities] for tweet in new_tweets]
    
    #transform into a df
    df = pd.DataFrame(tweets,columns=['id','created_at','text','in_reply_to_screen_name','in_reply_to_status_id','in_reply_to_user_id',
                                      'retweet_count','favourite_count','lang','entities'])
    
    return df

In [99]:
# take the users and id from our accounts API data - this filters out failed users
usernames = accounts.iloc[:,[2,0]]
usernames.head()

Unnamed: 0,username,id
0,RealFakeLiberal,744840375492214786
1,magdalewis,44748523
2,UrsulaHogben,1282190791
3,BlackGroodle,85908137
4,jones_kayemary1,4260135373


Failed usernames are accounts who have closed/suspended since tweets were collected. We could treat closed accounts as suspicious but there are too many reasons for this to happen and so its easier to just exclude them from the analysis. The return rate decreases when the time between collecting tweets and user info but when there is no lag between collection points - the return rate is > 99.9%.

In [100]:
# return the most recent tweets for each user
how_many_tweets = 200

tweets = pd.DataFrame(columns = ['id','created_at','text','in_reply_to_screen_name',
                                    'in_reply_to_status_id','in_reply_to_user_id',
                                    'retweet_count','favourite_count','lang','entities'])

for i,j in zip(usernames.username,usernames.id):
    try:
        df = get_tweets(i,how_many_tweets)
        df['screen_name'] = i
        df['user_id'] = str(j)
        tweets = pd.concat([tweets,df],sort=True)
        
    except tweepy.error.TweepError as t:
        if t.api_code == 34: # The code corresponding to the user not found error
            print("username that failed=",  i)
        elif t.api_code == 88: #rate limit error
            time.sleep(15*60) # Sleep for 15 minutes
    else:# if no error
        continue

In [101]:
tweets.head()

Unnamed: 0,created_at,entities,favourite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,retweet_count,screen_name,text,user_id
0,2020-05-02 13:15:57,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,1256573143038087169,Tim_H1980,1.256507e+18,1194816000.0,en,0,RealFakeLiberal,@Tim_H1980 @MikeCarlton01 Yep. It's why I neve...,744840375492214786
1,2020-05-02 13:08:11,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,1256571184881131520,,,,en,71,RealFakeLiberal,RT @MikeCarlton01: Shitferbrains is at it agai...,744840375492214786
2,2020-05-02 13:04:21,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,1256570221478871040,,,,en,416,RealFakeLiberal,RT @noplaceforsheep: I have no idea at all how...,744840375492214786
3,2020-05-02 13:03:43,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,1256570060493094917,MikeCarlton01,1.256503e+18,160856300.0,en,0,RealFakeLiberal,@MikeCarlton01 He can throw in Taylor and Robe...,744840375492214786
4,2020-05-02 12:58:38,"{'hashtags': [], 'symbols': [], 'user_mentions...",0,1256568784174739457,Origsmartassam,1.256488e+18,1.038749e+18,en,0,RealFakeLiberal,@Origsmartassam And the golf cart ? The pin ? ...,744840375492214786


In [102]:
# Create a second dataframe to put important information
tweets_final = pd.DataFrame(columns = ["id", "in_reply_to_screen_name", "in_reply_to_status_id", "in_reply_to_user_id",
                                      "user_mentions_screen_name", "user_mentions_id","retweet_count",
                                       "favourite_count",'lang',"text",'user_id', "screen_name"])

# joiner columns..
equal_columns = ["created_at", "id", "text"]
tweets_final[equal_columns] = tweets[equal_columns]

# create some functions to extract the info our of nested rows.
# Get the basic information about user 
def get_basics(tweets_final):
    tweets_final["screen_name"] = tweets["screen_name"]
    tweets_final["user_id"] = tweets["user_id"]
    return tweets_final

# Get the user mentions 
def get_usermentions(tweets_final):
    # Inside the tag 'entities' will find 'user mentions' and will get 'screen name' and 'id'
    tweets_final["user_mentions_screen_name"] = tweets["entities"].apply(lambda x: x["user_mentions"][0]["screen_name"] if x["user_mentions"] else np.nan)
    tweets_final["user_mentions_id"] = tweets["entities"].apply(lambda x: x["user_mentions"][0]["id_str"] if x["user_mentions"] else np.nan)
    return tweets_final

# Get the information about replies
def get_in_reply(tweets_final):
    # Just copy the 'in_reply' columns to the new dataframe
    tweets_final["in_reply_to_screen_name"] = tweets["in_reply_to_screen_name"]
    tweets_final["in_reply_to_status_id"] = tweets["in_reply_to_status_id"]
    tweets_final["in_reply_to_user_id"]= tweets["in_reply_to_user_id"]
    return tweets_final

# Lastly fill the new dataframe with the important information
def fill_df(tweets_final):
    get_basics(tweets_final)
    get_usermentions(tweets_final)
    get_in_reply(tweets_final)
    tweets_final["lang"]= tweets["lang"]
    tweets_final["retweet_count"]= tweets["retweet_count"]
    tweets_final["favourite_count"]= tweets["favourite_count"]
    return tweets_final


# Get the interactions between the different users
def get_interactions(row):
    # From every row of the original dataframe
    # First we obtain the 'user_id' and 'screen_name'
    user = row["user_id"], row["screen_name"]
    # Be careful if there is no user id
    if user[0] is None:
        return (None, None), []
    
    # The interactions are going to be a set of tuples
    interactions = set()
    
    # Add all interactions 
    # First, we add the interactions corresponding to replies adding the id and screen_name
    interactions.add((row["in_reply_to_user_id"], row["in_reply_to_screen_name"]))
    # After that, we add the interactions with retweets
    #interactions.add((row["retweeted_id"], row["retweeted_screen_name"]))
    # And later, the interactions with user mentions
    interactions.add((row["user_mentions_id"], row["user_mentions_screen_name"]))
    
    # Discard if user id is in interactions
    interactions.discard((row["user_id"], row["screen_name"]))
    # Discard all not existing values
    interactions.discard((None, None))
    # Return user and interactions
    return user, interactions

In [103]:
# final dataset for tweets
tweets_final = fill_df(tweets_final)
tweets_final.head()

Unnamed: 0,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,user_mentions_screen_name,user_mentions_id,retweet_count,favourite_count,lang,text,user_id,screen_name,created_at
0,1256573143038087169,Tim_H1980,1.256507e+18,1194816000.0,Tim_H1980,1194816090,0,0,en,@Tim_H1980 @MikeCarlton01 Yep. It's why I neve...,744840375492214786,RealFakeLiberal,2020-05-02 13:15:57
1,1256571184881131520,,,,MikeCarlton01,160856325,71,0,en,RT @MikeCarlton01: Shitferbrains is at it agai...,744840375492214786,RealFakeLiberal,2020-05-02 13:08:11
2,1256570221478871040,,,,noplaceforsheep,233343317,416,0,en,RT @noplaceforsheep: I have no idea at all how...,744840375492214786,RealFakeLiberal,2020-05-02 13:04:21
3,1256570060493094917,MikeCarlton01,1.256503e+18,160856300.0,MikeCarlton01,160856325,0,0,en,@MikeCarlton01 He can throw in Taylor and Robe...,744840375492214786,RealFakeLiberal,2020-05-02 13:03:43
4,1256568784174739457,Origsmartassam,1.256488e+18,1.038749e+18,Origsmartassam,1038748858241581056,0,0,en,@Origsmartassam And the golf cart ? The pin ? ...,744840375492214786,RealFakeLiberal,2020-05-02 12:58:38


In [104]:
# get rid of null for feature engineering
tweets_final = tweets_final.where((pd.notnull(tweets_final)), None)

We now have a twitter feed and the user account info, most recent tweets for all those who contributed!

In [105]:
tweets_final.columns

Index(['id', 'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'user_mentions_screen_name', 'user_mentions_id',
       'retweet_count', 'favourite_count', 'lang', 'text', 'user_id',
       'screen_name', 'created_at'],
      dtype='object')

In [106]:
# format for feature engineering used in project
accounts = accounts[['id','name','username','location','url','description',
                                'verified','followers','friends','favourites_count',
                                'statuses_count','created_at','default_profile','default_profile_image']]

tweets = tweets_final.iloc[:,[0,-1,-2,-4]]
tweets.columns = ['tweetid','datetime','screen_name','text']
tweets = tweets[['screen_name','text','tweetid','datetime']]

In [None]:
# save out to send for feature engineering