Note: This notebook will not run without Twitter API keys.

# Setup

In [1]:
!pip install tweepy



In [2]:
import os
import time
import tweepy as tw
import pandas as pd

# Load Tweet IDs

### Load Waseem 2016 IDs

In [3]:
def loadWaseem(filepath_in):
    """Load the Waseem 2016 tweet IDs together with their expert labels.

    Parameters
    ----------
    filepath_in : str
        Path to the tab-separated file; only its ``TweetID`` and ``Expert``
        columns are read.

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id`` and ``label``, where ``label`` is the integer
        code 0 (neither), 1 (racism), 2 (sexism) or 3 (both).
    """
    # source column -> output column
    column_names = {'TweetID': 'tweet_id', 'Expert': 'label'}
    # expert annotation -> integer code
    label_codes = {'neither': 0, 'racism': 1, 'sexism': 2, 'both': 3}

    raw = pd.read_csv(filepath_in,
                      sep='\t',
                      index_col=False,
                      usecols=list(column_names))
    renamed = raw.rename(columns=column_names)

    return renamed.assign(label=renamed['label'].map(label_codes))

In [4]:
# load the Waseem 2016 tweet IDs and their integer-coded expert labels
waseem_id_df = loadWaseem(filepath_in = '../data/downloaded/hatespeech-master/NLP+CSS_2016.csv')

### Load Jha Mamidi IDs

In [5]:
def loadJhaMamidi(hostile_filepath_in, benevolent_filepath_in):
    """Load the Jha and Mamidi tweet IDs and tag each file with its label code.

    Parameters
    ----------
    hostile_filepath_in : str
        Path to the header-less file of hostile tweet IDs (label 4).
    benevolent_filepath_in : str
        Path to the header-less file of benevolent tweet IDs (label 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``tweet_id`` and ``label``; hostile rows come first. Each
        part keeps its own row index, so index values repeat across the parts.
    """
    labelled_parts = []

    # read hostile first, then benevolent, so the stacked order is stable
    for source_path, label_code in ((hostile_filepath_in, 4),
                                    (benevolent_filepath_in, 5)):
        part = pd.read_csv(source_path, header=None, names=['tweet_id'])
        part['label'] = label_code
        labelled_parts.append(part)

    return pd.concat(labelled_parts)

In [6]:
# load the Jha and Mamidi tweet IDs; hostile rows get label 4, benevolent rows label 5
jhamamidi_id_df = loadJhaMamidi(hostile_filepath_in='../data/downloaded/NLP_CSS_2017-master/hostile_sexist.tsv',
                             benevolent_filepath_in='../data/downloaded/NLP_CSS_2017-master/benevolent_sexist.tsv')

### Combine IDs

In [7]:
# stack both ID sets, then renumber the rows from zero; each row's position
# within its source dataframe is kept in the resulting `index` column
id_df = pd.concat([waseem_id_df, jhamamidi_id_df]).reset_index()
# show five random rows as a sanity check
id_df.sample(5)

Unnamed: 0,index,tweet_id,label
6338,6338,575544108325978113,0
150,150,575860144355041280,2
15142,4855,839681051605688320,5
92,92,569612886722486272,0
5204,5204,598776035379843072,0


# Query tweets

### Initialize Twitter API

In [8]:
def initializeAPI(consumer_key, consumer_secret):
    """Create an application-only tweepy API client that waits out rate limits.

    Works with both tweepy 3.x and tweepy 4.x, whose auth-handler name and
    ``API`` keyword arguments differ.

    Parameters
    ----------
    consumer_key : str
        Twitter application consumer key.
    consumer_secret : str
        Twitter application consumer secret.

    Returns
    -------
    tweepy.API
        Client created with ``wait_on_rate_limit=True``; rate-limit
        notifications are also enabled when the installed tweepy supports
        that option.
    """
    # Newer tweepy releases name the app-only handler OAuth2AppHandler and keep
    # AppAuthHandler only as a deprecated alias; older ones have only the latter.
    handler_cls = getattr(tw, 'OAuth2AppHandler', None) or tw.AppAuthHandler
    auth = handler_cls(consumer_key, consumer_secret)

    try:
        api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
    except TypeError:
        # tweepy >= 4.0 no longer accepts wait_on_rate_limit_notify
        api = tw.API(auth, wait_on_rate_limit=True)
    return api

In [9]:
# initialize twitter API
# Credentials are read from the environment so that secrets never have to be
# pasted into (and saved with) the notebook. Set TWITTER_CONSUMER_KEY and
# TWITTER_CONSUMER_SECRET before starting Jupyter; when they are unset both
# values fall back to the empty string.
my_consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
my_consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
api = initializeAPI(consumer_key=my_consumer_key,
                    consumer_secret=my_consumer_secret)

In [10]:
# quick test: fetch a single tweet by its ID and display the text
api.get_status(839880162586071040).text

'RT @GemmaAnneStyles: Happy #womensday to all my sassy women, classy women, nasty women 💋👧🏾💖👩🏼\u200d🔬👄👵🏼 my fine women, alive women, gonna fight…'

### Query tweets based on IDs

In [11]:
def queryTweets(tweet_df, api, tweet_filepath_out, error_filepath_out, all_tweets=True, tweet_size=15):
    """Fetch the text of each tweet by ID and count failed lookups per label.

    Parameters
    ----------
    tweet_df : pandas.DataFrame
        Must contain the columns ``tweet_id`` and ``label``. The frame is not
        modified, and any index is accepted because rows are visited by
        position.
    api : object
        Client exposing ``get_status(tweet_id)`` whose result has a ``text``
        attribute.
    tweet_filepath_out : str
        CSV path the tweets (``tweet_id``, ``text``, ``label``) are written to.
    error_filepath_out : str
        CSV path the per-label error counts are written to.
    all_tweets : bool, default True
        Query every row when True, otherwise only the first ``tweet_size``.
    tweet_size : int, default 15
        Number of leading rows to query when ``all_tweets`` is False; it is
        capped at the number of rows available.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The tweets frame, whose ``text`` is None for rows that failed or were
        not queried, and the error frame with one row per label
        (``label``, ``error_count``).

    Raises
    ------
    KeyError
        If ``tweet_df`` lacks ``tweet_id`` or ``label``.
    """
    # work on a private copy so the caller's dataframe is left untouched
    result_df = tweet_df[['tweet_id', 'label']].copy()
    tweet_ids = result_df['tweet_id'].tolist()
    tweet_labels = result_df['label'].tolist()

    # define range for tweets to query, never running past the last row
    total_rows = len(result_df)
    range_size = total_rows if all_tweets else max(0, min(tweet_size, total_rows))

    # one counter per distinct label, keyed by the label value itself so that
    # label sets which do not run 0..k-1 are counted correctly
    error_count = {label: 0
                   for label in sorted(result_df['label'].dropna().unique().tolist())}

    # texts are collected positionally and attached to the frame in one step,
    # which avoids chained-indexing assignment into the dataframe
    texts = [None] * total_rows

    for i in range(range_size):
        try:
            texts[i] = api.get_status(tweet_ids[i]).text
        except Exception:
            # best effort: a failed lookup leaves the text empty and is
            # counted; KeyboardInterrupt/SystemExit are deliberately not
            # caught so a long run can still be stopped
            failed_label = tweet_labels[i]
            error_count[failed_label] = error_count.get(failed_label, 0) + 1

        # print progress every 100 tweets
        if (i+1) % 100 == 0:
            print((i+1), 'tweets queried of', range_size, 'total tweets.')

    # column order: tweet_id, text, label
    result_df.insert(1, 'text', texts)

    # create error tracking dataframe
    error_df = pd.DataFrame({'label': list(error_count.keys()),
                             'error_count': list(error_count.values())})

    # write tweets and error tracker to csv
    result_df.to_csv(path_or_buf=tweet_filepath_out, index=False)
    error_df.to_csv(path_or_buf=error_filepath_out, index=False)

    # print error count
    print(f'There were {sum(error_count.values())} total errors of {range_size} total tweets.')
    for label, count in error_count.items():
        print(f'    Label {label} had {count} errors.')

    # return tweets
    return result_df, error_df

In [12]:
%%time

# query every tweet listed in id_df; the tweets and the per-label error counts
# are also written to the two CSV files below.
# NOTE: the recorded run took about 4.5 hours of wall time, with repeated
# rate-limit sleeps.
tweet_df, error_df = queryTweets(tweet_df=id_df,
                                 api=api,
                                 tweet_filepath_out='../data/derived/tweets_query.csv',
                                 error_filepath_out='../data/derived/track_error_query.csv',
                                 all_tweets=True)

100 tweets queried of 17492 total tweets.
200 tweets queried of 17492 total tweets.
300 tweets queried of 17492 total tweets.
400 tweets queried of 17492 total tweets.
500 tweets queried of 17492 total tweets.
600 tweets queried of 17492 total tweets.
700 tweets queried of 17492 total tweets.
800 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 45


900 tweets queried of 17492 total tweets.
1000 tweets queried of 17492 total tweets.
1100 tweets queried of 17492 total tweets.
1200 tweets queried of 17492 total tweets.
1300 tweets queried of 17492 total tweets.
1400 tweets queried of 17492 total tweets.
1500 tweets queried of 17492 total tweets.
1600 tweets queried of 17492 total tweets.
1700 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 788


1800 tweets queried of 17492 total tweets.
1900 tweets queried of 17492 total tweets.
2000 tweets queried of 17492 total tweets.
2100 tweets queried of 17492 total tweets.
2200 tweets queried of 17492 total tweets.
2300 tweets queried of 17492 total tweets.
2400 tweets queried of 17492 total tweets.
2500 tweets queried of 17492 total tweets.
2600 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 792


2700 tweets queried of 17492 total tweets.
2800 tweets queried of 17492 total tweets.
2900 tweets queried of 17492 total tweets.
3000 tweets queried of 17492 total tweets.
3100 tweets queried of 17492 total tweets.
3200 tweets queried of 17492 total tweets.
3300 tweets queried of 17492 total tweets.
3400 tweets queried of 17492 total tweets.
3500 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 783


3600 tweets queried of 17492 total tweets.
3700 tweets queried of 17492 total tweets.
3800 tweets queried of 17492 total tweets.
3900 tweets queried of 17492 total tweets.
4000 tweets queried of 17492 total tweets.
4100 tweets queried of 17492 total tweets.
4200 tweets queried of 17492 total tweets.
4300 tweets queried of 17492 total tweets.
4400 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 788


4500 tweets queried of 17492 total tweets.
4600 tweets queried of 17492 total tweets.
4700 tweets queried of 17492 total tweets.
4800 tweets queried of 17492 total tweets.
4900 tweets queried of 17492 total tweets.
5000 tweets queried of 17492 total tweets.
5100 tweets queried of 17492 total tweets.
5200 tweets queried of 17492 total tweets.
5300 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 790


5400 tweets queried of 17492 total tweets.
5500 tweets queried of 17492 total tweets.
5600 tweets queried of 17492 total tweets.
5700 tweets queried of 17492 total tweets.
5800 tweets queried of 17492 total tweets.
5900 tweets queried of 17492 total tweets.
6000 tweets queried of 17492 total tweets.
6100 tweets queried of 17492 total tweets.
6200 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 790


6300 tweets queried of 17492 total tweets.
6400 tweets queried of 17492 total tweets.
6500 tweets queried of 17492 total tweets.
6600 tweets queried of 17492 total tweets.
6700 tweets queried of 17492 total tweets.
6800 tweets queried of 17492 total tweets.
6900 tweets queried of 17492 total tweets.
7000 tweets queried of 17492 total tweets.
7100 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 789


7200 tweets queried of 17492 total tweets.
7300 tweets queried of 17492 total tweets.
7400 tweets queried of 17492 total tweets.
7500 tweets queried of 17492 total tweets.
7600 tweets queried of 17492 total tweets.
7700 tweets queried of 17492 total tweets.
7800 tweets queried of 17492 total tweets.
7900 tweets queried of 17492 total tweets.
8000 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 789


8100 tweets queried of 17492 total tweets.
8200 tweets queried of 17492 total tweets.
8300 tweets queried of 17492 total tweets.
8400 tweets queried of 17492 total tweets.
8500 tweets queried of 17492 total tweets.
8600 tweets queried of 17492 total tweets.
8700 tweets queried of 17492 total tweets.
8800 tweets queried of 17492 total tweets.
8900 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 791


9000 tweets queried of 17492 total tweets.
9100 tweets queried of 17492 total tweets.
9200 tweets queried of 17492 total tweets.
9300 tweets queried of 17492 total tweets.
9400 tweets queried of 17492 total tweets.
9500 tweets queried of 17492 total tweets.
9600 tweets queried of 17492 total tweets.
9700 tweets queried of 17492 total tweets.
9800 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 789


9900 tweets queried of 17492 total tweets.
10000 tweets queried of 17492 total tweets.
10100 tweets queried of 17492 total tweets.
10200 tweets queried of 17492 total tweets.
10300 tweets queried of 17492 total tweets.
10400 tweets queried of 17492 total tweets.
10500 tweets queried of 17492 total tweets.
10600 tweets queried of 17492 total tweets.
10700 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 792


10800 tweets queried of 17492 total tweets.
10900 tweets queried of 17492 total tweets.
11000 tweets queried of 17492 total tweets.
11100 tweets queried of 17492 total tweets.
11200 tweets queried of 17492 total tweets.
11300 tweets queried of 17492 total tweets.
11400 tweets queried of 17492 total tweets.
11500 tweets queried of 17492 total tweets.
11600 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 794


11700 tweets queried of 17492 total tweets.
11800 tweets queried of 17492 total tweets.
11900 tweets queried of 17492 total tweets.
12000 tweets queried of 17492 total tweets.
12100 tweets queried of 17492 total tweets.
12200 tweets queried of 17492 total tweets.
12300 tweets queried of 17492 total tweets.
12400 tweets queried of 17492 total tweets.
12500 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 798


12600 tweets queried of 17492 total tweets.
12700 tweets queried of 17492 total tweets.
12800 tweets queried of 17492 total tweets.
12900 tweets queried of 17492 total tweets.
13000 tweets queried of 17492 total tweets.
13100 tweets queried of 17492 total tweets.
13200 tweets queried of 17492 total tweets.
13300 tweets queried of 17492 total tweets.
13400 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 799


13500 tweets queried of 17492 total tweets.
13600 tweets queried of 17492 total tweets.
13700 tweets queried of 17492 total tweets.
13800 tweets queried of 17492 total tweets.
13900 tweets queried of 17492 total tweets.
14000 tweets queried of 17492 total tweets.
14100 tweets queried of 17492 total tweets.
14200 tweets queried of 17492 total tweets.
14300 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 791


14400 tweets queried of 17492 total tweets.
14500 tweets queried of 17492 total tweets.
14600 tweets queried of 17492 total tweets.
14700 tweets queried of 17492 total tweets.
14800 tweets queried of 17492 total tweets.
14900 tweets queried of 17492 total tweets.
15000 tweets queried of 17492 total tweets.
15100 tweets queried of 17492 total tweets.
15200 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 792


15300 tweets queried of 17492 total tweets.
15400 tweets queried of 17492 total tweets.
15500 tweets queried of 17492 total tweets.
15600 tweets queried of 17492 total tweets.
15700 tweets queried of 17492 total tweets.
15800 tweets queried of 17492 total tweets.
15900 tweets queried of 17492 total tweets.
16000 tweets queried of 17492 total tweets.
16100 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 797


16200 tweets queried of 17492 total tweets.
16300 tweets queried of 17492 total tweets.
16400 tweets queried of 17492 total tweets.
16500 tweets queried of 17492 total tweets.
16600 tweets queried of 17492 total tweets.
16700 tweets queried of 17492 total tweets.
16800 tweets queried of 17492 total tweets.
16900 tweets queried of 17492 total tweets.
17000 tweets queried of 17492 total tweets.


Rate limit reached. Sleeping for: 789


17100 tweets queried of 17492 total tweets.
17200 tweets queried of 17492 total tweets.
17300 tweets queried of 17492 total tweets.
17400 tweets queried of 17492 total tweets.
There were 6193 total errors of 17492 total tweets.
    Label 0 had 439 errors.
    Label 1 had 34 errors.
    Label 2 had 350 errors.
    Label 3 had 26 errors.
    Label 4 had 661 errors.
    Label 5 had 4683 errors.
CPU times: user 2min 41s, sys: 7.83 s, total: 2min 49s
Wall time: 4h 34min 55s


In [13]:
# quick peek at the first ten queried tweets; an empty text means the lookup failed
tweet_df.head(10)

Unnamed: 0,tweet_id,text,label
0,597576902212063232,Cisco had to deal with a fat cash payout to th...,0
1,565586175864610817,"@MadamPlumpette I'm decent at editing, no worr...",0
2,563881580209246209,@girlziplocked will read. gotta go afk for a b...,0
3,595380689534656512,guys. show me the data. show me your github. t...,0
4,563757610327748608,@tpw_rules nothings broken. I was just driving...,0
5,563082741370339330,ur face is classified as a utility by the FCC.,0
6,596962098845851648,@lysandraws yay! Absolutely. I'm not gone unti...,0
7,563874350038675457,"RT @kashiichan: ""It really feels like the @twi...",0
8,597240424873394176,@SirenSailor rtfm. http://t.co/jaMXHikl3u,0
9,571030421103910912,,1


# Data dictionary

`tweet_id` integer with tweet ID to query from twitter API

`text` text body of tweet

`label` integer class code (0–5) for the label from the source datasets
* 0. neither, expert label from Waseem 2016
* 1. racist, expert label from Waseem 2016
* 2. sexist, expert label from Waseem 2016
* 3. both, expert label from Waseem 2016
* 4. hostile, label from Jha and Mamidi 2017
* 5. benevolent, label from Jha and Mamidi 2017

# References
* [Jha and Mamidi 2017](https://www.aclweb.org/anthology/W17-2902.pdf)
* [Waseem 2016](https://www.aclweb.org/anthology/W16-5618.pdf)