In [1]:
#Loaded by default (c.f.: ~/.ipython/profile_default/startup)
#Snippet location: ./anaconda3/share/jupyter/nbextensions/snippets/snippets.json
from __future__ import division, print_function
import numpy as np
import pandas as pd
from pandas_summary import DataFrameSummary
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.gofplots import qqplot
pd.set_option('display.max_columns', None)
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# NLP Using the Twitter API:  Lab

<br>

---


<img src="https://snag.gy/RNAEgP.jpg" width="600">

### Can we correctly identify which of these two old men tweeted what?


## Goals
---

We are going to attempt to classify whether a tweet comes from Trump or Sanders.  This project involves multiple steps:
- Create a developer account on Twitter
- Create a method to pull a list of tweets from the Twitter API
- Perform proper preprocessing on our text
- Engineer sentiment feature in our dataset using TextBlob
- Explore supervised classification techniques


## Twitter API Developer Registration
---

[Twitter Rest API](https://dev.twitter.com/rest/public)



## Create an "App"

---

![](https://snag.gy/HPBQbJ.jpg)

Go to Twitter and register an "app" [apps.twitter.com](https://apps.twitter.com/).

> **Note**: For the required website field you can put a placeholder.

After we set up our app, we will only need to reference the cooresponding keys Twitter generates for our app.  These are the keys that we will use with our application to communicate with the Twitter API.

## Install Python Twitter API library

---

We will use Python Twitter Tools. It makes pulling tweets simple: we only need to plug in our keys and start collecting data.<br>
[Python Twitter Tools](http://mike.verdone.ca/twitter/).

To install it, just run the next frame (there is no conda package).

In [2]:
# pip install twitter python-twitter

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Some Boring Twitter Rules
---

**Twitter notifies you they will rate limit your requests:**

>When using application-only authentication, rate limits are determined globally for the entire application. If a method allows for 15 requests per rate limit window, then it allows you to make 15 requests per window — on behalf of your application. This limit is considered completely separately from per-user limits. https://dev.twitter.com/rest/public/rate-limiting

Here's a quick overview of what Twitter says are "the rules":

![](https://snag.gy/yJ6vIH.jpg)


## About those Keys: OAuth Review
---

![](https://g.twimg.com/dev/documentation/image/appauth_0.png)

## Our Application Keys
---

Note application keys, we will use it to connect to Twitter account and mine tweets from the official Bernie Sanders and Donald Trump twitter accounts:

## `TweetMiner` class structure

---

The following will provide connectivity to twitter. The class has the ability to make requests and can eventually transform the JSON responses into DataFrames.


> **Note:** "request_limit" is used in this class to limit the number of tweets that are pulled per instance request.  Setting it to something lower until we've worked the bugs out of the request, and captured the data we want, is essential to avoiding the rate limit blocks.

In [3]:
import twitter
import re
import datetime
import pandas as pd

twitter_keys = {
    'consumer_key':        'lA6UQplM5sxIAIr83ueUl9sgE',
    'consumer_secret':     'f94t4BD6Vj7aCVkX1qAIVwsP4x69J2vXvm61lTIuwb9GfmdsuP',
    'access_token_key':    '1203210464-dYE1FvoUx1GjVcoyok3U1brWDpELBJEcSNGC1OC',
    'access_token_secret': 'kuVli1j010RPdYTboz9iIyp8QxX6PqRApdi49baLqBgzo'
}

api = twitter.Api(
    consumer_key=twitter_keys['consumer_key'],
    consumer_secret=twitter_keys['consumer_secret'],
    access_token_key=twitter_keys['access_token_key'],
    access_token_secret=twitter_keys['access_token_secret']
)

In [4]:
class TweetMiner(object):

    result_limit = 20
    api = False
    data = []

    twitter_keys = {
        'consumer_key':        'KmN03M1X1pImZ43sqdIu4yfnE',
        'consumer_secret':     'ePlIrIX5VXbZnO7DBu1RbFlw5lOai9dQr9n5TZb6vxnIdrr5Fz',
        'access_token_key':    '185036086-Q7K5IjuSoQZJwSIqD0wyHf6t62iPKatmfaPkriAM',
        'access_token_secret': 'cYpQz3xWHQbLplOj8iSeiNSOmMcsTOXmWcKMrJ9buLj5d'
    }

    def __init__(self, keys_dict, api, result_limit=20):

        self.api = api
        self.twitter_keys = keys_dict

        self.result_limit = result_limit

    def mine_user_tweets(self, user="dyerrington", mine_rewteets=False, max_pages=5):

        data = []
        last_tweet_id = False
        page = 1

        while page <= max_pages:

            if last_tweet_id:
                statuses = self.api.GetUserTimeline(
                    screen_name=user, count=self.result_limit, max_id=last_tweet_id - 1)
            else:
                statuses = self.api.GetUserTimeline(
                    screen_name=user, count=self.result_limit)

            for item in statuses:

                mined = {
                    'tweet_id':        item.id,
                    'handle':          item.user.name,
                    'retweet_count':   item.retweet_count,
                    'text':            item.text,
                    'mined_at':        datetime.datetime.now(),
                    'created_at':      item.created_at,
                }

                last_tweet_id = item.id
                data.append(mined)

            page += 1

        return data

## Instantiate the class
---

Pass the keys dictionary and the api as arguments.

**Check:** call the object's `mine_user_tweets()` method, providing a user to pull the tweets of.

In [5]:
miner = TweetMiner(twitter_keys, api, result_limit=2)

In [6]:
sanders = miner.mine_user_tweets(user="berniesanders", max_pages=5)
donald = miner.mine_user_tweets(user="realDonaldTrump", max_pages=5)

In [7]:
print(sanders[0])

{'tweet_id': 995034893687119872, 'handle': 'Bernie Sanders', 'retweet_count': 101, 'text': 'RT @RichLazerPHL: Proud to have the endorsement of @BernieSanders in #PA05 -- I look forward to fighting alongside him for working families…', 'mined_at': datetime.datetime(2018, 5, 14, 16, 53, 26, 673396), 'created_at': 'Fri May 11 20:16:20 +0000 2018'}


In [8]:
print(donald[0])

{'tweet_id': 996046634864791557, 'handle': 'Donald J. Trump', 'retweet_count': 3709, 'text': '#USEmbassyJerusalem https://t.co/f1SFvrkcAH', 'mined_at': datetime.datetime(2018, 5, 14, 16, 53, 28, 47705), 'created_at': 'Mon May 14 15:16:38 +0000 2018'}


### Convert the tweet ouputs to a pandas DataFrame

> *This is as easy as passing it to the DataFrame constructor!*

In [9]:
pd.DataFrame(sanders).head()

Unnamed: 0,created_at,handle,mined_at,retweet_count,text,tweet_id
0,Fri May 11 20:16:20 +0000 2018,Bernie Sanders,2018-05-14 16:53:26.673396,101,RT @RichLazerPHL: Proud to have the endorsemen...,995034893687119872
1,Sun May 06 19:29:26 +0000 2018,Bernie Sanders,2018-05-14 16:53:26.673404,170,Bernie just returned from a campaign swing wit...,993211151373819909
2,Sun May 06 19:29:06 +0000 2018,Bernie Sanders,2018-05-14 16:53:27.046898,261,Thanks to all the people who have provided sup...,993211068179730433
3,Sun May 06 19:28:11 +0000 2018,Bernie Sanders,2018-05-14 16:53:27.046905,798,If we are going to turn this country around po...,993210837711060993
4,Sat May 05 19:24:16 +0000 2018,Bernie Sanders,2018-05-14 16:53:27.285391,254,RT @AriRabinHavt: . @edwardsforpa agrees with ...,992847463034900481


##  Create the training data

---

Let's get our "mined" data from the Twitter API.  

1. Mine Trump tweets
- Create a tweet DataFrame
- Mine Sanders tweets
- Append the results to our DataFrame

In [10]:
# we only need to "instantiate" once.  Then we can call mine_user_tweets as much as we want.
miner = TweetMiner(twitter_keys, api, result_limit=400)
trump_tweets = miner.mine_user_tweets("realDonaldTrump")

In [11]:
trump_df = pd.DataFrame(trump_tweets)
print(trump_df.shape)

(1000, 6)


In [12]:
bernie_tweets = miner.mine_user_tweets('berniesanders')

In [13]:
bernie_df = pd.DataFrame(bernie_tweets)
print(bernie_df.shape)

(998, 6)


In [14]:
tweets = pd.concat([trump_df, bernie_df], axis=0)
tweets.shape

(1998, 6)

## Any interesting ngrams going on with Trump?
---

Set up a vectorizer from sklearn and fit the text of Trump's tweets with an ngram range from 2 to 4. <br>
Lets figure out what the most common ngrams are.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
vect = TfidfVectorizer(ngram_range=(2, 4))

# Pulls all of trumps tweet text's into one giant string
summaries = "".join(trump_df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

Counter(ngrams_summaries).most_common(20)

[('https co', 817),
 ('of the', 96),
 ('in the', 68),
 ('to the', 57),
 ('will be', 44),
 ('fake news', 43),
 ('for the', 40),
 ('on the', 39),
 ('our country', 35),
 ('at the', 35),
 ('and the', 30),
 ('we are', 30),
 ('thank you', 30),
 ('want to', 29),
 ('all of', 29),
 ('honor to', 29),
 ('with the', 28),
 ('united states', 24),
 ('was my', 24),
 ('the https', 24)]

### Look at the ngrams for Bernie Sanders

In [16]:
# We can use the TfidfVectorizer to find ngrams for us
vect = TfidfVectorizer(ngram_range=(2, 4))

# Pulls all of trumps tweet text's into one giant string
summaries = "".join(bernie_df['text'])
ngrams_summaries = vect.build_analyzer()(summaries)

Counter(ngrams_summaries).most_common(20)

[('https co', 445),
 ('health care', 139),
 ('of the', 65),
 ('bernie sanders', 57),
 ('in the', 52),
 ('to the', 51),
 ('we must', 44),
 ('we need', 41),
 ('for the', 41),
 ('we are', 37),
 ('this country', 36),
 ('for all', 36),
 ('is not', 36),
 ('millions of', 34),
 ('on the', 30),
 ('going to', 29),
 ('it is', 29),
 ('and the', 28),
 ('is the', 27),
 ('should be', 27)]

## Processing the tweets and building a model

---

To do classfication we will need to convert the tweets into a set of features.

***Will need to:***
- Vectorize input text data.
- Intialize a model (try Logistic regression).
- Train / Predict / cross-validate.
- Evaluate the performance of the model.

> I noticed that there are website links in the tweets so we can do some additional preprocessing before building the model?


In [20]:
# !pip install textacy

In [17]:
# Using the textacy package to do some more comprehensive preprocessing
# http://textacy.readthedocs.io/en/latest/
from textacy.preprocess import preprocess_text

tweet_text = tweets['text'].values
clean_text = [preprocess_text(x, fix_unicode=True, lowercase=True, transliterate=False,
                              no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True,
                              no_punct=True, no_accents=True)
              for x in tweet_text]

In [18]:
print(tweet_text[0:3])

['#USEmbassyJerusalem https://t.co/f1SFvrkcAH'
 'Big day for Israel. Congratulations!'
 'U.S. Embassy opening in Jerusalem will be covered live on @FoxNews &amp; @FoxBusiness. Lead up to 9:00 A.M. (eastern) e… https://t.co/VncqHcDmLG']


In [19]:
print(clean_text[0:3])

['usembassyjerusalem url', 'big day for israel congratulations', 'u s embassy opening in jerusalem will be covered live on foxnews foxbusiness lead up to 9 00 a m eastern e url']


In [20]:
# target is the handle.
# make trump 1 and sanders 0
y = tweets['handle'].map(lambda x: 1 if x == 'Donald J. Trump' else 0).values
print(np.mean(y))

0.5005005005005005


In [21]:
from sklearn.linear_model import LogisticRegression

# Preprocess our text data to Tfidf
tfv = TfidfVectorizer(ngram_range=(1, 4), max_features=2000)
X = tfv.fit_transform(clean_text).todense()
print(X.shape)

(1998, 2000)


In [22]:
# cross-validate the accuracy:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(LogisticRegression(), X, y, cv=10)

import numpy as np
print(accuracies)
print(np.mean(accuracies))

# Setup logistic regression (or try another classification method here)
estimator = LogisticRegression()
estimator.fit(X, y)

[0.81       0.89       0.915      0.925      0.885      0.93
 0.93       0.93       0.90452261 0.89447236]
0.9013994974874372


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
# Very good accuracy considering the baseline is 50%

## Check the predicted probability for a random Sanders and Trump tweet
---

Below are a couple of tweets from both Sanders and Trump.

Estimate the predicted probability of being trump for the two tweets.

In [23]:
# Prep our source as TfIdf vectors
source_test = [
    "Demanding that the wealthy and the powerful start paying their fair share of taxes that's exactly what the American people want.",
    "Crooked Hillary is spending tremendous amounts of Wall Street money on false ads against me. She is a very dishonest person!"
]

############
# NOTE:  Do not re-initialize the tfidf vectorizor or the feature space willbe overwritten and
# the transform will not match the number of features trained the model on.
#
# This is why we only need to "transform" since we have already "fit" previously
#
####

Xtest = tfv.transform(source_test)

# Predict using previously trained logist regression `estimator`
estimator.predict_proba(Xtest)

array([[0.71410576, 0.28589424],
       [0.285973  , 0.714027  ]])

In [29]:
# The 1st column is probability of being Bernie, and 2nd Trump. The classifier is getting it right.

### Pull tweets for some new users.

Experiment using more data. The API will not like it if we blow through their limits so we have to  be careful and try to grab only what we need one time, then work on the copy of the objects that are returned.

> NOTE: Read the documentation about rate limits to see the options available in the API to avoid this problem.

**Pull tweets for more than two different users **

In [30]:
# We deviate from trump / sanders using student tweets here to illustrate the NLP pipeine with twitter data

twitter_handles = ["dril", "LaziestCanine", 'ch000ch']
tweets = {}

for twitter_handle in twitter_handles:
    print("Mining tweets for: ", twitter_handle)
    miner = TweetMiner(twitter_keys, api, result_limit=500)
    tweets[twitter_handle] = miner.mine_user_tweets(
        user=twitter_handle, max_pages=10)

Mining tweets for:  dril
Mining tweets for:  LaziestCanine
Mining tweets for:  ch000ch


In [31]:
multi = pd.DataFrame(tweets['dril'])
multi = multi.append(pd.DataFrame(tweets['LaziestCanine']))
multi = multi.append(pd.DataFrame(tweets['ch000ch']))

print(multi.shape)

(5985, 6)


In [32]:
multi.handle.value_counts()

chuuch      1996
wint        1995
Lazy dog    1994
Name: handle, dtype: int64

### Build a multi-class classification model to distinguish between the users.

Try a new model.

In [33]:
tweet_text = multi['text'].values
clean_text = [preprocess_text(x, fix_unicode=True, lowercase=True, transliterate=False,
                              no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True,
                              no_punct=True, no_accents=True)
              for x in tweet_text]

y = multi['handle'].map(
    lambda x: 0 if x == 'wint' else 1 if x == 'Lazy dog' else 2).values

In [34]:
tfv = TfidfVectorizer(ngram_range=(1, 3), max_features=2500)
X = tfv.fit_transform(clean_text)

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

rf = RandomForestClassifier(n_estimators=250, verbose=1)
knn = KNeighborsClassifier(n_neighbors=7)

rf.fit(X_train, y_train)
knn.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    5.5s finished


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [37]:
# Random forest score:
print('RF:', rf.score(X_test, y_test))
print('KNN:', knn.score(X_test, y_test))

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.4s finished


RF: 0.7032293986636972
KNN: 0.4142538975501114


In [38]:
# Baseline score:
multi.handle.value_counts()/multi.shape[0]

chuuch      0.333500
wint        0.333333
Lazy dog    0.333166
Name: handle, dtype: float64

In [39]:
rf_yhat = knn.predict(X_test)

### Make a confusion matrix and classification report.

In [40]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, rf_yhat))

             precision    recall  f1-score   support

          0       0.66      0.10      0.18       630
          1       0.36      0.80      0.50       585
          2       0.51      0.36      0.42       581

avg / total       0.52      0.41      0.36      1796



In [41]:
# Confusion Matrix
print(confusion_matrix(y_test, rf_yhat))

[[ 64 458 108]
 [ 23 470  92]
 [ 10 361 210]]


### Most and least "distinctive" tweets for each user?

To find this, we check the tweet that has the highest (correct) predicted probability of being that user's tweet for each user.

In [42]:
rf.fit(X, y)
pp = rf.predict_proba(X)

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    8.4s finished
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    1.1s finished


In [43]:
pp[0:5]

array([[0.952     , 0.012     , 0.036     ],
       [0.47577615, 0.46609291, 0.05813095],
       [0.888     , 0.02      , 0.092     ],
       [0.788     , 0.04      , 0.172     ],
       [0.84364824, 0.08031591, 0.07603586]])

In [44]:
pp = pd.DataFrame(pp, columns=['dril_pp', 'laziestcanine_pp', 'ch000ch_pp'])

In [45]:
print(multi.shape, pp.shape)

(5985, 6) (5985, 3)


In [46]:
tweets_pp = pd.concat([multi.reset_index(), pp.reset_index()], axis=1)
tweets_pp.head(2)

Unnamed: 0,index,created_at,handle,mined_at,retweet_count,text,tweet_id,index.1,dril_pp,laziestcanine_pp,ch000ch_pp
0,0,Wed May 09 20:33:41 +0000 2018,wint,2018-05-10 20:50:30.039267,2731,gathering data on various of bastards,994314487682555904,0,0.952,0.012,0.036
1,1,Tue May 08 19:39:35 +0000 2018,wint,2018-05-10 20:50:30.039281,631,https://t.co/rbM8Cj6jkM,993938484556640256,1,0.475776,0.466093,0.058131


In [47]:
print('Most dril:', tweets_pp[tweets_pp.handle == 'wint'].sort_values(
    'dril_pp', ascending=False).text.values[0])
print('Least dril:', tweets_pp[tweets_pp.handle == 'wint'].sort_values(
    'dril_pp', ascending=True).text.values[0])

Most dril: @formida_poupon shut hte fuck up
Least dril: @StaggMack  Puerto Rican Pisser


In [48]:
print('Most LaziestCanine:', tweets_pp[tweets_pp.handle == 'Lazy dog'].sort_values(
    'laziestcanine_pp', ascending=False).text.values[0])
print('Least LaziestCanine:', tweets_pp[tweets_pp.handle == 'Lazy dog'].sort_values(
    'laziestcanine_pp', ascending=True).text.values[0])

Most LaziestCanine: you vs. the guy she tells you not to worry about https://t.co/yhs18lGqvJ
Least LaziestCanine: RT @CouRageJD: https://t.co/CVvPR4PF1n


In [49]:
print('Most chuuch:', tweets_pp[tweets_pp.handle == 'chuuch'].sort_values(
    'ch000ch_pp', ascending=False).text.values[0])
print('Least chuuch:', tweets_pp[tweets_pp.handle == 'chuuch'].sort_values(
    'ch000ch_pp', ascending=True).text.values[0])

Most chuuch: @BuckyIsotope damnit
Least chuuch: https://t.co/HJsaeyaRCd
