# Donald Trump's tweets sentiment analysis and correlation with approval ratings

## Imports and API initialization

In [1]:
%load_ext lab_black

In [2]:
import tweepy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', palette='husl', font_scale=1.1, rc={"figure.figsize": [12, 8]})
import numpy as np
import datetime, json, logging, os, re
import preprocessor as pre
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import word_tokenize
import spacy, en_core_web_sm
nlp = en_core_web_sm.load()
from collections import Counter
from gensim.summarization import keywords

In [3]:
logging.basicConfig(filename='logfile.log', level=logging.DEBUG, 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

name = 'realDonaldTrump'

# dates range to collect tweets
start_date = datetime.datetime(2018, 12, 18, 0, 0, 0)
end_date = datetime.datetime(2019, 2, 19, 0, 0, 0)

consumer_key = '5NTdgEYyu0ikbduFxjPJconG0'
consumer_secret = 'RhoB2yyWl8L6mS3EmSEzCoMGlPsMX1z3XKQ0j2MXAagHwzU6yU'

oauth_token = '4227898119-lQpkWMTn4mUPxNX9kPpoHbWlRJsmjsAzPwUHyZ8'
oauth_token_secret = 'hp9Ga2IuDfPMGV2j56EfddLmaIrTP7BbgIWcurOXVaVY7'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(oauth_token, oauth_token_secret)

# the are rate limits for the frequency of API calls for twitter, 
# wait_on_rate_limit flag helps us not to worry about it while collecting the data
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [4]:
print(api.me().name)

Andrei


## Tweets collection

Now we collect the Trumps tweets from the specified date range. If already collected, read from file

In [5]:
def collect_tweets(tweets, file):
    """
    collect all tweets for the specified account and time range,
    save results to file
    
    """
    tweets_temp = api.user_timeline(
        screen_name=name,
        count=200,
        include_rts='false',
        tweet_mode='extended')
    for tweet in tweets_temp:
        if tweet.created_at < end_date and tweet.created_at > start_date:
            tweets.append(tweet._json)

    while (tweets_temp[-1].created_at > start_date):
        tweets_temp = api.user_timeline(
            screen_name=name,
            count=200,
            max_id=tweets_temp[-1].id,
            include_rts='false',
            tweet_mode='extended')
        for tweet in tweets_temp:
            if tweet.created_at < end_date and tweet.created_at > start_date:
                tweets.append(tweet._json)

    json.dump(tweets, file, indent=2)

In [6]:
%%time
if os.path.isfile('tweets.json') and not os.stat('tweets.json').st_size == 0:
    logging.info('File tweets.json already exists! Reading the file..')
    file = open('tweets.json', 'r', encoding='utf8')
    tweets = json.load(file)
    file.close()
else:
    logging.info('Creating tweets.json, collecting tweets..')
    file = open('tweets.json', 'w', encoding='utf8')
    tweets = []
    collect_tweets(tweets, file)
    file.close()

CPU times: user 24.3 ms, sys: 6.52 ms, total: 30.9 ms
Wall time: 30.5 ms


Create dataframe to store and process the data 

In [7]:
df = pd.DataFrame(data=[tweet['full_text'] for tweet in tweets], columns=['Text'])
df.head(10)

Unnamed: 0,Text
0,I ask every member of the Maduro regime: End t...
1,The people of Venezuela are standing for FREED...
2,We are here to proclaim that a new day is comi...
3,"Hope you are enjoying your President’s Day, ou..."
4,“This was an illegal coup attempt on the Presi...
5,Great analysis by @foxandfriends!
6,....There is a lot of explaining to do to the ...
7,"Wow, so many lies by now disgraced acting FBI ..."
8,“After two years and interviewing more than tw...
9,William Barr is arriving at a Justice Departme...


Let's see what attributes available for each tweet object

In [8]:
print('id:',tweets[0]['id'])
print('date:',tweets[0]['created_at'])
print('likes:',tweets[0]['favorite_count'])
print('retweets:',tweets[0]['retweet_count'])
print(tweets[0]['entities'])

id: 1097625026801688579
date: Mon Feb 18 22:32:53 +0000 2019
likes: 149000
retweets: 41579
{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': []}


Now let's add interesting attributes to the dataframe

In [9]:
df['len'] = np.array([len(tweet['full_text']) for tweet in tweets])
df['id'] = np.array([tweet['id'] for tweet in tweets])
df['Date'] = np.array([tweet['created_at'] for tweet in tweets])
df['Likes'] = np.array([tweet['favorite_count'] for tweet in tweets])
df['RTs'] = np.array([tweet['retweet_count'] for tweet in tweets])
df.head()

Unnamed: 0,Text,len,id,Date,Likes,RTs
0,I ask every member of the Maduro regime: End t...,268,1097625026801688579,Mon Feb 18 22:32:53 +0000 2019,149000,41579
1,The people of Venezuela are standing for FREED...,130,1097624329154674689,Mon Feb 18 22:30:07 +0000 2019,111651,31140
2,We are here to proclaim that a new day is comi...,217,1097623506580357120,Mon Feb 18 22:26:51 +0000 2019,96212,30255
3,"Hope you are enjoying your President’s Day, ou...",89,1097592331405066242,Mon Feb 18 20:22:58 +0000 2019,155374,24288
4,“This was an illegal coup attempt on the Presi...,110,1097488256848007173,Mon Feb 18 13:29:25 +0000 2019,115008,30164


In [10]:
min(df.Date), max(df.Date)

('Fri Dec 21 03:13:54 +0000 2018', 'Wed Jan 30 21:58:38 +0000 2019')

In [11]:
df.shape

(533, 6)

## Natural Language Processing


### Data preprocessing

Now lets do some preprocessing to prepare it for the entities extraction

In [12]:
# remove @user mentions, # hashtag symbol, URLs, emoji etc.
# check the package documentation for info
# pre.set_options(pre.OPT.URL, pre.OPT.EMOJI, pre.OPT.MENTION, pre.OPT.RESERVED, pre.OPT.SMILEY, pre.OPT.NUMBER)
text_processed = df['Text'].apply(lambda s: pre.clean(s))

# removing all non alpha-numeric symbols
text_processed = text_processed.apply(lambda s: re.sub(r'[^A-Za-z0-9 ]', '', s))

# removing stop words
text_processed = text_processed.apply(lambda s: ' '.join(word for word in s.split() if word not in stop_words))

In [13]:
print('Before:\n' + df['Text'][9] + '\n\nAfter:\n' + text_processed[9])

Before:
William Barr is arriving at a Justice Department that desperately needs an infusion of credibility, writes @KimStrassel https://t.co/naY9XOxb12 via @WSJ

After:
William Barr arriving Justice Department desperately needs infusion credibility writes via


## Entities extraction
Now after cleaning the data we can extract entities using SpaCy and NLTK and add it to the dataframe

To see the transcription for the entities codes check the SpaCy documentation

In [14]:
def get_entities(text):
    """
    Extract counts for each entity found in input text
    """
    return dict(Counter([s.label_ for s in nlp(text).ents]))

In [15]:
%%time
entities = text_processed.apply(lambda s: get_entities(s))
entities = entities.apply(pd.Series)

CPU times: user 8.42 s, sys: 45.3 ms, total: 8.47 s
Wall time: 4.27 s


In [16]:
df = pd.concat([df, entities], axis=1)
df.head()

Unnamed: 0,Text,len,id,Date,Likes,RTs,PERSON,ORG,CARDINAL,GPE,...,NORP,PRODUCT,MONEY,ORDINAL,TIME,LAW,EVENT,WORK_OF_ART,FAC,QUANTITY
0,I ask every member of the Maduro regime: End t...,268,1097625026801688579,Mon Feb 18 22:32:53 +0000 2019,149000,41579,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,The people of Venezuela are standing for FREED...,130,1097624329154674689,Mon Feb 18 22:30:07 +0000 2019,111651,31140,,1.0,,1.0,...,,,,,,,,,,
2,We are here to proclaim that a new day is comi...,217,1097623506580357120,Mon Feb 18 22:26:51 +0000 2019,96212,30255,,,,1.0,...,,,,,,,,,,
3,"Hope you are enjoying your President’s Day, ou...",89,1097592331405066242,Mon Feb 18 20:22:58 +0000 2019,155374,24288,,,,,...,,,,,,,,,,
4,“This was an illegal coup attempt on the Presi...,110,1097488256848007173,Mon Feb 18 13:29:25 +0000 2019,115008,30164,2.0,,,,...,,,,,,,,,,


## Collection of replies/comments

Now as we collected all Trump's tweets for the desired period, we can loop through them and actually collect people's reactions by getting the replies to these tweets

In [None]:
def collect_replies(tweets, file):
    """
    collect replies for all the collected tweets of the specified account,
    query gets 100 replies per page, 10 pages give 1000 replies in total,
    then compares with desired parameters,
    finally saves results to file.
    
    """
    for tweet in tweets:
        for page in tweepy.Cursor(
            api.search,
            # filter tweets by replies to @name and exclude retweets
            q='to:' + name + ' -filter:retweets',
            since_id=tweet['id'],
            count=100,
            # specifiy what type of search results you would prefer to receive. 
            # default is "mixed"
            result_type='mixed',
            tweet_mode='extended').pages(10):
            for status in page:
                if hasattr(status, 'in_reply_to_status_id_str'):
                    logging.info('Found a reply to the tweet with id=',
                                 status.in_reply_to_status_id_str,
                                 'text=' + status.text)
                    replies.append(status._json)
                    logging.info('Reply added to the list. Continue...')
    
    json.dump(replies, file, indent=2)

In [None]:
%%time
if os.path.isfile('replies.json') and not os.stat('replies.json').st_size == 0:
    logging.info('File replies.json already exists! Reading the file..')
    file = open('replies.json', 'r', encoding='utf8')
    replies = json.load(file)
    file.close()
else:
    logging.info('Creating replies.json, collecting tweets..')
    file = open('replies.json', 'w', encoding='utf8')
    replies = []
    collect_replies(tweets, file)
    file.close()

## Sentiment Analysis

Now we'll calculate the sentiment intensity for each of Trump's tweets using VADER which is especially tailored to work with social media texts

In [None]:
analyzer = SentimentIntensityAnalyzer()

sentiment_intensity = df['Text'].apply(lambda s: analyzer.polarity_scores(s))
df = pd.concat([df, sentiment_intensity.apply(pd.Series)], axis=1)
df.head()

Now lets take a look at the distribution of positive/negative tweets

In [None]:
sns.distplot(df['pos'])
plt.xlabel('Positive intensity')
plt.title('Trump\'s positive tweets')

In [None]:
sns.distplot(df['neg'])
plt.xlabel('Negative intensity')
plt.title('Trump\'s negative tweets')

In [None]:
sns.distplot(df['neu'])
plt.xlabel('Neutral intensity')
plt.title('Trump\'s neutral tweets')

In [None]:
sns.distplot(df['compound'])
plt.xlabel('Mixed intensity')
plt.title('Trump\'s mixed tweets')