In [1]:
"""
N.B. This notebook uses 'got' package which only works with Python2.7
got: https://github.com/Jefferson-Henrique/GetOldTweets-python
The tweet2df function takes a Twitter user name and the number of tweets to retrieve. It will:

1. process an initial clean up
2. use VaderSentiment to measure a sentiment score (1 being very positive
   and -1 being very negative) for each tweet
3. save all information to an Excel (.xlsx) file in the data folder
"""

import got
import pandas as pd
import numpy as np
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time
from datetime import datetime


PATH = '../data/'
def tweet2df(username='barackobama', max_tweets=100):
    """Download a user's tweets, score their sentiment and save them to Excel.

    Parameters
    ----------
    username : str
        Twitter user name handed to ``got``'s ``TweetCriteria.setUsername``
        (e.g. ``'realDonaldTrump'``).
    max_tweets : int
        Limit handed to ``TweetCriteria.setMaxTweets``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``text``, ``date``, ``favorites``,
        ``retweets``, ``hashtags``, ``mentions``, ``permalink``, ``id`` and
        ``score`` (the VADER ``compound`` polarity of ``text``).

    Side effects
    ------------
    Prints progress/timing messages and writes the frame to
    ``PATH + '<username>_<YYYYMMDD>_<max_tweets>tweets.xlsx'``.
    """
    started = time.time()

    print('getting tweets...')
    criteria = got.manager.TweetCriteria().setUsername(username).setMaxTweets(max_tweets)
    tweets = got.manager.TweetManager.getTweets(criteria)
    # Format the message explicitly: print('label', value) shows a tuple on
    # Python 2.7, which is the interpreter this notebook targets.
    print('Time to crawl tweets: %s' % round(time.time() - started, 1))

    columns = ['text', 'date', 'favorites', 'retweets', 'hashtags', 'mentions', 'permalink', 'id']

    # Build the frame in one step. ``columns=`` pins the column order (plain
    # dicts are unordered on Python 2.7) and keeps every column present even
    # when no tweets were returned.
    df = pd.DataFrame(
        {col: [getattr(tw, col) for tw in tweets] for col in columns},
        columns=columns,
    )

    analyzer = SentimentIntensityAnalyzer()
    df['score'] = [analyzer.polarity_scores(text)['compound'] for text in df['text']]

    # clean up: drop the '@' / '#' markers and turn the space-separated lists
    # into comma-separated strings.
    for col, marker in (('mentions', '@'), ('hashtags', '#')):
        df[col] = df[col].apply(
            lambda tags, marker=marker: tags.replace(marker, '').replace(' ', ',')
        ).astype(str)

    today = datetime.now().strftime('%Y%m%d')
    filename = '%s_%s_%stweets.xlsx' % (username, today, max_tweets)
    df.to_excel(PATH + filename, index=False)
    print('Time to finish the whole process: %s' % round(time.time() - started, 1))
    return df

In [2]:
# Request up to 100 tweets for this account; tweet2df also scores and saves them.
df = tweet2df(username='realDonaldTrump', max_tweets=100)

getting tweets...
('Time to crawl tweets:', 2.0)
('Time to finsh the whole process:', 2.3)


In [3]:
# Show only the tweets whose text contains the (case-sensitive) substring 'China'.
df.loc[df['text'].str.contains('China')]

Unnamed: 0,text,date,favorites,retweets,hashtags,mentions,permalink,id,score
16,Talking trade with the Vice Premier of the Peo...,2018-05-17 14:27:52,39818,7668,,,https://twitter.com/realDonaldTrump/status/997...,997227223638790144,0.0
27,...haven’t even started yet! The U.S. has very...,2018-05-16 06:09:20,45808,9417,,,https://twitter.com/realDonaldTrump/status/996...,996739376028835840,0.0
28,"...We have not seen China’s demands yet, which...",2018-05-16 06:09:20,42263,8840,,,https://twitter.com/realDonaldTrump/status/996...,996739374619426816,0.4588
29,The Washington Post and CNN have typically wri...,2018-05-16 06:09:19,56864,13257,,,https://twitter.com/realDonaldTrump/status/996...,996739372723638272,-0.3818
37,Trade negotiations are continuing with China. ...,2018-05-15 05:35:30,76867,14832,,,https://twitter.com/realDonaldTrump/status/996...,996368474556583937,0.0
41,"ZTE, the large Chinese phone company, buys a b...",2018-05-14 13:06:53,62206,12461,,,https://twitter.com/realDonaldTrump/status/996...,996119678551552000,0.0
47,China and the United States are working well t...,2018-05-13 12:22:03,100531,19948,,,https://twitter.com/realDonaldTrump/status/995...,995746011321597953,0.901
48,"President Xi of China, and I, are working toge...",2018-05-13 08:01:00,83541,17398,,,https://twitter.com/realDonaldTrump/status/995...,995680316458262533,-0.3802
73,"I will be speaking to my friend, President Xi ...",2018-05-08 04:22:53,86574,17046,,,https://twitter.com/realDonaldTrump/status/993...,993813485745295360,0.8555
87,Our high level delegation is on the way back f...,2018-05-04 17:31:30,81025,17529,,,https://twitter.com/realDonaldTrump/status/992...,992562394051293186,0.6948


In [4]:
# Median compound score over (at most) the first 3000 rows.
# NOTE(review): the call above requests only 100 tweets, so the 3000 cap is
# presumably a leftover from a larger run — confirm.
df['score'].iloc[:3000].median()

0.45445

In [5]:
def print_sentiment_scores(sentence, analyzer=None):
    """Print a sentence followed by its VADER polarity scores.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text)``. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created for the call.

    Returns
    -------
    None
        The line ``<sentence padded with '-' to 40 chars> <scores>`` is
        printed to stdout.
    """
    # Build the analyzer on demand instead of reading a notebook-global
    # ``analyzer``: the only one in view is a local inside tweet2df, so the
    # global lookup raises NameError on a fresh kernel.
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
    snt = analyzer.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(snt)))

In [6]:
# Display the full, untruncated text of the tweet at index label 11.
df['text'].loc[11]

u'Fake News Media had me calling Immigrants, or Illegal Immigrants, \u201cAnimals.\u201d Wrong! They were begrudgingly forced to withdraw their stories. I referred to MS 13 Gang Members as \u201cAnimals,\u201d a big difference - and so true. Fake News got it purposely wrong, as usual!'