<a href="https://colab.research.google.com/github/aarora79/covid-analytics/blob/master/tweets_sentiment_anaysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install nltk==3.3

Collecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 2.9MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-cp36-none-any.whl size=1394471 sha256=0d75b487058e684fe34d101b89d9db202f57d00d6405702d29893f4e6bc33ed0
  Stored in directory: /root/.cache/pip/wheels/d1/ab/40/3bceea46922767e42986aef7606a600538ca80de6062dc266c
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.3


In [20]:
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random

In [5]:
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [6]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token


In [7]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [13]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
print(f"number of positive tweets={len(positive_tweets)}")

negative_tweets = twitter_samples.strings('negative_tweets.json')
print(f"number of negative tweets={len(negative_tweets)}")

text = twitter_samples.strings('tweets.20150430-223406.json')
print(f"number of random tweet samples={len(text)}")

tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

number of positive tweets=5000
number of negative tweets=5000
number of random tweet samples=20000


In [18]:
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [21]:
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive")
                      for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                      for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))


[]
Accuracy is: 0.9973333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2070.9 : 1.0
                      :) = True           Positi : Negati =   1655.6 : 1.0
                     sad = True           Negati : Positi =     23.9 : 1.0
                follower = True           Positi : Negati =     21.3 : 1.0
                     bam = True           Positi : Negati =     20.9 : 1.0
                     x15 = True           Negati : Positi =     14.4 : 1.0
                 welcome = True           Positi : Negati =     14.1 : 1.0
                  arrive = True           Positi : Negati =     13.5 : 1.0
               community = True           Positi : Negati =     13.0 : 1.0
                    glad = True           Positi : Negati =     13.0 : 1.0
None
I ordered just once from TerribleCo, they screwed up, never used the app again. Negative


In [26]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

df = pd.read_csv("tt_clean.csv")
df["text"].head()

  


0    https://t.co/yFasyAw1hR                                                                                                                    
1    Saddened to hear the news of civil rights hero John Lewis passing. Melania and I send our prayers to he and his family.                    
2    RT @realDonaldTrump: Corrupt Joe Biden wants to defund our police. He may use different words but when you look at his pact with Crazy Ber…
3    RT @realDonaldTrump: I am the ultimate member of The Book of the Month Club. First I have lowlife dummy John Bolton a war mongering fool…  
4    RT @realDonaldTrump: ....will all end up going to the government anyway. Next up is Mary Trump a seldom seen niece who knows little about… 
Name: text, dtype: object

In [28]:
df['text'][1]

'Saddened to hear the news of civil rights hero John Lewis passing. Melania and I send our prayers to he and his family.'

In [33]:
custom_tweet = df['text'][40]
custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(custom_tweet, classifier.classify(dict([token, True] for token in custom_tokens)))

RT @TimMurtaugh: 𝐁𝐫𝐚𝐧𝐝𝐨𝐧 𝐉𝐮𝐝𝐝: In Joe Biden’s America would-be immigrants including women &amp; children will be encouraged to put themselve… Negative


In [34]:
def get_tweet_sentiment(tweet):
  tokens = remove_noise(word_tokenize(tweet))
  return classifier.classify(dict([token, True] for token in tokens))


In [35]:
df['sentiment'] = df['text'].map(lambda x: get_tweet_sentiment(x))

In [36]:
df['sentiment'].value_counts()

Positive    29002
Negative    18646
Name: sentiment, dtype: int64

In [47]:
df['sentiment_as_int'] = df['sentiment'].map(lambda x: 1 if x == 'Positive' else -1)

In [48]:
df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,date,sentiment,sentiment_as_int
0,Twitter for iPhone,https://t.co/yFasyAw1hR,2020-07-18 20:30:53,24905.0,66405,False,1.284586e+18,2020-07-18,Positive,1
1,Twitter for iPhone,Saddened to hear the news of civil rights hero John Lewis passing. Melania and I send our prayers to he and his family.,2020-07-18 18:05:35,16165.0,115427,False,1.28455e+18,2020-07-18,Positive,1
2,Twitter for iPhone,RT @realDonaldTrump: Corrupt Joe Biden wants to defund our police. He may use different words but when you look at his pact with Crazy Ber…,2020-07-18 05:08:47,41189.0,0,True,1.284354e+18,2020-07-18,Negative,-1
3,Twitter for iPhone,RT @realDonaldTrump: I am the ultimate member of The Book of the Month Club. First I have lowlife dummy John Bolton a war mongering fool…,2020-07-18 05:08:26,23076.0,0,True,1.284354e+18,2020-07-18,Positive,1
4,Twitter for iPhone,RT @realDonaldTrump: ....will all end up going to the government anyway. Next up is Mary Trump a seldom seen niece who knows little about…,2020-07-18 05:08:22,15487.0,0,True,1.284354e+18,2020-07-18,Positive,1


In [54]:
df_sentiment = df[['date', 'sentiment_as_int']].groupby('date').sum().reset_index()
df_sentiment.head()

Unnamed: 0,date,sentiment_as_int
0,2009-05-04,1
1,2009-05-05,1
2,2009-05-08,2
3,2009-05-12,2
4,2009-05-13,1


In [55]:
!pip install plotly



In [66]:
# draw a timeseries chart
import plotly.express as px


fig = px.line(df_sentiment, x='date', y='sentiment_as_int')
fig.update_layout(yaxis_title="Sentiment",
                  xaxis_title="")
fig.show()

In [67]:
df_sentiment.dtypes

date                object
sentiment_as_int    int64 
dtype: object

In [76]:
from datetime import datetime
df_sentiment_monthly = df_sentiment
df_sentiment_monthly['month'] = pd.to_datetime(df_sentiment_monthly.date).dt.to_period('m')
#df_sentiment_monthly['month'] = df_sentiment_monthly['date'].map(lambda x: datetime.strptime(x, "%Y-%m-%d").month)
df_sentiment_monthly = df_sentiment_monthly[['month', 'sentiment_as_int']].groupby(['month']).sum().reset_index()
df_sentiment_monthly.head()

Unnamed: 0,month,sentiment_as_int
0,2009-05,17
1,2009-06,9
2,2009-07,3
3,2009-08,-1
4,2009-09,1


In [82]:
df_sentiment_monthly['month'] = df_sentiment_monthly['month'].map(lambda x: datetime.strptime(str(x), "%Y-%m"))

df_sentiment_monthly.dtypes

month               datetime64[ns]
sentiment_as_int    int64         
dtype: object

In [83]:
import plotly.express as px


fig = px.line(df_sentiment_monthly, x='month', y='sentiment_as_int')
fig.update_layout(yaxis_title="Sentiment",
                  xaxis_title="")
fig.update_layout(xaxis=dict(tickformat="%Y-%m"))

fig.show()

In [91]:
# plot side by side with approval rating
# https://github.com/fivethirtyeight/data/tree/master/trump-approval-ratings
df_trump_approval_ratings = pd.read_csv("https://projects.fivethirtyeight.com/trump-approval-data/approval_polllist.csv")
df_trump_approval_ratings['createddate'] = df_trump_approval_ratings['createddate'].map(lambda x: datetime.strptime(str(x), "%m/%d/%Y"))

df_trump_approval_ratings.head()

Unnamed: 0,president,subgroup,modeldate,startdate,enddate,pollster,grade,samplesize,population,weight,influence,approve,disapprove,adjusted_approve,adjusted_disapprove,multiversions,tracking,url,poll_id,question_id,createddate,timestamp
0,Donald Trump,All polls,7/21/2020,1/20/2017,1/22/2017,Gallup,B,1500.0,a,0.262323,0.0,45.0,45.0,45.755339,43.574073,,T,http://www.gallup.com/poll/201617/gallup-daily-trump-job-approval.aspx,49253,77265,2017-01-23,14:42:29 21 Jul 2020
1,Donald Trump,All polls,7/21/2020,1/20/2017,1/22/2017,Morning Consult,B/C,1992.0,rv,0.680029,0.0,46.0,37.0,45.138716,37.90304,,,http://static.politico.com/9b/13/82a3baf542ae9018e5b6e1008379/170103-topline-politico-v3-kd.pdf,49249,77261,2017-01-23,14:42:29 21 Jul 2020
2,Donald Trump,All polls,7/21/2020,1/20/2017,1/24/2017,Ipsos,B-,1632.0,a,0.153481,0.0,42.1,45.2,43.118518,43.869973,,T,http://polling.reuters.com/#poll/CP3_2/,49426,77599,2017-03-01,14:42:29 21 Jul 2020
3,Donald Trump,All polls,7/21/2020,1/21/2017,1/23/2017,Gallup,B,1500.0,a,0.242845,0.0,45.0,46.0,45.755339,44.574073,,T,http://www.gallup.com/poll/201617/gallup-daily-trump-job-approval.aspx,49262,77274,2017-01-24,14:42:29 21 Jul 2020
4,Donald Trump,All polls,7/21/2020,1/22/2017,1/24/2017,Gallup,B,1500.0,a,0.22738,0.0,46.0,45.0,46.755339,43.574073,,T,http://www.gallup.com/poll/201617/gallup-daily-trump-job-approval.aspx,49236,77248,2017-01-25,14:42:29 21 Jul 2020


In [95]:
df_trump_approval_ratings_overall = df_trump_approval_ratings[['createddate', 'approve', 'disapprove']].groupby('createddate').mean().reset_index()
df_trump_approval_ratings_overall.head()

Unnamed: 0,createddate,approve,disapprove
0,2017-01-23,45.5,41.0
1,2017-01-24,45.0,46.0
2,2017-01-25,49.0,44.0
3,2017-01-26,45.222222,41.0
4,2017-01-27,50.0,46.5


In [102]:
#pd.pivot_table(df_trump_approval_ratings_overall, index=['createddate']).reset_index()
df_trump_approval_ratings_overall_tidy = pd.melt(df_trump_approval_ratings_overall, id_vars=['createddate'], value_vars=['approve', 'disapprove'])
df_trump_approval_ratings_overall_tidy.head()

Unnamed: 0,createddate,variable,value
0,2017-01-23,approve,45.5
1,2017-01-24,approve,45.0
2,2017-01-25,approve,49.0
3,2017-01-26,approve,45.222222
4,2017-01-27,approve,50.0


In [103]:
import plotly.express as px


fig = px.line(df_trump_approval_ratings_overall_tidy, x='createddate', y='value', color='variable')
fig.update_layout(yaxis_title="Approval Rating %",
                  xaxis_title="")
fig.show()