## Import libraries

In [1]:
!pip install tweepy openpyxl

Collecting tweepy
  Downloading https://files.pythonhosted.org/packages/bb/7c/99d51f80f3b77b107ebae2634108717362c059a41384a1810d13e2429a81/tweepy-3.9.0-py2.py3-none-any.whl
Installing collected packages: tweepy
Successfully installed tweepy-3.9.0


In [2]:
import numpy as np
import pandas as pd
import tweepy

## Load data

We will use the [Twitter API](https://developer.twitter.com/en/docs/twitter-ads-api/creatives/api-reference/tweets)

* Create a Twitter app
    * Go to [Twitter Developer](https://developer.twitter.com/en/dashboard)
    * Go to "Apps" (upper right) > Create an app

<!-- -->

* Create .credentials.py and put your credentials in it:

    ```
    API_KEY      = ""
    API_SECRET   = ""
    ACCESS_TOKEN = ""
    ACCESS_TOKEN_SECRET = ""
    ```

In [3]:
with open('.credentials.py') as f:
    exec(f.read())

print(API_KEY)

ryi6WEqp7tfCU9jKs0aevf1AP


In [5]:
# Authenticate
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = tweepy.API(auth)
api.me().name

'A-mt'

In [9]:
# Get last tweets about covid
tweets = api.search(
    'covid',
    count=5,
    lang='en',
    exclude='retweets',
    tweet_mode='extended')  # get the full (untruncated) text

In [11]:
print(tweets[0]._json)

{'created_at': 'Thu Oct 29 12:46:23 +0000 2020', 'id': 1321795516355743751, 'id_str': '1321795516355743751', 'full_text': "@giorgishka The universities have asked students to contact them if they have covid. They have asked students to follow guidelines and isolate if required. Its up to the students to do their part. Universities aren't responsible for the test and trace system.", 'truncated': False, 'display_text_range': [12, 259], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'giorgishka', 'name': 'giorgia aiello', 'id': 211443137, 'id_str': '211443137', 'indices': [0, 11]}], 'urls': []}, 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'}, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'in_reply_to_status_id': 1321788455626723329, 'in_reply_to_status_id_str': '1321788455626723329', 'in_reply_to_user_id': 211443137, 'in_reply_to_user_id_str': '211443137', 'in_reply_to_screen_name': 'giorgishka', 'user

In [15]:
# Get last 1000 tweets about covid
NB_MAX = 1000

df = pd.DataFrame(columns=[
    'tweet', 'user', 'user_statuses_count', 'user_followers',
    'user_location', 'user_verified', 'fav_count', 'rt_count',
    'tweet_date'
])

# Paginate results with a stride of 100
cursor = tweepy.Cursor(
    api.search,
    count=100,
    lang='en',

    q=['covid'],
    exclude='retweets',
    tweet_mode='extended'
)

i =  0
for tweet in cursor.items(NB_MAX):
    if i % 100 == 0:
        print('\n{:d}/{:d} '.format(i, NB_MAX))
    i += 1
    print('.', end='')

    df.loc[len(df)] = {
        'tweet': tweet.full_text,
        'user' : tweet.user.name,
        'user_statuses_count': tweet.user.statuses_count,
        'user_followers': tweet.user.followers_count,
        'user_location': tweet.user.location,
        'user_verified': tweet.user.verified,
        'fav_count': tweet.favorite_count,
        'rt_count': tweet.retweet_count,
        'tweet_date': tweet.created_at
    }

df.to_csv('covid_tweets.csv')
df.head()


0/1000 
....................................................................................................
100/1000 
....................................................................................................
200/1000 
....................................................................................................
300/1000 
....................................................................................................
400/1000 
....................................................................................................
500/1000 
....................................................................................................
600/1000 
....................................................................................................
700/1000 
....................................................................................................
800/1000 
....................................................................................................
90

Unnamed: 0,tweet,user,user_statuses_count,user_followers,user_location,user_verified,fav_count,rt_count,tweet_date
0,@Craig_Spur @kev_g1 @davspurs We have put a nu...,Covyid1882_UltimateFandemic,1015,191,,False,0,0,2020-10-29 12:53:25
1,@BrownCheong @itvnews It cos he drains the lif...,GhostOfTheCovidBat,3,0,,False,0,0,2020-10-29 12:53:25
2,@realDonaldTrump COVID COVID COVID!\nCOVID COV...,Julie Harrison,13860,782,,False,0,0,2020-10-29 12:53:24
3,@Brad_kemble_84 It’s a spiritual warfare going...,Coach DaniB.,1315,186,,False,0,0,2020-10-29 12:53:24
4,@davidchang Guess he can “look past” kids in c...,Susana Leyva 🎧,6093,95,"Chicago, IL",False,0,0,2020-10-29 12:53:24


In [16]:
df.shape

(1000, 9)

In [3]:
df = pd.read_csv('covid_tweets.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,user,user_statuses_count,user_followers,user_location,user_verified,fav_count,rt_count,tweet_date
0,0,@Craig_Spur @kev_g1 @davspurs We have put a nu...,Covyid1882_UltimateFandemic,1015,191,,False,0,0,2020-10-29 12:53:25
1,1,@BrownCheong @itvnews It cos he drains the lif...,GhostOfTheCovidBat,3,0,,False,0,0,2020-10-29 12:53:25
2,2,@realDonaldTrump COVID COVID COVID!\nCOVID COV...,Julie Harrison,13860,782,,False,0,0,2020-10-29 12:53:24
3,3,@Brad_kemble_84 It’s a spiritual warfare going...,Coach DaniB.,1315,186,,False,0,0,2020-10-29 12:53:24
4,4,@davidchang Guess he can “look past” kids in c...,Susana Leyva 🎧,6093,95,"Chicago, IL",False,0,0,2020-10-29 12:53:24


## Data preprocessing

In [4]:
import re

In [16]:
def cleanup(txt):
    txt = re.sub('@\w+',' ', txt)             # remove @mentions
    txt = re.sub('\w+:\/\/\S+', ' ', txt)      # remove link://...
    txt = re.sub('[^0-9a-zA-Z \t]', ' ', txt) # remove punctuation
    return txt.strip()

In [6]:
print(df.loc[0].tweet)

@Craig_Spur @kev_g1 @davspurs We have put a number now on every single head within the Covid world which makes people in the public feel that every loss is personal to them and I do get it but you think people are gonna go out an exercise when we tell them 17m die globally a year through crap lifestyles?


In [7]:
print(cleanup(df.loc[0].tweet))

We have put a number now on every single head within the Covid world which makes people in the public feel that every loss is personal to them and I do get it but you think people are gonna go out an exercise when we tell them 17m die globally a year through crap lifestyles


In [17]:
df['tweet_clean'] = df['tweet'].apply(cleanup)

df[['tweet', 'tweet_clean']].head()

Unnamed: 0,tweet,tweet_clean
0,@Craig_Spur @kev_g1 @davspurs We have put a nu...,We have put a number now on every single head ...
1,@BrownCheong @itvnews It cos he drains the lif...,It cos he drains the life force of 21 year old...
2,@realDonaldTrump COVID COVID COVID!\nCOVID COV...,COVID COVID COVID COVID COVID COVID TrumpV...
3,@Brad_kemble_84 It’s a spiritual warfare going...,It s a spiritual warfare going on look around ...
4,@davidchang Guess he can “look past” kids in c...,Guess he can look past kids in cages a dism...


## Analyse tweets

Textblob will return Sentiment(polarity, subjectivity)

* Polarity: [-1, +1] — where -1 is a negative sentiment and +1 is positive  
* Subjectivity: [0.0, 1.0] — where 0.0 is very objective and 1.0 is very subjective

In [18]:
!pip install textblob

Collecting textblob
[?25l  Downloading https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl (636kB)
[K     |████████████████████████████████| 645kB 1.1MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [9]:
from textblob import TextBlob

In [10]:
def analyze_sentiment(txt):
    analysis = TextBlob(txt)

    return round(analysis.sentiment.polarity, 2)

In [18]:
df['sentiment']  = df['tweet_clean'].apply(analyze_sentiment)
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,user,user_statuses_count,user_followers,user_location,user_verified,fav_count,rt_count,tweet_date,tweet_clean,sentiment
0,0,@Craig_Spur @kev_g1 @davspurs We have put a nu...,Covyid1882_UltimateFandemic,1015,191,,False,0,0,2020-10-29 12:53:25,We have put a number now on every single head ...,-0.17
1,1,@BrownCheong @itvnews It cos he drains the lif...,GhostOfTheCovidBat,3,0,,False,0,0,2020-10-29 12:53:25,It cos he drains the life force of 21 year old...,0.0
2,2,@realDonaldTrump COVID COVID COVID!\nCOVID COV...,Julie Harrison,13860,782,,False,0,0,2020-10-29 12:53:24,COVID COVID COVID COVID COVID COVID TrumpV...,0.0
3,3,@Brad_kemble_84 It’s a spiritual warfare going...,Coach DaniB.,1315,186,,False,0,0,2020-10-29 12:53:24,It s a spiritual warfare going on look around ...,0.16
4,4,@davidchang Guess he can “look past” kids in c...,Susana Leyva 🎧,6093,95,"Chicago, IL",False,0,0,2020-10-29 12:53:24,Guess he can look past kids in cages a dism...,-0.19


In [12]:
print('Positive tweets:', len(df[df['sentiment'] > 0]))
print('Negative tweets:', len(df[df['sentiment'] < 0]))
print('Neutral tweets:',  len(df[df['sentiment'] == 0]))

Positive tweets: 445
Negative tweets: 224
Neutral tweets: 331


In [13]:
# Select one negative row randomly
row = df[df['sentiment'] < -0.5].sample().iloc[0]

print('\nOriginal tweet:\n', row['tweet'])
print('\nCleaned tweet:\n',  row['tweet_clean'])
print('\nSentiment:\n',      row['sentiment'])


Original tweet:
 @Qualifyfor It’s a distraction from his miserable failure with COVID.

Cleaned tweet:
 It s a distraction from his miserable failure with COVID

Sentiment:
 -0.66


In [19]:
# Select one positive row randomly
row = df[df['sentiment'] > 0.5].sample().iloc[0]

print('\nOriginal tweet:\n', row['tweet'])
print('\nCleaned tweet:\n',  row['tweet_clean'])
print('\nSentiment:\n',      row['sentiment'])


Original tweet:
 I'm just saying that the media jumped all over Obama if he had the temerity to mention good jobs numbers while the economy was recovering from the Great Recession. More than 1,000 people died of COVID *yesterday*, Trump has his economic pom poms out &amp; it's just accepted.

Cleaned tweet:
 I m just saying that the media jumped all over Obama if he had the temerity to mention good jobs numbers while the economy was recovering from the Great Recession  More than 1 000 people died of COVID  yesterday   Trump has his economic pom poms out  amp  it s just accepted

Sentiment:
 0.55
