In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


import warnings
warnings.filterwarnings('ignore')

#! pip install transformers==2.4.1
#! pip install flair

## Read Data

In [2]:
df = pd.read_csv('../Data/twitter.csv')

df.drop(df.columns[0], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,ID,title,post_link,post_date_time,post_content,user_name,user_bio,user_followers_count,user_followee_count,user_post_count,country,state,language
0,1,beauty and the beast,http://twitter.com/isabellabarila/status/87564...,2017-06-16T09:21:34.000+0000,After watching the new Beauty and the Beast it...,isabellabarila,love me some big ass gold hoops,61,200,1118,AUS,South Australia,en
1,1,beauty and the beast,http://twitter.com/itrapful/status/87588081932...,2017-06-17T01:00:35.000+0000,RT @Moviepicts Beauty and the Beast (1991)/Bea...,itrapful,Aussie | your girl probably follows me | #7 ⚽️...,39222,3230,2163,AUS,Victoria,en
2,1,beauty and the beast,http://twitter.com/rulesofcaylen/status/875873...,2017-06-17T00:30:29.000+0000,does anyone know where i can watch beauty and ...,rulesofcaylen,someone write me a bio,5317,1480,89497,AUS,Queensland,en
3,1,beauty and the beast,http://twitter.com/ajsalvatore02/status/875868...,2017-06-17T00:12:43.000+0000,RT @beourguest Watch #BeautyAndTheBeast for ev...,ajsalvatore02,"Teen Wolf, The 100, Shadow hunters,The Vampire...",554,599,5577,AUS,Victoria,en
4,1,beauty and the beast,http://twitter.com/FilmMomatic/status/87590622...,2017-06-17T02:41:32.000+0000,@RealJDDuran The live-action director's cut of...,FilmMomatic,"Certified @RottenTomatoes film critic, yet the...",555,1686,11685,AUS,New South Wales,en


In [3]:
df['title'] = np.where(df['title'] == 'beauty and the beast', 'Beauty and the Beast', df['title'])
df['title'] = np.where(df['title'] == 'Predator', 'The Predator', df['title'])

## Explore Data 

### Tweets Per Movie
More popular movies have more tweet mentions. Exceptions include Starwards and Avengers which are also popular movies but somehow not many tweets.

Total of 25 movie tweets included in dataset

In [4]:
# Check how many tweets for each title
print('Breakdown of Tweet Count by Movie')
print(df['title'].value_counts())

print()
print('Total number of movie titles in dataset is: ' + str(len(df.title.unique())))

Breakdown of Tweet Count by Movie
Black Panther                        14009
Wonder Woman                         10506
Bright                               10410
Dunkirk                               6996
Beauty and the Beast                  4405
The Predator                          4275
Avengers Infinity War                 4236
Bird Box                              4010
The Greatest Showman                  3965
Occupation                            2579
Lion                                  2369
Coco                                  2174
Star Wars: The Last Jedi              2048
Incredibles 2                         1822
Polar                                 1673
Jurassic World                        1618
The Grinch                            1287
The Meg                                610
To All the Boys I�ve Loved Before      563
Overlord                               389
Night School                            98
Sweet Country                           78
Strange Colours     

### Tweets Per Language
Given that 96% of tweets are english, we can remove non english tweets. Can consider doing translation, but nope not doing it here. 


In [5]:
print("Total number of records in twitter data is: " + str(len(df)))

en_count = len(df.loc[df['language']=='en'])
perc = en_count/len(df)*100
print("Percentage of tweets in English is: " + str(perc))

df = df.loc[df['language'] == 'en']
print("Total number of records in English is: " + str(len(df)))


Total number of records in twitter data is: 80184
Percentage of tweets in English is: 96.00668462536167
Total number of records in English is: 76982


### Tweets Per Country and Region

All tweets from Australia, but from varying states.

In [6]:
#All tweets from AUS
print('Breakdown by Country')
print(df['country'].value_counts())
print()
print('Breakdown by States')
print(df['state'].value_counts())

Breakdown by Country
AUS    76982
Name: country, dtype: int64

Breakdown by States
New South Wales                 21341
Victoria                        18176
Queensland                       8954
Western Australia                4887
South Australia                  4317
Australian Capital Territory      730
Tasmania                          647
Northern Territory                212
Name: state, dtype: int64


## Preprocess Date

1. Extract Date, create new column df['post_date']
2. Get year and week number, to compare against box of performance. Create new column df['year_week']

In [7]:
df['post_date']= df['post_date_time'].astype(str).str[:10]
df.head()

Unnamed: 0,ID,title,post_link,post_date_time,post_content,user_name,user_bio,user_followers_count,user_followee_count,user_post_count,country,state,language,post_date
0,1,Beauty and the Beast,http://twitter.com/isabellabarila/status/87564...,2017-06-16T09:21:34.000+0000,After watching the new Beauty and the Beast it...,isabellabarila,love me some big ass gold hoops,61,200,1118,AUS,South Australia,en,2017-06-16
1,1,Beauty and the Beast,http://twitter.com/itrapful/status/87588081932...,2017-06-17T01:00:35.000+0000,RT @Moviepicts Beauty and the Beast (1991)/Bea...,itrapful,Aussie | your girl probably follows me | #7 ⚽️...,39222,3230,2163,AUS,Victoria,en,2017-06-17
2,1,Beauty and the Beast,http://twitter.com/rulesofcaylen/status/875873...,2017-06-17T00:30:29.000+0000,does anyone know where i can watch beauty and ...,rulesofcaylen,someone write me a bio,5317,1480,89497,AUS,Queensland,en,2017-06-17
3,1,Beauty and the Beast,http://twitter.com/ajsalvatore02/status/875868...,2017-06-17T00:12:43.000+0000,RT @beourguest Watch #BeautyAndTheBeast for ev...,ajsalvatore02,"Teen Wolf, The 100, Shadow hunters,The Vampire...",554,599,5577,AUS,Victoria,en,2017-06-17
4,1,Beauty and the Beast,http://twitter.com/FilmMomatic/status/87590622...,2017-06-17T02:41:32.000+0000,@RealJDDuran The live-action director's cut of...,FilmMomatic,"Certified @RottenTomatoes film critic, yet the...",555,1686,11685,AUS,New South Wales,en,2017-06-17


In [8]:
df['week_num'] = df['post_date'].apply(lambda x: str(datetime.fromisoformat(x).isocalendar()[1])) 
df['week_num'] = df['week_num'].apply(lambda x: '0'+str(x) if len(x)==1 else str(x)) 
df['year_week'] = df['post_date'].astype(str).str[:4] + "_" + df['week_num']
df.head()

Unnamed: 0,ID,title,post_link,post_date_time,post_content,user_name,user_bio,user_followers_count,user_followee_count,user_post_count,country,state,language,post_date,week_num,year_week
0,1,Beauty and the Beast,http://twitter.com/isabellabarila/status/87564...,2017-06-16T09:21:34.000+0000,After watching the new Beauty and the Beast it...,isabellabarila,love me some big ass gold hoops,61,200,1118,AUS,South Australia,en,2017-06-16,24,2017_24
1,1,Beauty and the Beast,http://twitter.com/itrapful/status/87588081932...,2017-06-17T01:00:35.000+0000,RT @Moviepicts Beauty and the Beast (1991)/Bea...,itrapful,Aussie | your girl probably follows me | #7 ⚽️...,39222,3230,2163,AUS,Victoria,en,2017-06-17,24,2017_24
2,1,Beauty and the Beast,http://twitter.com/rulesofcaylen/status/875873...,2017-06-17T00:30:29.000+0000,does anyone know where i can watch beauty and ...,rulesofcaylen,someone write me a bio,5317,1480,89497,AUS,Queensland,en,2017-06-17,24,2017_24
3,1,Beauty and the Beast,http://twitter.com/ajsalvatore02/status/875868...,2017-06-17T00:12:43.000+0000,RT @beourguest Watch #BeautyAndTheBeast for ev...,ajsalvatore02,"Teen Wolf, The 100, Shadow hunters,The Vampire...",554,599,5577,AUS,Victoria,en,2017-06-17,24,2017_24
4,1,Beauty and the Beast,http://twitter.com/FilmMomatic/status/87590622...,2017-06-17T02:41:32.000+0000,@RealJDDuran The live-action director's cut of...,FilmMomatic,"Certified @RottenTomatoes film critic, yet the...",555,1686,11685,AUS,New South Wales,en,2017-06-17,24,2017_24


## Preprocess Content

1. Remove URLS, tokens that start with 'https:'
2. Remove user mentionds, @
3. Did not remove hastags, anyways most of them would be neutral, may not have effect on sentiment score

In [9]:
#Do not use Iterrows, takes too long

df['content_clean'] = ''
f = lambda x: " ".join(filter(lambda x:x[:5]!='https' and x[0]!='@', x.split()))
df['content_clean'] = df['post_content'].apply(f)
df.head()


Unnamed: 0,ID,title,post_link,post_date_time,post_content,user_name,user_bio,user_followers_count,user_followee_count,user_post_count,country,state,language,post_date,week_num,year_week,content_clean
0,1,Beauty and the Beast,http://twitter.com/isabellabarila/status/87564...,2017-06-16T09:21:34.000+0000,After watching the new Beauty and the Beast it...,isabellabarila,love me some big ass gold hoops,61,200,1118,AUS,South Australia,en,2017-06-16,24,2017_24,After watching the new Beauty and the Beast it...
1,1,Beauty and the Beast,http://twitter.com/itrapful/status/87588081932...,2017-06-17T01:00:35.000+0000,RT @Moviepicts Beauty and the Beast (1991)/Bea...,itrapful,Aussie | your girl probably follows me | #7 ⚽️...,39222,3230,2163,AUS,Victoria,en,2017-06-17,24,2017_24,RT Beauty and the Beast (1991)/Beauty and the ...
2,1,Beauty and the Beast,http://twitter.com/rulesofcaylen/status/875873...,2017-06-17T00:30:29.000+0000,does anyone know where i can watch beauty and ...,rulesofcaylen,someone write me a bio,5317,1480,89497,AUS,Queensland,en,2017-06-17,24,2017_24,does anyone know where i can watch beauty and ...
3,1,Beauty and the Beast,http://twitter.com/ajsalvatore02/status/875868...,2017-06-17T00:12:43.000+0000,RT @beourguest Watch #BeautyAndTheBeast for ev...,ajsalvatore02,"Teen Wolf, The 100, Shadow hunters,The Vampire...",554,599,5577,AUS,Victoria,en,2017-06-17,24,2017_24,RT Watch #BeautyAndTheBeast for evermore. The ...
4,1,Beauty and the Beast,http://twitter.com/FilmMomatic/status/87590622...,2017-06-17T02:41:32.000+0000,@RealJDDuran The live-action director's cut of...,FilmMomatic,"Certified @RottenTomatoes film critic, yet the...",555,1686,11685,AUS,New South Wales,en,2017-06-17,24,2017_24,The live-action director's cut of BEAUTY AND T...


## Get Sentiment Scores using Pre-trained Model, VADER 

VADER: (Valence Aware Dictionary and sEntiment Reasoner) 

#### Preprocessing Steps
- Remove websites urls, 
- Do not remove emojis, or change to lower case. (VADER takes into consideration)
- Removing movie titles from tweets

#### Notes on VADER

- Takes into consideration Punctuation, Capitalization, Degree Modifiers, Emojis etc... 
- Referenced from https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f

In [10]:
analyser = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(score)))
    
#sentiment_analyzer_scores('RT')
#analyser.polarity_scores('Star Wars')

In [11]:
#Get compoound score
df['polarity_score'] = ''
f = lambda x:analyser.polarity_scores(x)['compound']
df['polarity_score'] = df['content_clean'].apply(f)


In [12]:
df.groupby('title', as_index=False)['polarity_score'].mean().head()

Unnamed: 0,title,polarity_score
0,Avengers Infinity War,-0.081974
1,Beauty and the Beast,0.514472
2,Bird Box,0.086896
3,Black Panther,0.247773
4,Bright,0.503141


In [13]:
print("polarity score of starwars: " + str(analyser.polarity_scores('Star Wars')['compound']))
print("beauty and the beast: " + str(analyser.polarity_scores('Beauty and the beast')['compound']))

polarity score of starwars: -0.5574
beauty and the beast: 0.5859


## Remove titles from tweets

From above we observe that varying movie titles itself have varying polarity score. 'Star wars' has a negative polarity number, possibly due to 'wars' while 'Beauty and the beast' has a positive score of 0.5689, possibly due to 'beauty'. Given that most tweets are likely to contain the title of the movie, it will be better to remove titles from the tweets.  

In [14]:
# Remove Titles from Tweets

df['content_notitle'] = ''

def f(title, content):
    banned_lst = [i.lower() for i in title.split()]
    new_content =  " ".join(filter(lambda x:x.casefold() not in banned_lst, content.split()))
    return new_content
    
df['content_notitle'] = df.apply(lambda x: f(x.title, x.content_clean), axis=1)

In [15]:
#Get compound score for new tweets without titles

df['polarity_score_notitle'] = ''
f = lambda x:analyser.polarity_scores(x)['compound']
df['polarity_score_notitle'] = df['content_notitle'].apply(f)
#df.head()

In [16]:
df.groupby('title', as_index=False)['polarity_score', 'polarity_score_notitle'].mean().head(20)

Unnamed: 0,title,polarity_score,polarity_score_notitle
0,Avengers Infinity War,-0.081974,0.151043
1,Beauty and the Beast,0.514472,0.312795
2,Bird Box,0.086896,0.087328
3,Black Panther,0.247773,0.247699
4,Bright,0.503141,0.306908
5,Coco,0.239043,0.239007
6,Dunkirk,0.215069,0.21501
7,Incredibles 2,0.183971,0.183416
8,Jurassic World,0.111239,0.111278
9,Ladies in Black,0.23842,0.23842


Observe that polarity score changed for these movies 'Avengers Infinity War', 'Incredibles 2' 'Star Wars: The Last Jedi', etc as their titles itself are not neutral. 

In [17]:
df['index'] = df.index
df.to_csv('../Data/new_tweets.csv')

## End