## Import Libraries

In [55]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/boula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/boula/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/boula/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Collection

The dataset comes from CrowdFlower via data.world (IMDB Sentiment Sampled) at the following link:

https://data.world/robbertb/imdb-sentiment-sampled

In [147]:
df = pd.read_csv('imdb_sentiment.csv')

In [148]:
df.head()

Unnamed: 0,review
0,Protocol is an implausible movie whose only sa...
1,"I just watched The Dresser this evening, havin..."
2,"Besides being boring, the scenes were oppressi..."
3,I'm not sure why the producers needed to trade...
4,Honestly - this short film sucks. the dummy us...


## Preprocessing

We start by preparing our data by cleaning and tokenizing the articles text

In [149]:
df['review'] = df['review'].apply(lambda txt: txt.lower())

In [150]:
stop_words=stopwords.words('english')
df['review'] = df['review'].apply(lambda txt: ' '.join([word for word in txt.split() if word not in stop_words]))

In [151]:
df['review'] = df['review'].apply(lambda txt: sent_tokenize(txt))

In [153]:
df['review'] = df['review'].apply(lambda txt: ' '.join(txt))

## Polarity Score

In [155]:
sid = SentimentIntensityAnalyzer()

In [156]:
df['score'] = df['review'].apply(lambda txt: sid.polarity_scores(txt))

In [160]:
df['score'][0]

{'neg': 0.0, 'neu': 0.653, 'pos': 0.347, 'compound': 0.9565}

In [162]:
df.head()

Unnamed: 0,review,score
0,protocol implausible movie whose saving grace ...,"{'neg': 0.0, 'neu': 0.653, 'pos': 0.347, 'comp..."
1,"watched dresser evening, seen before, dozen ye...","{'neg': 0.09, 'neu': 0.691, 'pos': 0.219, 'com..."
2,"besides boring, scenes oppressive dark. movie ...","{'neg': 0.164, 'neu': 0.654, 'pos': 0.182, 'co..."
3,i'm sure producers needed trade name somewhat ...,"{'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'comp..."
4,honestly - short film sucks. dummy used necro ...,"{'neg': 0.106, 'neu': 0.608, 'pos': 0.286, 'co..."


In [163]:
df['negative'] = df['score'].apply(lambda txt: txt['neg'])

In [164]:
df['neutral'] = df['score'].apply(lambda txt: txt['neu'])

In [165]:
df['positive'] = df['score'].apply(lambda txt: txt['pos'])

In [166]:
df['compound'] = df['score'].apply(lambda txt: txt['compound'])

In [167]:
df.head()

Unnamed: 0,review,score,negative,neutral,positive,compound
0,protocol implausible movie whose saving grace ...,"{'neg': 0.0, 'neu': 0.653, 'pos': 0.347, 'comp...",0.0,0.653,0.347,0.9565
1,"watched dresser evening, seen before, dozen ye...","{'neg': 0.09, 'neu': 0.691, 'pos': 0.219, 'com...",0.09,0.691,0.219,0.9895
2,"besides boring, scenes oppressive dark. movie ...","{'neg': 0.164, 'neu': 0.654, 'pos': 0.182, 'co...",0.164,0.654,0.182,0.1431
3,i'm sure producers needed trade name somewhat ...,"{'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'comp...",0.0,0.808,0.192,0.969
4,honestly - short film sucks. dummy used necro ...,"{'neg': 0.106, 'neu': 0.608, 'pos': 0.286, 'co...",0.106,0.608,0.286,0.9834


In [169]:
def polarity_score(compound):
    if compound > 0.05:
        return "positive"
    elif compound < -0.5:
        return "negative"
    elif compound >= -0.05 and compound < 0.05:
        return "neutral"

In [170]:
df['sentiment'] = df['compound'].apply(lambda val: polarity_score(val))

In [172]:
df.head()

Unnamed: 0,review,score,negative,neutral,positive,compound,sentiment
0,protocol implausible movie whose saving grace ...,"{'neg': 0.0, 'neu': 0.653, 'pos': 0.347, 'comp...",0.0,0.653,0.347,0.9565,positive
1,"watched dresser evening, seen before, dozen ye...","{'neg': 0.09, 'neu': 0.691, 'pos': 0.219, 'com...",0.09,0.691,0.219,0.9895,positive
2,"besides boring, scenes oppressive dark. movie ...","{'neg': 0.164, 'neu': 0.654, 'pos': 0.182, 'co...",0.164,0.654,0.182,0.1431,positive
3,i'm sure producers needed trade name somewhat ...,"{'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'comp...",0.0,0.808,0.192,0.969,positive
4,honestly - short film sucks. dummy used necro ...,"{'neg': 0.106, 'neu': 0.608, 'pos': 0.286, 'co...",0.106,0.608,0.286,0.9834,positive


In [177]:
df['sentiment'].value_counts()

positive    694
negative    258
neutral       6
Name: sentiment, dtype: int64

## Conclusion

VADER classifies the sentiments very well. It is easy to use, the ready-made model which can be used across multiple domains, social-media texts.