In [1]:
import pandas as pd

#### Load data

In [4]:
# Read the IMDB reviews CSV (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Check if there are any Null values

In [5]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
# Class balance: number of reviews per sentiment label.
df.loc[:, 'sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Import SentimentIntensityAnalyzer and create a sid object

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [8]:
# SentimentIntensityAnalyzer loads the VADER lexicon from NLTK's data directory
# and raises LookupError if that resource has never been downloaded. Fetch it
# only when it is missing so the notebook also runs on a fresh environment.
import nltk

try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

In [11]:
# Sanity check: score the review in the row labelled 0. The result is a dict
# with 'neg', 'neu', 'pos' and 'compound' entries.
sid.polarity_scores(df.loc[0, 'review'])

{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'compound': -0.9951}

In [16]:
# Pull just the 'compound' value out of the score dict for the same review.
first_review_scores = sid.polarity_scores(df.loc[0, 'review'])
first_review_scores['compound']

-0.9951

### Add "comp_score" as the predicted sentiment label

In [12]:
# Score every review; each entry of 'scores' is the dict returned by
# sid.polarity_scores. The bound method is passed directly, so no lambda
# wrapper is needed.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [13]:
# Preview the frame with the new 'scores' column.
df.head(n=5)

Unnamed: 0,review,sentiment,scores
0,One of the other reviewers has mentioned that ...,positive,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co..."
1,A wonderful little production. <br /><br />The...,positive,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co..."
2,I thought this was a wonderful way to spend ti...,positive,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co..."
3,Basically there's a family where a little boy ...,negative,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co..."


In [23]:
def _label_from_compound(value):
    """Map a compound score to a binary label; a score of exactly 0 counts as positive."""
    if value >= 0:
        return 'positive'
    return 'negative'


# Extract the 'compound' entry from each score dict, then turn it into a label
# that can be compared with the original 'sentiment' column.
df['compound'] = df['scores'].map(lambda polarity: polarity['compound'])
df['comp_score'] = df['compound'].map(_label_from_compound)
df.head()

Unnamed: 0,review,sentiment,scores,compound,comp_score
0,One of the other reviewers has mentioned that ...,positive,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co...",-0.9951,negative
1,A wonderful little production. <br /><br />The...,positive,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co...",0.9641,positive
2,I thought this was a wonderful way to spend ti...,positive,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co...",0.9605,positive
3,Basically there's a family where a little boy ...,negative,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co...",-0.9213,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co...",0.9744,positive


### Compare the original sentiment with comp_score

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
# Fraction of reviews whose predicted label matches the original one.
accuracy_score(y_true=df['sentiment'], y_pred=df['comp_score'])

0.69626

In [26]:
# classification_report returns plain text here, so print it to keep the table layout.
report_text = classification_report(y_true=df['sentiment'], y_pred=df['comp_score'])
print(report_text)

              precision    recall  f1-score   support

    negative       0.79      0.54      0.64     25000
    positive       0.65      0.86      0.74     25000

    accuracy                           0.70     50000
   macro avg       0.72      0.70      0.69     50000
weighted avg       0.72      0.70      0.69     50000



In [27]:
# Rows are the original labels and columns are the predicted labels.
label_counts = confusion_matrix(y_true=df['sentiment'], y_pred=df['comp_score'])
print(label_counts)

[[13410 11590]
 [ 3597 21403]]


## VADER alone does not predict sentiment very accurately: overall accuracy is about 70%, and only 54% of the negative reviews are identified as negative.
### In the next part, we'll try another approach.