In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Import IMDB dataset
df = pd.read_csv("https://raw.githubusercontent.com/meghjoshii/NSDC_DataScienceProjects_SentimentAnalysis/main/IMDB%20Dataset.csv")

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


We can see that we have 50,000 reviews in our dataset. The `sentiment` column has 2 unique values - `positive` and `negative`.

In [4]:
print(df['review'])

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object


In [5]:
# Check if the dataset is balanced
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


We can see that we have 25,000 positive reviews and 25,000 negative reviews in our dataset. They are evenly distributed and we do not have to worry about class imbalance.

# Preprocessing

In [6]:
# Tokenization
nltk.download('punkt')
df['review'] = df['review'].apply(word_tokenize)

# Remove everything that isn't a word
df['review'] = df['review'].apply(lambda x: [item for item in x if item.isalpha()])

# Convert to lowercase
df['review'] = df['review'].apply(lambda x: [item.lower() for item in x])

# Remove stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

df['review'] = df['review'].apply(lambda x: [item for item in x if item not in stop_words])

# Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

df['review'] = df['review'].apply(lambda x: [ps.stem(item) for item in x])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
df

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, oz, episod, hook...",positive
1,"[wonder, littl, product, br, br, film, techniq...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, famili, littl, boy, jake, think, zombi...",negative
4,"[petter, mattei, love, time, money, visual, st...",positive
...,...,...
49995,"[thought, movi, right, good, job, creativ, ori...",positive
49996,"[bad, plot, bad, dialogu, bad, act, idiot, dir...",negative
49997,"[cathol, taught, parochi, elementari, school, ...",negative
49998,"[go, disagre, previou, comment, side, maltin, ...",negative


In [8]:
# Join word lists to go back to text form
df['review'] = df['review'].apply(lambda x: " ".join(x))

In [9]:
df

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,positive
1,wonder littl product br br film techniqu fashi...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic famili littl boy jake think zombi closet...,negative
4,petter mattei love time money visual stun film...,positive
...,...,...
49995,thought movi right good job creativ origin fir...,positive
49996,bad plot bad dialogu bad act idiot direct anno...,negative
49997,cathol taught parochi elementari school nun ta...,negative
49998,go disagre previou comment side maltin one sec...,negative


In [10]:
# Train-test split
trainset, testset = train_test_split(df, test_size=0.20, random_state=42)

train_reviews = trainset.review
train_sentiments = trainset.sentiment

test_reviews = testset.review
test_sentiments = testset.sentiment

In [11]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0.0, max_df=1, binary = False, ngram_range = (1,3))

cv_train_reviews = cv.fit_transform(train_reviews) # transformed train reviews
cv_test_reviews = cv.transform(test_reviews) # transformed test reviews

In [12]:
# Label sentiment data
lb = LabelBinarizer()

lb_train_sentiments = lb.fit_transform(train_sentiments) # transformed train sentiment data
lb_test_sentiments = lb.transform(test_sentiments) # transformed test sentiment data

# Multinomial Naive Bayes classifier

In [13]:
# Train the model
mnb = MultinomialNB()

# Fit the model
mnb = mnb.fit(cv_train_reviews, lb_train_sentiments)

# Generate predictions (bag of words)
mnb_pred = mnb.predict(cv_test_reviews)

# Model Evaluation
mnb_score = accuracy_score(lb_test_sentiments, mnb_pred)
print("Accuracy :", mnb_score)
mnb_rep = classification_report(lb_test_sentiments, mnb_pred,target_names=['Positive','Negative'])
print(mnb_rep)

  y = column_or_1d(y, warn=True)


Accuracy : 0.7467
              precision    recall  f1-score   support

    Positive       0.73      0.77      0.75      4961
    Negative       0.76      0.72      0.74      5039

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



# Linear SVM (With SGD training)

In [16]:
from sklearn.linear_model import SGDClassifier

svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-5, random_state=42, max_iter=5, tol=None)
svm.fit(cv_train_reviews, lb_train_sentiments)

svm_pred = svm.predict(cv_test_reviews)
svm_score = accuracy_score(lb_test_sentiments, svm_pred)
print("Accuracy :", svm_score)

svm_rep = classification_report(lb_test_sentiments, svm_pred,target_names=['Positive','Negative'])
print(svm_rep)

  y = column_or_1d(y, warn=True)


Accuracy : 0.7125
              precision    recall  f1-score   support

    Positive       0.66      0.88      0.75      4961
    Negative       0.82      0.55      0.66      5039

    accuracy                           0.71     10000
   macro avg       0.74      0.71      0.71     10000
weighted avg       0.74      0.71      0.70     10000



# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(penalty='l2', C=1, random_state=42)
logreg.fit(cv_train_reviews, lb_train_sentiments)

logreg_pred = logreg.predict(cv_test_reviews)
logreg_score = accuracy_score(lb_test_sentiments, logreg_pred)
print("Accuracy :", logreg_score)

logreg_rep = classification_report(lb_test_sentiments, logreg_pred,target_names=['Positive','Negative'])
print(logreg_rep)

  y = column_or_1d(y, warn=True)


Accuracy : 0.6101
              precision    recall  f1-score   support

    Positive       0.56      0.97      0.71      4961
    Negative       0.90      0.25      0.40      5039

    accuracy                           0.61     10000
   macro avg       0.73      0.61      0.55     10000
weighted avg       0.73      0.61      0.55     10000



# Logistic Regression (With SGD training)

In [17]:
logreg_sgd = SGDClassifier(loss='log_loss', penalty='l2',alpha=1e-5, random_state=42, max_iter=5, tol=None)
logreg_sgd.fit(cv_train_reviews, lb_train_sentiments)

logreg_sgd_pred = logreg_sgd.predict(cv_test_reviews)
logreg_sgd_score = accuracy_score(lb_test_sentiments, logreg_sgd_pred)
print("Accuracy :", logreg_sgd_score)

logreg_sgd_rep = classification_report(lb_test_sentiments, logreg_sgd_pred,target_names=['Positive','Negative'])
print(logreg_sgd_rep)

  y = column_or_1d(y, warn=True)


Accuracy : 0.6909
              precision    recall  f1-score   support

    Positive       0.63      0.91      0.74      4961
    Negative       0.84      0.48      0.61      5039

    accuracy                           0.69     10000
   macro avg       0.74      0.69      0.68     10000
weighted avg       0.74      0.69      0.68     10000



# Conclusion

We can observe that multinomial naive bayes and linear SVM perform better that logistic regression does, with or without SGD training.