In [None]:
!pip install numpy pandas matplotlib spacy scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# !pip uninstall torch torchvision torchaudio --y
# !pip install torch torchvision torchaudio

In [None]:

import pickle
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

### Dataset
The dataset used in this notebook is the "IMDB Dataset of 50K Movie Reviews".

https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
# Location of the Kaggle CSV. '/content/...' looks like a Colab working
# directory - for a local run point this at your own copy instead,
# e.g. '/dataset/IMDB Dataset.csv'.
DATA_PATH = '/content/IMDB Dataset.csv'

# Requires `import pandas as pd` in the imports cell at the top of the notebook.
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Report the size of the frame: number of rows, then number of columns.
n_rows, n_cols = dataset.shape
print(f'Rows: {n_rows}\nColumns: {n_cols}')

Rows: 50000
Columns: 2


In [None]:
# List the column labels of the frame as a plain Python list.
column_names = dataset.columns.tolist()
print(f'Columns Names: {column_names}')

Columns Names: ['review', 'sentiment']


### Text Operation

In [None]:

# Module-level resources shared by the `tokenizer` function below.

# Every ASCII punctuation character, as one string.
punctuations = string.punctuation

# spaCy's English stop-word set, materialised as a list.
stopwords = [word for word in STOP_WORDS]

# Pipeline built directly from the English language class (no trained model is loaded).
nlp = English()

In [None]:
def tokenizer(sentence):
    """Split text into lowercase tokens with stop words and punctuation removed.

    The text is run through the module-level spaCy pipeline ``nlp``. Each token
    is reduced to its lemma when one is available, otherwise to its lowercased
    text, and is then discarded if it is empty, a stop word (``stopwords``), or
    made up solely of punctuation characters (``punctuations``).

    Args:
        sentence: The raw text to tokenize.

    Returns:
        A list of cleaned token strings, in document order.
    """
    # Set membership instead of scanning the stop-word list once per token.
    stop_words = set(stopwords)

    cleaned = []
    for token in nlp(sentence):
        lemma = token.lemma_
        # `nlp` is a blank English() pipeline, which has no lemmatizer component,
        # so lemma_ can come back empty; fall back to the token text in that case.
        # "-PRON-" is the pronoun placeholder lemma emitted by older spaCy releases.
        if not lemma or lemma == "-PRON-":
            word = token.lower_.strip()
        else:
            word = lemma.lower().strip()

        # Whitespace-only tokens collapse to "" after strip().
        if not word or word in stop_words:
            continue
        # Test character by character: `word in punctuations` would be a substring
        # check against the punctuation string, which lets runs such as "..."
        # through and treats the empty string as punctuation.
        if all(char in punctuations for char in word):
            continue
        cleaned.append(word)
    return cleaned

### Transformation and Vectorization

In [None]:
class predictors(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that normalises raw text with ``clean_text``.

    Inheriting from ``BaseEstimator`` supplies ``get_params`` / ``set_params``
    (an empty parameter set, since there is no ``__init__``) and the estimator
    metadata scikit-learn expects from pipeline steps.
    """

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for every item of ``X``."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` also work
        when no labels are supplied.
        """
        return self


def clean_text(text):
    """Trim surrounding whitespace from ``text`` and return it lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Bag-of-words counts built on the custom `tokenizer`: unigrams only, skipping
# terms found in more than 95% of documents or in fewer than 2 documents.
vectorizer = CountVectorizer(
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 1),
    tokenizer=tokenizer,
)

# TF-IDF weighted counterpart using the same tokenizer and otherwise default settings.
tfvectorizer = TfidfVectorizer(tokenizer=tokenizer)

### Split the Dataset

In [None]:
# Features are the review texts; targets are the sentiment labels.
X, y = dataset['review'], dataset['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=77,
    test_size=0.2,
)

### Logistic Regression

In [None]:
# Display the training reviews as the cell's output (the notebook shows a truncated view).
X_train

Unnamed: 0,review
9755,One comment said it wasn't a comedy...Mistake!...
31159,An ok movie about downs syndrome. A mother has...
26697,I have just wasted my Saturday night watching ...
39881,Viggo Mortensen stars as a new inmate of a hau...
43474,Another one that slipped by the radar of most ...
...,...
7832,when i saw the movie at first i thought that i...
42277,"I have to admit, this movie moved me to the ex..."
18667,"This movie rivals ""Plan 9"" as one of the dumbe..."
8799,"This movie is a real shame, not just for the p..."


In [None]:
# Train and evaluate a TF-IDF + Logistic Regression sentiment classifier.
# All names used here are imported in the imports cell at the top of the notebook.

# TF-IDF features using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

LRmodel = Pipeline([
    ("cleaner", predictors()),  # Data cleaning/preprocessing
    ('vectorizer', vectorizer),  # Vectorizing features
    ('classifier', LogisticRegression())  # Logistic Regression classifier
])

# Drop blank reviews *together with their labels*. Filtering X_train on its own
# (and rebinding it to a list) leaves it out of step with y_train as soon as a
# single review is removed, so build aligned copies and leave the split untouched.
if len(X_train) != len(y_train):
    raise ValueError(
        f'X_train ({len(X_train)}) and y_train ({len(y_train)}) differ in length; '
        're-run the train/test split cell.'
    )
LR_train_pairs = [(doc, label) for doc, label in zip(X_train, y_train) if doc.strip()]
LR_train_docs = [doc for doc, _ in LR_train_pairs]
LR_train_labels = [label for _, label in LR_train_pairs]

LRmodel.fit(LR_train_docs, LR_train_labels)

# Predict the held-out reviews and compare against their true labels.
LRpred = LRmodel.predict(X_test)

print(f'Confusion Matrix:\n{confusion_matrix(y_test, LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test, LRpred)}')
# Two-decimal percentage instead of raw float noise such as 86.83999999999999%.
print(f'Accuracy: {accuracy_score(y_test, LRpred):.2%}')

print('Logistic Regression model trained and evaluated.')


Confusion Matrix:
[[4458  593]
 [ 437 4512]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.88      0.90      5051
    positive       0.88      0.91      0.90      4949

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 89.7%
Logistic Regression model trained and evaluated.


In [None]:

# Spot-check the fitted Logistic Regression pipeline on one hand-written review.
sample_review = "Production has an incredibly important place to shoot a series or film. Sometimes even a very minimalist story can reach an incredibly successful point after the right production stages. The Witcher series is far from minimalist. The Witcher is one of the best Middle-earth works in the world. Production quality is essential if you want to handle such a topic successfully."
pre = LRmodel.predict([sample_review])
predicted_label = pre[0]
print(f'Prediction: {predicted_label}')

Prediction: positive


### Random Forest

In [None]:
# Train and evaluate a TF-IDF + Random Forest sentiment classifier.
# All names used here are imported in the imports cell at the top of the notebook.

# TF-IDF features using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Random Forest Classifier with 200 estimators. The seed makes the forest
# reproducible between runs; n_jobs=-1 grows the trees on all available cores.
RFclassifier = RandomForestClassifier(n_estimators=200, random_state=77, n_jobs=-1)

RFmodel = Pipeline([
    ("cleaner", predictors()),  # Data cleaning/preprocessing
    ('vectorizer', vectorizer),  # Vectorizing features
    ('classifier', RFclassifier)  # Random Forest classifier
])

# Drop blank reviews *together with their labels*. Filtering X_train on its own
# (and rebinding it to a list) leaves it out of step with y_train as soon as a
# single review is removed, so build aligned copies and leave the split untouched.
if len(X_train) != len(y_train):
    raise ValueError(
        f'X_train ({len(X_train)}) and y_train ({len(y_train)}) differ in length; '
        're-run the train/test split cell.'
    )
RF_train_pairs = [(doc, label) for doc, label in zip(X_train, y_train) if doc.strip()]
RF_train_docs = [doc for doc, _ in RF_train_pairs]
RF_train_labels = [label for _, label in RF_train_pairs]

RFmodel.fit(RF_train_docs, RF_train_labels)

# Predict the held-out reviews and compare against their true labels.
RFpred = RFmodel.predict(X_test)

print(f'Confusion Matrix:\n{confusion_matrix(y_test, RFpred)}')
print(f'\nClassification Report:\n{classification_report(y_test, RFpred)}')
# Two-decimal percentage instead of raw float noise such as 86.83999999999999%.
print(f'Accuracy: {accuracy_score(y_test, RFpred):.2%}')

print('Random Forest model trained and evaluated.')


Confusion Matrix:
[[4393  658]
 [ 658 4291]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.87      0.87      5051
    positive       0.87      0.87      0.87      4949

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Accuracy: 86.83999999999999%
Random Forest model trained and evaluated.


In [None]:

# Spot-check the fitted Random Forest pipeline on one hand-written review.
sample_review = "I think this is my first review. This series is so bad I had to write one. I don't understand the good score. I have tried on 2 separate occasions to watch this show. Haven't even gotten past the 2nd episode because it is SO BORING."
pre = RFmodel.predict([sample_review])
predicted_label = pre[0]
print(f'Prediction: {predicted_label}')

Prediction: negative


### LinearSVC

In [None]:
# Train and evaluate a TF-IDF + LinearSVC sentiment classifier.
# All names used here are imported in the imports cell at the top of the notebook.

# TF-IDF features using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Linear Support Vector Classifier (SVC); seeded so repeated runs give the same model.
SVCclassifier = LinearSVC(random_state=77)

SVCmodel = Pipeline([
    ("cleaner", predictors()),  # Data cleaning/preprocessing
    ('vectorizer', vectorizer),  # Vectorizing features
    ('classifier', SVCclassifier)  # Linear SVC classifier
])

# Drop blank reviews *together with their labels*. Filtering X_train on its own
# (and rebinding it to a list) leaves it out of step with y_train as soon as a
# single review is removed, so build aligned copies and leave the split untouched.
if len(X_train) != len(y_train):
    raise ValueError(
        f'X_train ({len(X_train)}) and y_train ({len(y_train)}) differ in length; '
        're-run the train/test split cell.'
    )
SVC_train_pairs = [(doc, label) for doc, label in zip(X_train, y_train) if doc.strip()]
SVC_train_docs = [doc for doc, _ in SVC_train_pairs]
SVC_train_labels = [label for _, label in SVC_train_pairs]

SVCmodel.fit(SVC_train_docs, SVC_train_labels)

# Predict the held-out reviews and compare against their true labels.
SVCpred = SVCmodel.predict(X_test)

print(f'Confusion Matrix:\n{confusion_matrix(y_test, SVCpred)}')
print(f'\nClassification Report:\n{classification_report(y_test, SVCpred)}')
# Two-decimal percentage instead of raw float noise such as 89.92999999999999%.
print(f'Accuracy: {accuracy_score(y_test, SVCpred):.2%}')

print('LinearSVC model trained and evaluated.')


Confusion Matrix:
[[4497  554]
 [ 453 4496]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      5051
    positive       0.89      0.91      0.90      4949

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 89.92999999999999%
LinearSVC model trained and evaluated.


In [None]:
# Spot-check the fitted LinearSVC pipeline on one hand-written review.
sample_review = "Henry cavill nailed the role perfectly. The fight scenes, the music, the cinematography, the whole atmosphere is beyond amazing. Netflix did it again"
pre = SVCmodel.predict([sample_review])
predicted_label = pre[0]
print(f'Prediction: {predicted_label}')

Prediction: positive


### Conclusion
All three algorithms reach similar accuracy on the held-out reviews. LinearSVC scores highest at 89.93%, narrowly ahead of Logistic Regression at 89.70%, while Random Forest trails at 86.84%.