In [258]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, accuracy_score

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\new\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [259]:
text_data = pd.read_csv("data\IMDB_Dataset.csv")

In [260]:
text_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [261]:
text_data.columns

Index(['review', 'sentiment'], dtype='object')

In [262]:
text_data.shape

(50000, 2)

In [263]:
text_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [264]:
text_data['review'].isna().value_counts()

False    50000
Name: review, dtype: int64

In [265]:
X,y = text_data['review'], text_data['sentiment']
y = y.apply(lambda x: 0 if x=='negative' else 1)

In [266]:
X.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [267]:
from sklearn.model_selection import train_test_split
X_train, temp_X, y_train, temp_y = train_test_split(X,y,test_size=0.3)
X_test,X_val,y_test,y_val = train_test_split(temp_X,temp_y, test_size=0.5)

In [268]:
from sklearn.base import BaseEstimator,TransformerMixin

class PreProcessor(BaseEstimator,TransformerMixin):

    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        X = X.apply(lambda X: re.sub(r"<br />+"," ",X))
        X = X.apply(lambda X: re.sub(r"<br />+",'',re.sub(r"[\.,\']+", ' ',X)))
        X = X.apply(lambda X: X.lower())
        X = X.apply(lambda X: re.sub(r'\W'," ",X))
        X = X.apply(lambda X: re.sub(r"^\s+[a-z]\s+$"," ",X))
        X = X.apply(lambda X: re.sub(r"\s+",' ',X))
        return X

In [269]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [270]:
ml_pipeline = Pipeline(steps=[("preprocessor",PreProcessor()),
                              ("tfidf",TfidfVectorizer(max_features=1000,min_df=3,max_df=0.6,stop_words=stopwords.words('english'))),
                              ("model",LogisticRegression())])

In [271]:
preprocessed_output = ml_pipeline["preprocessor"].fit(X_train,y_train)


In [272]:
ml_pipeline.fit(X_train,y_train)

Pipeline(steps=[('preprocessor', PreProcessor()),
                ('tfidf',
                 TfidfVectorizer(max_df=0.6, max_features=1000, min_df=3,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('model', LogisticRegression())])

In [273]:
training_predicted_labels = ml_pipeline.predict(X_train)
print("Training Precision = {}".format(precision_score(y_train, training_predicted_labels, average='macro')))
print("Training Recall = {}".format(recall_score(y_train, training_predicted_labels, average='macro')))
print("Training Accuracy = {}".format(accuracy_score(y_train, training_predicted_labels)))

Training Precision = 0.8737990435842162
Training Recall = 0.873662164135952
Training Accuracy = 0.8736571428571429


In [274]:
validation_predicted_labels = ml_pipeline.predict(X_val)
print("Validation Precision = {}".format(precision_score(y_val, validation_predicted_labels, average='macro')))
print("Validation Recall = {}".format(recall_score(y_val, validation_predicted_labels, average='macro')))
print("Validation Accuracy = {}".format(accuracy_score(y_val, validation_predicted_labels)))

Validation Precision = 0.8592055292496318
Validation Recall = 0.8590771729944724
Validation Accuracy = 0.8590666666666666


In [275]:
test_predicted_labels = ml_pipeline.predict(X_test)
print("Test Precision = {}".format(precision_score(y_test, test_predicted_labels, average='macro')))
print("Test Recall = {}".format(recall_score(y_test, test_predicted_labels, average='macro')))
print("Test Accuracy = {}".format(accuracy_score(y_test, test_predicted_labels)))

Test Precision = 0.8624903898665357
Test Recall = 0.8620789824958608
Test Accuracy = 0.8621333333333333


In [276]:
import pickle
# saving model in a pickle file
with open('model.pkl','wb') as f:
    pickle.dump(ml_pipeline['model'],f)
