## Text Classification on Reviews Dataset

In [1]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/anil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Dataset
I have downloaded dataset from http://www.cs.cornell.edu/people/pabo/movie-review-data/,
the dataset is polarity dataset v2.0

In [2]:
#importing dataset
reviews = load_files('./txt_sentoken/')
X,y = reviews.data,reviews.target

In [3]:
#storing data in pickle
with open("./data/X.pickle", 'wb') as f:
    pickle.dump(X,f)
with open("./data/y.pickle", 'wb') as f:
    pickle.dump(y,f)

In [4]:
#loading data
with open("./data/X.pickle", 'rb') as f:
    X = pickle.load(f)
with open("./data/y.pickle", 'rb') as f:
    y = pickle.load(f)

In [5]:
def preprocessing(review):
    review = re.sub(r'\W',' ',review)
    review = review.lower()
    review = re.sub(r'\s+[a-z]\s+',' ',review)
    review = re.sub(r'^[a-z]\s+',' ',review)
    review = re.sub(r'\s+',' ',review)
    return review

In [6]:
#preprocessing
corpus = []
for i in range(0,len(X)):
    review = preprocessing(str(X[i]))
    corpus.append(review)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# max_features - selecting no of columns (selecting top 2000 frequent words)
# min_df - considering the word which appears more than 3 documents
# max_df - eliminating the word which is in more than 60% documents (Ex: the, is, are....)
vectorizer = TfidfVectorizer(max_features=2000,min_df=3,max_df=0.6, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

In [8]:
#split the dataset
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [9]:
# training the model
from sklearn.linear_model import LogisticRegression
model  = LogisticRegression()
model.fit(trainX,trainY)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# model Prediction
y_pred = model.predict(testX)

In [11]:
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(testY,y_pred,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.89      0.81      0.85       208
    Positive       0.81      0.89      0.85       192

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



In [12]:
#pickling the classifier
with open("./models/model.pickle", 'wb') as f:
    pickle.dump(model,f)

#pickling the vectorizer
with open("./models/vectorizer.pickle", 'wb') as f:
    pickle.dump(vectorizer,f)