In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict


In [2]:
import nltk
from nltk.corpus import stopwords

In [4]:
data = pd.read_csv('IMDB Dataset.csv')

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
train = data.iloc[:25000]
test = data.iloc[25000:]

In [7]:
train.shape, test.shape

((25000, 2), (25000, 2))

In [8]:
train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# Converting the text to lowercase

train['review'] = train['review'].apply(lambda x: str(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['review'] = train['review'].apply(lambda x: str(x).lower())


In [10]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [11]:
stopwords

<WordListCorpusReader in 'C:\\Users\\91987\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

In [12]:
#converting data to vector
train['sentiment'].value_counts()

negative    12526
positive    12474
Name: sentiment, dtype: int64

In [13]:
X = train['review']
y = train['sentiment']

In [14]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)

In [15]:
X.shape

(25000, 76496)

In [16]:
X

<25000x76496 sparse matrix of type '<class 'numpy.float64'>'
	with 3416775 stored elements in Compressed Sparse Row format>

In [17]:
#Splitting Data into Training and Testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4, stratify = y)
X_train.shape, X_test.shape

((20000, 76496), (5000, 76496))

In [18]:
#Using Logistic Regression
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression()
X_train

<20000x76496 sparse matrix of type '<class 'numpy.float64'>'
	with 2735306 stored elements in Compressed Sparse Row format>

In [19]:
scores = cross_val_score(clf_lr, X_train, y_train, cv=10, n_jobs=4)
scores

array([0.8885, 0.8935, 0.888 , 0.888 , 0.8805, 0.8875, 0.8855, 0.882 ,
       0.8815, 0.88  ])

In [20]:
scores.mean()


0.8855000000000001

In [21]:
clf_lr.fit(X_train, y_train)

LogisticRegression()

In [22]:
y_test_pred = clf_lr.predict(X_test)

In [23]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      2505
    positive       0.87      0.90      0.88      2495

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [24]:
confusion_matrix(y_test, y_test_pred)

array([[2175,  330],
       [ 261, 2234]], dtype=int64)

In [25]:
clf_lr.predict(tfidf.transform(['American Psycho deserved an Oscar, they were robbed']))

array(['negative'], dtype=object)

In [26]:
y_real_pred = clf_lr.predict(tfidf.transform(test['review']))

In [27]:
print(classification_report(test['sentiment'], y_real_pred))

              precision    recall  f1-score   support

    negative       0.89      0.88      0.88     12474
    positive       0.88      0.90      0.89     12526

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000

