## IMDB Movie Reviews


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the IMDB reviews CSV from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='imdb.csv')

In [3]:
# Preview the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0,reviews,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
# Show the first five rows.
df.head(n=5)

Unnamed: 0,reviews,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Dimensions of the frame as (rows, columns).
df.shape

(50000, 2)

In [6]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

reviews      0
sentiment    0
dtype: int64

In [7]:
# Independent feature: the raw review text column.
X = df.loc[:, 'reviews']

In [8]:
# Dependent variable: the sentiment label column.
y = df.loc[:, 'sentiment']

In [9]:
# Shape of the feature Series (one entry per review).
X.shape

(50000,)

In [10]:
# Shape of the target Series; it should match X.
y.shape

(50000,)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Encode the string sentiment labels as integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Report which integer each original class name was assigned.
print("Mapping of original classes to encoded classes:")
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
for position in range(len(class_names)):
    print(class_names[position], "-->", class_codes[position])

# Report the distinct codes that actually occur in the encoded target.
unique_encoded_classes = np.unique(y_encoded)
print("Unique encoded classes in y_encoded:", unique_encoded_classes)

Mapping of original classes to encoded classes:
negative --> 0
positive --> 1
Unique encoded classes in y_encoded: [0 1]


In [12]:
# Inspect the encoded target array.
y_encoded

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
# movreviews=X.copy()

In [14]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [15]:
# Build the cleaned corpus: one normalised, space-joined string per review.
# NOTE: X must still be the raw review Series taken from df['reviews'] when
# this cell runs; the vectorizer cells further down rebind X to a dense
# feature array.

# Loop-invariant resources, built once instead of once per review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

corpus = []
# Iterate over the values directly so the loop does not depend on the Series
# having a default 0..n-1 integer index.
for raw_review in X:
    # Strip HTML tags such as <br />.
    # NOTE(review): tags are removed without inserting a space, so two words
    # separated only by a tag get joined — confirm this is intended.
    review = re.sub(r'<.*?>', '', raw_review)
    # Replace every character other than ASCII letters and digits with a space.
    review = re.sub('[^a-zA-Z0-9]', ' ', review)
    review = review.lower()
    tokens = word_tokenize(review)
    # Drop stop words, then lemmatize the tokens that remain.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(tokens))

In [16]:
# Spot-check the cleaned text of one review (index 7000 — presumably an arbitrary sample).
corpus[7000]

'sorry someone say really dull movie worthy perhaps dull nonetheless nearly cried boredom watching acting pretty dire story drawn predictable score camera work totally standard unexciting one movie allowed hate becase disabled people hate suspect nearly everyone interesting critic kind movie suppose allowed objective made win award remember duly neither interesting entertaining seen play compare'

In [17]:
# CountVec
from sklearn.feature_extraction.text import CountVectorizer
# Binary presence flags over bigrams only, with the vocabulary capped at 2500 features.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the vocabulary is chosen using the test reviews too.
cv = CountVectorizer(ngram_range=(2, 2), binary=True, max_features=2500)
bow_matrix = cv.fit_transform(corpus)
X = bow_matrix.toarray()

In [18]:
# Inspect the dense feature matrix assigned to X by the CountVectorizer cell.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
# y=pd.get_dummies(X)
# y=y.iloc[:,1].values

In [22]:
# Inspect the encoded labels again before splitting.
y_encoded

array([1, 1, 1, ..., 0, 0, 0])

In [23]:
# Train Test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.20,
    random_state=0,
)

In [24]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training split.
# NOTE(review): the name `spam_detect_model` looks carried over from another
# notebook — the target here is review sentiment. Kept because later cells use it.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

In [25]:
# Predict labels for the held-out split.
y_pred = spam_detect_model.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score,classification_report

In [27]:
# Fraction of held-out reviews whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

0.7932


In [29]:
# CountVec

In [30]:
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): this repeats the earlier Naive Bayes fit on the same,
# unchanged CountVectorizer split — it looks redundant.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

In [31]:
# Predict labels for the held-out split with the re-fitted model.
y_pred = spam_detect_model.predict(X_test)

In [32]:
from sklearn.metrics import accuracy_score,classification_report

In [33]:
# Accuracy of the re-fitted model on the held-out split.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

0.7932


In [34]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights over unigrams and bigrams, with the vocabulary capped at 2500 features.
# NOTE(review): as with the CountVectorizer cell, the vectorizer is fitted on
# the full corpus before the split, so the test reviews influence the features.
tv = TfidfVectorizer(ngram_range=(1, 2), max_features=2500)
tfidf_matrix = tv.fit_transform(corpus)
X = tfidf_matrix.toarray()

In [35]:
from sklearn.model_selection import train_test_split
# Re-split using the TF-IDF features, with the same 80/20 ratio and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.20,
    random_state=0,
)

In [36]:
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial Naive Bayes on the TF-IDF training split.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

In [37]:
# Predict labels for the held-out TF-IDF split.
y_pred = spam_detect_model.predict(X_test)

In [38]:
# Accuracy of the TF-IDF Naive Bayes model on the held-out split.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

0.8508


In [39]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the fitted model and its accuracy are repeatable across
# runs; 0 matches the seed used for the train/test split.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [40]:
# Predict labels for the held-out split with the random forest.
y_pred = classifier.predict(X_test)

In [41]:
# accuracy_score takes (y_true, y_pred); pass them in that order, as the other scoring cells do.
print(accuracy_score(y_test, y_pred))

0.8421
