In [1]:
import pandas as pd
df_review = pd.read_csv('datasets/IMDB Dataset.csv')
df_review

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [2]:
# To train our model faster in the following steps, we’re going to take a smaller sample of 10000 rows. 
# This small sample will contain 9000 positive and 1000 negative reviews to make the data imbalanced
df_positive = df_review[df_review['sentiment']=='positive'][:9000]
df_negative = df_review[df_review['sentiment']=='negative'][:1000]
df_review_imb = pd.concat([df_positive, df_negative])

In [3]:
#pip install scikit-learn

In [4]:
#pip install imblearn

In [5]:
# Dealing with Imbalanced Classes
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
df_review_bal, df_review_bal['sentiment']=rus.fit_resample(df_review_imb[['review']], df_review_imb['sentiment'])
df_review_bal

Unnamed: 0,review,sentiment
0,Basically there's a family where a little boy ...,negative
1,"This show was an amazing, fresh & innovative i...",negative
2,Encouraged by the positive comments about this...,negative
3,Phil the Alien is one of those quirky films wh...,negative
4,I saw this movie when I was about 12 when it c...,negative
...,...,...
1995,Knute Rockne led an extraordinary life and his...,positive
1996,At the height of the 'Celebrity Big Brother' r...,positive
1997,This is another of Robert Altman's underrated ...,positive
1998,This movie won a special award at Cannes for i...,positive


In [6]:
print(df_review_imb.count)

<bound method DataFrame.count of                                                  review sentiment
0     One of the other reviewers has mentioned that ...  positive
1     A wonderful little production. <br /><br />The...  positive
2     I thought this was a wonderful way to spend ti...  positive
4     Petter Mattei's "Love in the Time of Money" is...  positive
5     Probably my all-time favorite movie, a story o...  positive
...                                                 ...       ...
2000  Stranded in Space (1972) MST3K version - a ver...  negative
2005  I happened to catch this supposed "horror" fli...  negative
2007  waste of 1h45 this nasty little film is one to...  negative
2013  Quite what the producers of this appalling ada...  negative

[10000 rows x 2 columns]>


In [7]:
print(df_review_bal.count)

<bound method DataFrame.count of                                                  review sentiment
0     Basically there's a family where a little boy ...  negative
1     This show was an amazing, fresh & innovative i...  negative
2     Encouraged by the positive comments about this...  negative
3     Phil the Alien is one of those quirky films wh...  negative
4     I saw this movie when I was about 12 when it c...  negative
...                                                 ...       ...
1995  Knute Rockne led an extraordinary life and his...  positive
1996  At the height of the 'Celebrity Big Brother' r...  positive
1997  This is another of Robert Altman's underrated ...  positive
1998  This movie won a special award at Cannes for i...  positive
1999  You'd be forgiven to think a Finnish director ...  positive

[2000 rows x 2 columns]>


In [8]:
print(df_review_imb.values==('sentiment'))
#print(df_review_bal.values==('negative').count)

[[False False]
 [False False]
 [False False]
 ...
 [False False]
 [False False]
 [False False]]


In [9]:
df_review.dtypes

review       object
sentiment    object
dtype: object

In [10]:
df_review_bal.dtypes

review       object
sentiment    object
dtype: object

In [11]:
df_review_imb.dtypes

review       object
sentiment    object
dtype: object

In [12]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_review_bal, test_size=0.33, random_state=42)

In [13]:
train_x, train_y = train['review'], train['sentiment']
test_x, test_y = test['review'], test['sentiment']

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
train_x_vector

<1340x20625 sparse matrix of type '<class 'numpy.float64'>'
	with 118834 stored elements in Compressed Sparse Row format>

In [15]:
pd.DataFrame.sparse.from_spmatrix(train_x_vector, index=train_x.index, columns=tfidf.get_feature_names())

Unnamed: 0,00,000,007,01pm,02,04,08,10,100,1000,...,zooming,zooms,zues,zzzzzzzzzzzzzzzzzz,æon,élan,émigré,ísnt,ïn,ünfaithful
81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042791,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
test_x_vector = tfidf.transform(test_x)

In [18]:
#Model Selection
from sklearn.svm import SVC
svc = SVC(kernel='linear')
svc.fit(train_x_vector, train_y)

SVC(kernel='linear')

In [20]:
# Regression: They’re used to predict continuous values such as price, salary, age, etc
# Classification: They’re used to predict discrete values such as male/female, spam/not spam, positive/negative, etc.

print(svc.predict(tfidf.transform(['A good movie'])))
print(svc.predict(tfidf.transform(['An excellent movie'])))
print(svc.predict(tfidf.transform(['I did not like this movie at all'])))

['positive']
['positive']
['negative']


In [21]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()
dec_tree.fit(train_x_vector, train_y)

DecisionTreeClassifier()

In [22]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_x_vector.toarray(), train_y)

GaussianNB()

In [23]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(train_x_vector, train_y)

LogisticRegression()