In [1]:
!pip install PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

fileId = drive.CreateFile({'id': '#'}) 
print(fileId['title'])  # dataset.zip
fileId.GetContentFile('temp.zip')  # Save Drive file as a local file

!unzip temp.zip -d ./

Collecting PyDrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K     |████████████████████████████████| 993kB 4.3MB/s 
Building wheels for collected packages: PyDrive
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built PyDrive
Installing collected packages: PyDrive
Successfully installed PyDrive-1.3.1
imdb_master.zip
Archive:  temp.zip
  inflating: ./imdb_master.csv       


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
df = pd.read_csv('imdb_master.csv', encoding = "ISO-8859-1")
del df['Unnamed: 0']
del df['file']
del df['type']

In [0]:
df = df.loc[0:49999]
df = df.sample(frac=1).reset_index(drop=True)

In [0]:
def clean_reviews(text):
    lemmatizer = WordNetLemmatizer()
    my_stopwords = stopwords.words('english') 
    text = text.replace("<br >", "")
    text = text.replace("</br >", "")        
    text = re.sub('[^a-zA-Z]',' ', text)
    text = text.lower() 
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in my_stopwords]
    text = " ".join(text)
    return text  

In [0]:
df["review"]=df.review.apply(lambda x: clean_reviews(x))

In [7]:
df.head()

Unnamed: 0,review,label
0,gruelling watch one bergman finest film inte...,pos
1,saw movie five time never get tire feature tr...,pos
2,even st century child bear dangerous woman...,pos
3,zero day ha purpose simply entertainment deli...,pos
4,usually give horror film around one catc...,pos


In [8]:
df["label"] = df["label"].map({'pos': 1, 'neg': 0})
df.head()

Unnamed: 0,review,label
0,gruelling watch one bergman finest film inte...,1
1,saw movie five time never get tire feature tr...,1
2,even st century child bear dangerous woman...,1
3,zero day ha purpose simply entertainment deli...,1
4,usually give horror film around one catc...,1


In [0]:
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["label"], test_size=0.2)

In [0]:
vect = TfidfVectorizer(ngram_range=(1,2), binary=True)
train_data = vect.fit_transform(X_train)
test_data = vect.transform(X_test)

In [0]:
# SVM

from sklearn.svm import LinearSVC
svm = LinearSVC()

In [0]:
svm.fit(train_data, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [0]:
y_pred_class = svm.predict(test_data)

In [0]:
metrics.accuracy_score(y_test, y_pred_class)

0.9044

In [0]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[4495,  535],
       [ 421, 4549]])

In [0]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
nb = LogisticRegression()

In [0]:
nb.fit(train_data, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
y_pred_class = nb.predict(test_data)

In [0]:
metrics.accuracy_score(y_test, y_pred_class)

0.889

In [0]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[4423,  607],
       [ 503, 4467]])

In [0]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [0]:
nb.fit(train_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
y_pred_class = nb.predict(test_data)

In [0]:
metrics.accuracy_score(y_test, y_pred_class)

0.8851

In [0]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[4540,  490],
       [ 659, 4311]])

In [0]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
nb = DecisionTreeClassifier()

In [0]:
nb.fit(train_data, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [0]:
y_pred_class = nb.predict(test_data)

In [0]:
metrics.accuracy_score(y_test, y_pred_class)

0.6933

In [0]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[3534, 1496],
       [1571, 3399]])

In [0]:
# Random Forest 

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 100)

In [0]:
clf.fit(train_data, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
y_pred_class = clf.predict(test_data)

In [0]:
metrics.accuracy_score(y_test, y_pred_class)

0.8535

In [0]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[4326,  704],
       [ 761, 4209]])

In [11]:
!pip install xgboost



In [0]:
import xgboost as xgb

In [0]:
xgb_classifier = xgb.XGBClassifier(max_depth=10)
xgb_classifier = xgb_classifier.fit(train_data, y_train)
pred = xgb_classifier.predict(test_data)

In [29]:
metrics.accuracy_score(y_test, pred)

0.8458

In [30]:
metrics.confusion_matrix(y_test, pred)

array([[4172,  867],
       [ 675, 4286]])

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
clf = GradientBoostingClassifier()

In [23]:
clf.fit(train_data, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [0]:
y_pred_class = clf.predict(test_data)

In [25]:
metrics.accuracy_score(y_test, y_pred_class)

0.8101

In [26]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[3850, 1189],
       [ 710, 4251]])