# Importing Libraries

In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

from sklearn.svm import LinearSVC,SVC

# Loading dataset in colab from GoogleDrive

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
file_id = '15R4-hhDrghD3QmBcr5MYdk4kWf433l3y'
downloaded = drive.CreateFile({'id':file_id})
downloaded.GetContentFile('IMDB Dataset.csv')

In [6]:
!ls

 adc.json  'IMDB Dataset.csv'   sample_data


# Analyzing Dataset

In [0]:
data = pd.read_csv('IMDB Dataset.csv')

In [8]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
data.shape

(50000, 2)

In [10]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [11]:
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [12]:
data.groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


In [0]:
blanks = []
for i,re,sen in data.itertuples():
  if(re.isspace()):
    blank.append(i)

In [14]:
len(blanks)

0

# Splitting into X & Y

In [0]:
X = data['review']

In [0]:
Y = data['sentiment']

In [17]:
print(X.shape,Y.shape)

(50000,) (50000,)


In [18]:
data.groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


# Train-Test Split

In [0]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=0,stratify=Y)

In [20]:
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(37500,) (12500,) (37500,) (12500,)


# TFIDF & Vectorization

In [0]:
tfidf_vect = TfidfVectorizer(stop_words='english')

In [0]:
X_train_vect = tfidf_vect.fit_transform(X_train)

In [23]:
print(X_train_vect.shape)

(37500, 90103)


In [0]:
X_test_vect = tfidf_vect.transform(X_test)

In [25]:
print(X_test_vect.shape)

(12500, 90103)


# Models

## SVM

In [0]:
svm = LinearSVC()

In [33]:
svm.fit(X_train_vect,Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [34]:
Y_pred_train = svm.predict(X_train_vect)
accuracy_train = accuracy_score(Y_train,Y_pred_train)
Y_pred_test = svm.predict(X_test_vect)
accuracy_test = accuracy_score(Y_test,Y_pred_test)
print('Training Accuacy',(accuracy_train))
print('Testing Accuacy',(accuracy_test))

Training Accuacy 0.9890666666666666
Testing Accuacy 0.89496


In [35]:
print(classification_report(Y_train,Y_pred_train))

              precision    recall  f1-score   support

    negative       0.99      0.99      0.99     18750
    positive       0.99      0.99      0.99     18750

    accuracy                           0.99     37500
   macro avg       0.99      0.99      0.99     37500
weighted avg       0.99      0.99      0.99     37500



In [36]:
print(classification_report(Y_test,Y_pred_test))

              precision    recall  f1-score   support

    negative       0.90      0.89      0.89      6250
    positive       0.89      0.90      0.90      6250

    accuracy                           0.89     12500
   macro avg       0.90      0.89      0.89     12500
weighted avg       0.90      0.89      0.89     12500



## Naive-Bayes

In [37]:
mnb = MultinomialNB()
mnb.fit(X_train_vect,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
Y_pred_train = mnb.predict(X_train_vect)
accuracy_train = accuracy_score(Y_train,Y_pred_train)
Y_pred_test = mnb.predict(X_test_vect)
accuracy_test = accuracy_score(Y_test,Y_pred_test)
print('Training Accuacy',(accuracy_train))
print('Testing Accuacy',(accuracy_test))

Training Accuacy 0.9082133333333333
Testing Accuacy 0.86384


In [39]:
print(classification_report(Y_test,Y_pred_test))

              precision    recall  f1-score   support

    negative       0.85      0.88      0.87      6250
    positive       0.87      0.85      0.86      6250

    accuracy                           0.86     12500
   macro avg       0.86      0.86      0.86     12500
weighted avg       0.86      0.86      0.86     12500



## SGDC & Logistic Regression

In [41]:
sgd = SGDClassifier()
sgd.fit(X_train_vect,Y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [42]:
Y_pred_train = sgd.predict(X_train_vect)
accuracy_train = accuracy_score(Y_train,Y_pred_train)
Y_pred_test = sgd.predict(X_test_vect)
accuracy_test = accuracy_score(Y_test,Y_pred_test)
print('Training Accuacy',(accuracy_train))
print('Testing Accuacy',(accuracy_test))

Training Accuacy 0.9289066666666667
Testing Accuacy 0.89384


In [43]:
lr = LogisticRegression()
lr.fit(X_train_vect,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
Y_pred_train = sgd.predict(X_train_vect)
accuracy_train = accuracy_score(Y_train,Y_pred_train)
Y_pred_test = sgd.predict(X_test_vect)
accuracy_test = accuracy_score(Y_test,Y_pred_test)
print('Training Accuacy',(accuracy_train))
print('Testing Accuacy',(accuracy_test))

Training Accuacy 0.9289066666666667
Testing Accuacy 0.89384


# Testing on actual reviews

In [0]:
myreview = ["This is just awesome movie. I love it. Must Watch."]

In [0]:
myreview_vect = tfidf_vect.transform(myreview)

In [50]:
print(svm.predict(myreview_vect))

['positive']


In [0]:
myreview = ["I don't think this was a great movie. Maybe the actors could have better. Not worst but not best either."]

In [0]:
myreview_vect = tfidf_vect.transform(myreview)

In [53]:
print(svm.predict(myreview_vect))

['negative']


## Movie review from website which showed around 3.5 ratings.

In [0]:
myreview = ["Advait meets Sara during his trip to Goa. While he is a bit of an introvert, she is a free-spirited girl from London, who is visiting India for the first time and wants to live life to the fullest. They are instantly attracted to each other and make a pact to make the most of their time together. All is well until a twist in the tale sends their lives spinning out of control. Five years later, Advait goes on a killing spree on Christmas night. Anjaney Agashe, a police officer who often doesn’t follow the law, and Michael Rodrigues, a seemingly righteous cop, are now on his trail. They go on a mad hunt for him. Uncovering the reasons behind Advait's actions is what the rest of the film is about.Aditya impresses in the role of a guy whose life goes through a major upheaval. He shows restraint and finesses in his performance. The actor’s physical transformation adds weight to all the kicks and punches he pulls in the film. Disha Patani looks stunning and makes most of her role, which is unlike any character she has played so far. The chemistry between the lead pair is infectious. The ever-reliable Anil Kapoor is in top form and stuns in another interesting act. He walks the fine line between being flamboyant and over-the-top with ease. Kunal Kemmu is sincere and surprising in equal parts.Director Mohit Suri maintains a steady grip on the narrative for most parts. In this film, none of his characters are unidimensional, which makes them interesting. The film begins well with a power-packed action scene, and dives straight into the drama. What it does lack is a tighter edit and dialogues that could elevate the narrative. Though there are two interesting twists in the plot, they aren’t as big a surprise as they should have been, which take away from its thrill quotient. The music proves to be one of its strong points, especially the title track, which stays with you even after you leave the theatre."]

In [0]:
myreview_vect = tfidf_vect.transform(myreview)

In [56]:
print(svm.predict(myreview_vect))

['positive']


## 1-star review picked from google.

In [0]:
myreview = ["All that I took from Malang was its beautiful songs and Anil Kapoor's madcap, lunatic performance. The rest is like cosmetic surgery, beautiful to look at, how hollow, plastic and unconvincing inside. There is not a moment or even a half moment that kept me engaged in the first half, the dead Adiyta Roy Kapur-Disha Patani chemistry doesn't help either. Deep cleavage shows and toned bodies don't make chemistry alone. There are some random interesting ideas on revenge, fear to be in a relationship, the idea of freedom and abandon, but the director doesn't even attempt exploring anything in depth. Nothing sticks and despite the twist in the end, nothing touches, relates or stings.  Goa as a land of drug addicts, encounter-crazy and prostitution-engaging cops, is not exactly for a tourist brochure, nor does this dark depiction add any intended edge to the movie. Malang is a typical \"filmy\" Hindi movie that we had forgotten existed in the alternate small town-set cinema wave. Yes they exist. A word in for the talented, underrated Kunal Khemu, who is a victim of a underwritten part. If you love hearing beautiful songs while watching a dead slab of a movie, this is it. "]

In [0]:
myreview_vect = tfidf_vect.transform(myreview)

In [60]:
print(svm.predict(myreview_vect))

['negative']
