# **1. Importing data**


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the IMDB reviews from the CSV file in the working directory and
# preview the first five rows as the cell output.
df = pd.read_csv('IMDB Dataset.csv')
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Encode the text label as a binary target: 'positive' -> 1, any other value -> 0.
df['Sentiment'] = np.where(df['sentiment'] == 'positive', 1, 0)

# The string label is redundant once the numeric target exists.
df = df.drop(columns='sentiment')

# Keep an independent snapshot of the frame. `.copy()` is required here:
# a plain `df_copy = df` only creates a second name for the same object, so
# any later in-place change to `df` would silently change `df_copy` as well.
df_copy = df.copy()

In [4]:
# Preview the frame after the label column has been re-encoded.
df.head()

Unnamed: 0,review,Sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [5]:
# Look at one full review (row label 3) to see the HTML tags and
# punctuation that the cleaning step has to deal with.
df.loc[3, 'review']

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

# Data preparation

* Removed punctuation and HTML tags.
* Moved emoticons to the end of the document.
* Converted all text to lower case.

In [6]:
import re
def preprocess(text):
    """Normalise a raw review string for bag-of-words feature extraction.

    Steps, in order:
      1. Remove HTML tags such as ``<br />``.
      2. Collect emoticons (eyes ``:``, ``;`` or ``=``, an optional ``-`` nose,
         and a mouth ``)``, ``(``, ``D`` or ``P``) from the tag-free text,
         before it is lower-cased.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space, trimming spaces at both ends.
      4. Append the emoticons (with the ``-`` nose removed) after the words,
         separated by single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Empty if the input has neither word characters
        nor emoticons.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    words = re.sub(r'\W+', ' ', text.lower()).strip()
    faces = ' '.join(emoticons).replace('-', '')

    # Join with an explicit separator so an emoticon is never glued onto the
    # last word when the text does not end with punctuation.
    return ' '.join(part for part in (words, faces) if part)

In [7]:
# Sanity-check `preprocess` on a sentence containing punctuation and emoticons.
print('this is an example')
preprocess('Hello world !!! :) :( . Wishing you a very good morning!!')

this is an example


'hello world wishing you a very good morning :) :('

In [8]:
# Clean every review with `preprocess`, overwriting the raw text column.
df['review'] = df['review'].map(preprocess)

# Tokenization of documents

In [9]:
from nltk.stem.porter import PorterStemmer

# Single stemmer instance shared by every call to `tokenizer_stem`.
porter = PorterStemmer()

In [10]:
def tokenizer_stem(text):
    """Split ``text`` on whitespace and return the stem of each token.

    Stemming is delegated to the module-level ``porter`` stemmer; the
    result is a list with one stem per whitespace-separated token.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

# Transforming text data into TF-IDF Vectors

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Target vector: the binary sentiment labels.
y = df['Sentiment'].values

# The reviews were already lower-cased and cleaned by `preprocess`, so the
# vectorizer's own lowercasing is switched off and tokenisation is handed
# to the stemming tokenizer.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    tokenizer=tokenizer_stem,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# NOTE(review): the vectorizer is fitted on the full dataset, i.e. before the
# train/test split, so the vocabulary and IDF weights also see the test
# documents. Consider fitting on the training reviews only.
X = tfidf.fit_transform(df['review'])

# Document classification using logistic regression

In [12]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

In [13]:
import pickle 
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with 5-fold cross-validation on the training split.
clf = LogisticRegressionCV(
    cv=5,
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
).fit(X_train, y_train)

# Persist the fitted classifier. The context manager closes the file even if
# `pickle.dump` raises, which the manual open()/close() pair did not.
# NOTE(review): only the classifier is pickled in this cell; the fitted
# `tfidf` vectorizer would also be needed to score new raw text.
with open('saved_model1.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.7min finished


The trained model is saved to the file 'saved_model1.sav' using the pickle library.

# Model evaluation

In [14]:
filename = 'saved_model1.sav'

# Load the classifier back from disk, closing the file handle afterwards.
# WARNING: unpickling can execute arbitrary code; only load files that were
# created by this notebook or that come from a trusted source.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

Loaded the saved model from disk.

In [15]:
# Score the reloaded classifier on the held-out test split.
saved_clf.score(X_test,y_test)

0.89556

**The model's accuracy on the test set is about 90% (0.89556).**