In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installing Library Required

In [2]:
!pip install pyprind # Progress visualization
!pip install nltk # For stemming

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


# Import Required Library

In [3]:
import pyprind
import numpy as np
import pandas as pd
import sklearn
import os
import re
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords

# Reading Data From Drive

In [None]:
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in {'test', 'train'}:
  for l in {'pos', 'neg'}:
    path = '//content/path here/aclImdb/%s/%s' % (s,l)
    for file in os.listdir(path):
      with open(os.path.join(path, file), 'r') as infile:
        txt = infile.read()
        df = df.append([[txt, labels[l]]], ignore_index = True)
        pbar.update()
df.columns = ['review', 'sentiment']

  df = df.append([[txt, labels[l]]], ignore_index = True)
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:21


## Shuffle Data For Better Learning

In [None]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('/content/drive/MyDrive/Data/Sentiment/movie_data.csv')

In [6]:
df = pd.read_csv('/content/drive/MyDrive/Data/movie_data.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,11841,Imagine the worst skits from Saturday Night Li...,0
1,19602,When you typically watch a short film your alw...,1
2,45519,I had always wanted to see this film and the f...,1


# Preprocessing Of Data
-> Remove HTML TAGS

-> Keeping Emoticons

In [7]:
# Preprocessing -> Removing unnecessary text(Not useful) in documnet. example markup tags.
def preprocessor(text):
  text = re.sub('<[^>]*>', '', text)
  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text =(re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
  return text

In [8]:
# Checking
preprocessor(df.loc[0, 'review'][-50:])

' directed anything else and it s not surprise why '

In [9]:
#Applying preprcossor to all reviews
df['review'] = df['review'].apply(preprocessor)

# Tokenize Words in the Reviews

In [10]:
# tokenizer
def tokenizer(text):
  return text.split()

## Function for Stemming Words.
Stemming is to change the words to its root Form.

In [11]:
#Stemming word in text
porter = PorterStemmer()
def tokenizer_porter(text):
  return [porter.stem(word) for word in text.split()]

In [12]:
# checking tokenizer
tokenizer_porter("runners likes running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

### Removing Stop words from reviews
a, and , is, the etc

All these Words does'nt have any meaning ful impact on the model.

In [13]:
# Removing stop words
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Check
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

# Splitting train and test Data

In [16]:
# splitting training and test dataset
X_train = df.loc[:18000, 'review'].values
Y_train = df.loc[:18000, 'sentiment'].values
X_test = df.loc[18000:, 'review'].values
Y_test = df.loc[18000:, 'sentiment'].values

# Training Model
### Using GridSearchCV to find best parameter for the model
### Using TfidfVectrozier
### Model used is Logistic Regression

In [18]:
#using grid search for finding optimal parameters
# using 5 fold stratified cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents = None, lowercase = False, preprocessor = None)

### All parameter Combinations

In [19]:
param_grid = [{'vect__ngram_range':[(1,1)],
               'vect__stop_words':[stop, None],
               'vect__tokenizer':[tokenizer, tokenizer_porter],
               'clf__penalty':['l1', 'l2'],
               'clf__C':[1.0, 10.0, 100.0]
               },
              {'vect__ngram_range':[(1,1)],
               'vect__stop_words':[stop, None],
               'vect__tokenizer':[tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty':['l1', 'l2'],
               'clf__C':[1.0, 10.0, 100.0]}]

In [20]:
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0, solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring = 'accuracy', cv = 2, verbose = 2, n_jobs = -1)
gs_lr_tfidf.fit(X_train, Y_train)

Fitting 2 folds for each of 48 candidates, totalling 96 fits




# Accuracy Result

In [21]:
# Best Params
print('Best Parameter: %s' %gs_lr_tfidf.best_params_)

Best Parameter: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7b323c2eb130>}


In [22]:
# Accuracy on training Set
print('CV ACCURACY: %.3f' %gs_lr_tfidf.best_score_)

CV ACCURACY: 0.881


In [23]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %3.f' %clf.score(X_test, Y_test))

Test Accuracy:   1


# Saving Model

In [24]:

import joblib

#save your model or results
joblib.dump(gs_lr_tfidf, 'model.pkl')



['model.pkl']

In [25]:
#load your model for further usage
joblib.load("model.pkl")