In [1]:
import pandas as pd
# Load the IMDB reviews CSV from the working directory; later cells read its
# `review` and `sentiment` columns.
df = pd.read_csv('IMDB Dataset.csv')

In [2]:
# Peek at the first ten rows. NOTE: a cell only renders its final expression,
# so this preview is evaluated but not displayed.
df.head(10)

# Full text of the review stored under index label 1.
df.loc[1, 'review']

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [3]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short documents that share most of their vocabulary.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and build the document-term count matrix in one step.
count = CountVectorizer()
bag = count.fit_transform(docs)

# Mapping from each token to its column index in `bag`.
print(count.vocabulary_)


{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [4]:
# Dense view of the count matrix: one row per document, one column per
# vocabulary index printed by the previous cell.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


# Task 3: Term Frequency and Inverse Document Frequency
## Raw term frequencies alone carry little discriminative information: words that occur in many documents get high counts without helping to tell documents apart
## tf-idf (t,d) = tf(t,d) X idf(t,d)
## idf(t,d) = log(nd / (1 + df(d,t)))
## where nd = total number of documents and df(d,t) = number of documents that contain the term t

In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weights raw term counts into tf-idf scores.
tfidf = TfidfTransformer(
    use_idf=True,     # apply inverse-document-frequency weighting
    norm='l2',        # scale every document vector to unit Euclidean length
    smooth_idf=True,  # add one to document frequencies to avoid zero divisions
)

In [6]:
# Count the terms of the toy corpus, then convert the counts to tf-idf weights.
docs_counts = count.fit_transform(docs)
docs_tfidf = tfidf.fit_transform(docs_counts)
print(docs_tfidf.toarray())

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


In [7]:
# Print NumPy floats with two digits of precision from here on.
np.set_printoptions(precision=2)

In [18]:
# Same tf-idf matrix as before, recomputed so it prints with the current
# NumPy print options.
docs_counts = count.fit_transform(docs)
docs_tfidf = tfidf.fit_transform(docs_counts)
print(docs_tfidf.toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


# Task 6: Vectorizing the Dataset

In [8]:
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance used by the tokenizer below.
porter = PorterStemmer()


def stemmer_tokenize(text):
    """Split ``text`` on whitespace and Porter-stem every resulting token.

    Args:
        text: The string to tokenize.

    Returns:
        A list holding one stemmed token per whitespace-separated word, in
        the order the words appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems


# Quick check of the tokenizer on a short sentence.
stemmer_tokenize('coders like coding and thus they code')

['coder', 'like', 'code', 'and', 'thu', 'they', 'code']

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target labels taken from the `sentiment` column, one per review.
Y = df['sentiment'].values

# tf-idf features built with the stemming tokenizer; the vectorizer itself
# neither lowercases the text nor strips accents.
tfidf = TfidfVectorizer(
    tokenizer=stemmer_tokenize,
    lowercase=False,
    strip_accents=None,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)

# Learn the vocabulary from every review and build the feature matrix.
X = tfidf.fit_transform(df['review'])

# Task 7: Document classification using Logistic Regression 

In [31]:
from sklearn.model_selection import train_test_split

# Hold out half of the data for evaluation. With shuffle=False the split is a
# plain first-half / second-half cut, so random_state has no effect here; it is
# kept only so the call stays as originally written.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=1, test_size=0.5, shuffle=False)

import pickle
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression whose regularisation strength is selected by 5-fold
# cross-validation on accuracy. n_jobs=-3 leaves two CPU cores free.
clf = LogisticRegressionCV(cv = 5,
                          scoring = 'accuracy',
                          random_state = 0,
                          n_jobs = -3,
                          verbose = 3,
                          max_iter = 300).fit(X_train,Y_train)

# Persist the trained classifier with pickle. The context manager guarantees
# the file is flushed and closed even if pickling raises, instead of relying on
# a separate cell to close it. The handle keeps the name `saved_model`, so a
# later `saved_model.close()` is still valid (closing twice is a no-op).
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)


[Parallel(n_jobs=-3)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-3)]: Done   5 out of   5 | elapsed: 11.7min finished


In [33]:
# Make sure the handle used for the pickle dump is closed so the file is fully
# written to disk; calling close() on an already-closed file is a no-op.
saved_model.close()

# Task 8: Model Evaluation

In [34]:
filename = 'saved_model.sav'

# Reload the pickled classifier. The context manager closes the file handle as
# soon as loading finishes instead of leaking it.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# were produced by this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

# Mean accuracy of the reloaded model on the held-out half of the data.
saved_clf.score(X_test, Y_test)

0.88984