In [45]:
## Downloading Files

In [46]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-02-24 07:53:34--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.77.38
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.77.38|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [47]:
!gunzip GoogleNews-vectors-negative300.bin.gz

gzip: GoogleNews-vectors-negative300.bin already exists; do you wish to overwrite (y or n)? n
	not overwritten


## Importing the Libraries



In [48]:
import pandas as pd 
import numpy as np 

from gensim.models import KeyedVectors,Word2Vec
from sklearn import model_selection 
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

import nltk 
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")
from nltk.tokenize import word_tokenize 
import string
import re 


import pickle 
import joblib

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [49]:
## Loading the data file

In [50]:
df = pd.read_csv('imdb.csv')
df.head()
print(df.shape)

(50000, 2)


In [51]:
## Loading word to vec model
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

In [63]:
# def preprocessing_corpus(texts):
#   mystopwords = set(stopwords.words("english"))
#   def tokenize(text):
#     text = text.replace('</br>','')
#     return word_tokenize(text)

#   def clean_and_tokenize(tokens):
#     return [token.lower() for token in tokens if token not in mystopwords if token not in string.punctuation and not token.isdigit()]

#   return [clean_and_tokenize(tokenize(text)) for text in texts]
  

In [67]:
# def embedding_feats(list_of_lists):
#   feats = []
#   DIMENSION = 300

#   for tokens in list_of_lists:
#     feat_for_this = np.zeros(DIMENSION)
#     count_for_this = 0
#     for token in tokens:
#       if token in word2vec_model:
#         feat_for_this += word2vec_model[token]
#         count_for_this +=1
#     feats.append(feat_for_this/count_for_this)
  
#   return feats 


In [54]:
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


In [55]:
X = df.review.values
y = df.sentiment.map({'positive':1,'negative':0}).values

In [56]:
from sklearn.model_selection import train_test_split 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y)

In [91]:
class CreateEmbeddings():
  def embedding_feats(list_of_lists):
    feats = []
    DIMENSION = 300

    for tokens in list_of_lists:
      feat_for_this = np.zeros(DIMENSION)
      count_for_this = 0
      for token in tokens:
        if token in word2vec_model:
          feat_for_this += word2vec_model[token]
          count_for_this +=1
      feats.append(feat_for_this/count_for_this)
    
    return feats 
  def __init__(self):
    print('Called')
  
  def fit(self,x,y=None):
    return self


  def transform(self,X,y=None):
    return CreateEmbeddings.embedding_feats(X)

In [89]:
class CleaningData():
  def preprocessing_corpus(texts):
    mystopwords = set(stopwords.words("english"))
    def tokenize(text):
      text = text.replace('</br>','')
      return word_tokenize(text)

    def clean_and_tokenize(tokens):
      return [token.lower() for token in tokens if token not in mystopwords if token not in string.punctuation and not token.isdigit()]

    return [clean_and_tokenize(tokenize(text)) for text in texts]

  def __init__(self):
    print('Called')

  def fit(self,x,y=None):
    return self

  def transform(self,X,y=None):
    return CleaningData.preprocessing_corpus(X)

In [92]:
logreg = LogisticRegression(max_iter=100000)

pipe = Pipeline([('cleaning_data',CleaningData()),('create_embeddings',CreateEmbeddings()),('logreg',LogisticRegression())])
pipe.fit(X_train,y_train)

Called
Called


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('cleaning_data',
                 <__main__.CleaningData object at 0x7f5edc418f90>),
                ('create_embeddings',
                 <__main__.CreateEmbeddings object at 0x7f5edc41a110>),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [93]:
pred = pipe.predict(X_test)

In [94]:
accuracy = metrics.accuracy_score(y_test,pred)
print(accuracy)

0.85


In [95]:
pred_test = pipe.predict(np.array(['This movies was so bad']))
print(pred_test)

[0]


In [96]:
pickle_out = open("model_pipeline.pkl","wb")
pickle.dump(pipe,pickle_out)
pickle_out.close()

In [97]:

load_model = pickle.load(open("model_pipeline.pkl","rb"))

In [98]:
pred_test = load_model.predict(np.array(['This movies was so bad']))

In [99]:
pred_test

array([0])

In [100]:
pickle_out = open("wtovmodel.pkl","wb")
pickle.dump(word2vec_model,pickle_out)
pickle_out.close()