In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import time

In [3]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  
import gensim

pd.set_option("display.max_colwidth", 200) 
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%matplotlib inline

In [4]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')
X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [5]:
X_train = X_train.iloc[:,0]
X_test = X_test.iloc[:,0]
y_train = y_train.iloc[:,0]
y_test = y_test.iloc[:,0]
X_val = X_val.iloc[:,0]
y_val = y_val.iloc[:,0]
X_train

0        worse tea fresh green tea indeed green tea time produced poor storing condition sealed enviroment exposed extensively air make tea lost flavor dont know old tea experience least 35 years old loose...
1        ive tasted best real things wonderful dreadfully high fat carbs calories includes dressings dips sauces yes ive added goes salad oil fat content calories soar expect fatfree substitute anything cl...
2        love taste good ginger snap cookie saw offered jumped thinking easy review would wellas taste great true ginger snap flavor opened bag smell made mouth water cookie extremely hard really hard time...
3        absolutely love coachs oatmeal hated oatmeal till tasted coach oatmeal first 2 bags bought costconow costco doesnt carry anymoreso went line find coach oatmeal amazon carry happy found themi oatme...
4        normally dont go instant coffees delicious instructions suggest 4 teaspoons 68oz hot water used 5 teaspoons 12oz water dollop silk soy creamer made delicio

In [6]:
df = pd.DataFrame(data=[X_train, y_train], index=["text", "label"]).T
df = df.append(pd.DataFrame(data=[X_val, y_val], index=["text", "label"]).T)
df = df.append(pd.DataFrame(data=[X_test, y_test], index=["text", "label"]).T)

In [7]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(X_train)
xvalid_count =  count_vect.transform(X_val)
xtest_count =  count_vect.transform(X_test)

In [8]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
tfidf_vect.fit(df['text'])
xtrain_tfidf =  tfidf_vect.transform(X_train)
xvalid_tfidf =  tfidf_vect.transform(X_val)
xtest_tfidf =  tfidf_vect.transform(X_test)


In [9]:
tokenized_reviews = df['text'].apply(lambda x: x.split()) # tokenizing 

In [10]:
%%time

model_w2v = gensim.models.Word2Vec(
            tokenized_reviews,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 32, # no.of cores
            seed = 34
) 

model_w2v.train(tokenized_reviews, total_examples= len(df['text']), epochs=20)

CPU times: user 1h 9min 22s, sys: 10.9 s, total: 1h 9min 33s
Wall time: 35min 47s


In [11]:
model_w2v.save("word2vec.model")

In [16]:
model_w2v = gensim.models.Word2Vec.load("word2vec.model")

In [18]:
from pydrive.auth import GoogleAuth
from google.colab import auth

# Authenticate and create the PyDrive client.
auth.authenticate_user()

!gupload --to '1wO0ghnLExOsVlXfT24yeau5m28cWMMZo' word2vec.model


Uploading file: word2vec.model


In [26]:
model_w2v.wv.most_similar(positive="tea")

[('teas', 0.7233192324638367),
 ('teabr', 0.6031690835952759),
 ('chai', 0.5959882736206055),
 ('higgins', 0.589557409286499),
 ('burke', 0.5802149176597595),
 ('hrefhttpwwwamazoncomgpproductb0012bzghsstash', 0.5711391568183899),
 ('blackberryvanilla', 0.5694466233253479),
 ('hrefhttpwwwamazoncomgpproductb000f4h5qitwinings', 0.5682802200317383),
 ('rejuvination', 0.5576297044754028),
 ('herbal', 0.5546238422393799)]

In [27]:
model_w2v.wv.most_similar(positive="healthy")

[('healthier', 0.6138670444488525),
 ('proteinpacked', 0.4948444366455078),
 ('funtoeat', 0.4921887218952179),
 ('nutritious', 0.4917111396789551),
 ('cerealm', 0.48545607924461365),
 ('nonfruit', 0.47774869203567505),
 ('productvery', 0.47549813985824585),
 ('impressivelow', 0.4748710095882416),
 ('highsugar', 0.47379225492477417),
 ('onetoone', 0.47376060485839844)]

In [30]:
len(model_w2v['food']) #The length of the vector is 200

200