# Word2vec model development

In [None]:
import numpy as np
import pandas as pd
import gensim
import re

In [None]:
## Downloading datasets
!wget -q https://www.dropbox.com/s/0ygoimffauvl7x5/unlabeledTrainData.tsv
!wget -q https://www.dropbox.com/s/4f1s02mh6bfjcr5/labeledTrainData.tsv

In [None]:
# Load the unlabeled reviews (tab-separated). quoting=3 is csv.QUOTE_NONE, so the
# double quotes wrapping each field are kept as literal characters in the values.
df = pd.read_csv("unlabeledTrainData.tsv",sep='\t',quoting=3)
df.shape

(50000, 2)

In [None]:
df.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [None]:
df['review'][0]

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

In [None]:
def clean_data(doc):
  """Normalise a raw review into lowercase alphabetic words joined by single spaces.

  Args:
    doc: review text as a string.

  Returns:
    The text with every "<br" occurrence and every non A-Z/a-z character
    turned into a separator, lower-cased, with words joined by one space.
  """
  without_breaks = re.sub("<br"," ",doc)  # "<br" openers become a space
  letters_only = re.sub("[^A-Za-z]"," ",without_breaks)  # everything but letters becomes a space
  # split() with no argument discards leading/trailing/repeated whitespace,
  # so the pieces are already stripped when they are re-joined
  words = letters_only.lower().split()
  return " ".join(words)

In [None]:
df.review= df.review.apply(clean_data)
df.review[0]

In [None]:
df.head()

## word2vec model

In [None]:
corp = df.review.tolist()
print(len(corp))

50000


In [None]:
# One token list per cleaned review; reviews are split on single spaces
data = [review.split(" ") for review in corp]
print(len(data))

50000


In [None]:
print(data[0])

In [None]:
# Train word vectors on the tokenised reviews: 100-dimensional vectors, 8 worker
# threads, and words occurring fewer than 12 times are left out of the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (this notebook reports 3.6.0);
# gensim 4.x renamed it to `vector_size`.
# NOTE(review): with several worker threads the learned vectors may differ
# slightly from run to run — confirm if exact reproducibility matters.
# NOTE: the name `model` is rebound to the Keras classifier further down, so the
# `model.wv[...]` cells below only work while this Word2Vec object is still bound.
model = gensim.models.Word2Vec(sentences=data,size=100,workers=8,min_count=12)

In [None]:
model.wv.vectors.shape

(25706, 100)

In [None]:
model.wv['king']

AttributeError: ignored

In [None]:
model.wv['king'].shape

(100,)

In [None]:
# Out-of-vocabulary demonstration: looking up a word the model never saw raises
# KeyError. Catch and print it so the point is still visible without aborting a
# Restart & Run All of the notebook.
try:
  model.wv['seohghoeshgre']
except KeyError as err:
  print(f"KeyError: {err}")

In [None]:
model.wv.most_similar("awesome")

[('amazing', 0.8112810850143433),
 ('incredible', 0.7490220069885254),
 ('excellent', 0.7278615236282349),
 ('fantastic', 0.6910278797149658),
 ('exceptional', 0.6683927774429321),
 ('great', 0.6500455737113953),
 ('cool', 0.6491718292236328),
 ('outstanding', 0.6489181518554688),
 ('alright', 0.6269726753234863),
 ('awful', 0.62667316198349)]

In [None]:
model.save("imdb_dataset.vec")

In [None]:
gensim.__version__

'3.6.0'

## Text classification

In [None]:
# Load the labeled reviews (id, sentiment, review). The path is relative, like the
# unlabeled file above, because wget saves both files into the working directory;
# the former "/content/" prefix only resolved on Colab.
# quoting=3 is csv.QUOTE_NONE: quote characters stay in the field values.
# NOTE: this rebinds `df`, replacing the unlabeled frame used for word2vec.
df = pd.read_csv("labeledTrainData.tsv",sep='\t',quoting=3)
df.shape

(25000, 3)

In [None]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [None]:
df.review = df.review.apply(clean_data)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,with all this stuff going down at the moment w...
1,"""2381_9""",1,the classic war of the worlds by timothy hines...
2,"""7759_3""",0,the film starts with a manager nicholas bell g...
3,"""3630_4""",0,it must be assumed that those who praised this...
4,"""9495_8""",1,superbly trashy and wondrously unpretentious s...


In [None]:
# split data into train and test sets: 20% held out, random_state fixed so the
# split is the same on every run
from sklearn.model_selection import train_test_split
xtr,xts,ytr,yts = train_test_split(df.review,df.sentiment,test_size=0.2,random_state=5)
print(xtr.shape,xts.shape,ytr.shape,yts.shape)


(20000,) (5000,) (20000,) (5000,)


### text data preparation

In [None]:
from tensorflow.keras.preprocessing import text
# create a tokenization object; num_words limits which word ids
# texts_to_sequences will emit (Keras keeps the num_words-1 most frequent words)
tok = text.Tokenizer(num_words=20000)
# build the word -> integer id vocabulary from the training reviews only
tok.fit_on_texts(xtr.tolist())

In [None]:
print(tok.word_index)



In [None]:
# Word counts of the first five training reviews.
# xtr is a pandas Series whose index still carries the original row labels in
# shuffled order after train_test_split, so xtr[i] would look up *label* i
# (a different review, or a KeyError if that row went to the test split).
# .iloc selects by position instead.
for review in xtr.iloc[:5]:
  print(len(review.split(' ')))

437
161
375
384
379


In [None]:
# text to sequence conversion: every review becomes a list of integer word ids
# taken from the fitted tokenizer.
# NOTE: xtr/xts are rebound here from pandas Series to plain Python lists, so
# cells above that call Series methods on them cannot be re-run after this one.
xtr = tok.texts_to_sequences(xtr.tolist())
xts = tok.texts_to_sequences(xts.tolist())

In [None]:
print(xtr[0])

[7, 12, 247, 5, 347, 10, 17, 8, 447, 749, 281, 480, 64, 71, 126, 1, 947, 113, 224, 2, 615, 25, 30, 85, 927, 15, 1, 15860, 7, 12, 3, 720, 1070, 62, 4, 116, 2, 396, 3041, 15, 1, 19, 3665, 39, 12, 906, 4, 2432, 3066, 2, 1781, 4082, 39, 25, 48, 17700, 15076, 2, 14, 46, 76, 7069, 22, 1, 7886, 1, 275, 1456, 1813, 45, 29, 6, 10216, 16, 1, 3387, 1490, 2311, 2568, 187, 9, 547, 1, 275, 14, 110, 52, 720, 670, 5, 1, 190, 11, 243, 72, 6706, 1, 3387, 1490, 12, 1441, 8048, 663, 26, 1255, 5, 1, 2027, 202, 1042, 1, 3519, 38, 1803, 306, 8048, 462, 14409, 143, 26, 3491, 729, 7, 164, 165, 282, 41, 23, 8, 100, 413, 9, 2628, 1, 190, 11, 8048, 16, 3871, 12777, 3492, 2, 42, 15861, 1, 78, 477, 29, 33, 29, 8, 26, 18802, 2691, 15, 1326, 61, 164, 1896, 8, 3041, 199, 15, 306, 2, 1, 1319, 8, 100, 413, 10, 19, 6, 54, 73, 287, 147, 45, 20, 151, 31, 30, 3493, 5, 1, 497, 34, 313, 775, 83, 472, 3666, 947, 2, 1129, 359, 605, 1, 2049, 93, 10, 3, 347, 41, 127, 19]


206
90
118
425
207


In [None]:
# resize each document to a same size (250 ids): shorter reviews are padded
# at the end (padding='post'); longer ones are shortened with pad_sequences'
# default truncating='pre', i.e. their leading tokens are dropped.
from tensorflow.keras.preprocessing import sequence
xtr = sequence.pad_sequences(xtr,maxlen=250,padding='post')
xts = sequence.pad_sequences(xts,maxlen=250,padding='post')

In [None]:
for i in xtr[:5]:
  print(len(i))

250
250
250
250
250


In [None]:
# Reload the word2vec model saved earlier. The path is relative, matching the
# model.save("imdb_dataset.vec") call, so it resolves in any working directory
# rather than only under Colab's /content.
wvmodel = gensim.models.Word2Vec.load("imdb_dataset.vec")

In [None]:
wvmodel.wv.vectors.shape

(25706, 100)

In [None]:
# Embedding matrix indexed by tokenizer word id: row i holds the word2vec vector
# of the word with id i. Rows stay all-zero for ids with no word2vec vector and
# for row 0, which no word uses (word_index ids start at 1).
# The (20001, 100) shape has to match the Embedding layer's input_dim/output_dim.
custom_wv_embedding = np.zeros((20001,100))

# Walk the vocabulary in ascending id order so the loop can stop at the first
# id beyond the matrix.
for word,i in sorted(tok.word_index.items(),key=lambda x:x[1]):
  if i>20000:
    break
  # Membership is tested on the KeyedVectors object itself: this works on
  # gensim 3.x and 4.x, whereas the `.vocab` attribute was removed in gensim 4.
  if word in wvmodel.wv:
    custom_wv_embedding[i] = wvmodel.wv[word]

In [None]:
custom_wv_embedding.shape

(20001, 100)

In [None]:
wvmodel.wv['king']

In [None]:
xtr.shape

(20000, 250)

## Modelling

In [None]:
from tensorflow.keras import models,layers

In [None]:
# each sample is a sequence of 250 integer word ids (the padded review length)
input_layer = layers.Input(shape=(250,))
# adding an embedding layer specifying total unique input words as input dim, size of vector as output dim;
# it is initialised from the word2vec matrix and frozen (trainable=False)
em_layer = layers.Embedding(input_dim=20001,output_dim=100,weights=[custom_wv_embedding],trainable=False)(input_layer)
# flatten the (250, 100) embedding output into a single 25000-long vector per review
ft = layers.Flatten()(em_layer)
# three fully connected ReLU layers of decreasing width
h1 = layers.Dense(400,activation='relu')(ft)
h2 = layers.Dense(200,activation='relu')(h1)
h3 = layers.Dense(80,activation='relu')(h2)
# single sigmoid unit producing a 0..1 score for the binary sentiment label
op = layers.Dense(1,activation='sigmoid')(h3)
# NOTE: this rebinds `model`, which held the gensim Word2Vec model earlier in the notebook
model = models.Model(inputs=input_layer,outputs=op)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 250)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 250, 100)          2000100   
_________________________________________________________________
flatten_2 (Flatten)          (None, 25000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 400)               10000400  
_________________________________________________________________
dense_5 (Dense)              (None, 200)               80200     
_________________________________________________________________
dense_6 (Dense)              (None, 80)                16080     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 81  

In [None]:
# binary cross-entropy pairs with the single sigmoid output; accuracy is reported each epoch
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# NOTE(review): validation_data is the held-out test split (xts, yts), so the
# val_* figures are test-set metrics, not a separate validation set.
model.fit(xtr,ytr,epochs=10,batch_size=100,validation_data=(xts,yts))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb2031c8ad0>

In [None]:
x = [8,7,5,6,2,6,3,2,5,4,3]
sorted(x)

[2, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8]

In [None]:
y = ['hello','hi','anshu','boy','cat','zebra']
sorted(y)

['anshu', 'boy', 'cat', 'hello', 'hi', 'zebra']

In [None]:
y = [('hello',6),('hi',9),('anshu',3),('boy',2),('cat',7),('zebra',4)]
sorted(y)

[('anshu', 3), ('boy', 2), ('cat', 7), ('hello', 6), ('hi', 9), ('zebra', 4)]

In [None]:
# key=lambda x:x[1] orders the tuples by their second element (the number); the
# same key is used above to walk tok.word_index in ascending id order
sorted(y,key=lambda x:x[1])

[('boy', 2), ('anshu', 3), ('zebra', 4), ('hello', 6), ('cat', 7), ('hi', 9)]