<a href="https://colab.research.google.com/github/anshupandey/Natural_language_Processing/blob/master/word2vec_using_gensim_%26_sentiment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import gensim
import re

# Dataset Preparation

In [2]:
## Downloading datasets
# Fetch the two TSV files that the later pd.read_csv cells read from the
# current working directory; -q silences wget's progress output.
!wget -q https://www.dropbox.com/s/0ygoimffauvl7x5/unlabeledTrainData.tsv
!wget -q https://www.dropbox.com/s/4f1s02mh6bfjcr5/labeledTrainData.tsv

In [3]:
# Load the unlabeled reviews used to train word2vec. quoting=3 is
# csv.QUOTE_NONE, so quote characters stay in the text as ordinary characters.
df = pd.read_csv(
    "unlabeledTrainData.tsv",
    sep="\t",
    quoting=3,
)
df.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [4]:
# (rows, columns) of the unlabeled review table.
df.shape

(50000, 2)

In [9]:
# Inspect one raw review (row label 2) before any cleaning.
df.loc[2, 'review']

'"Minor Spoilers<br /><br />In New York, Joan Barnard (Elvire Audrey) is informed that her husband, the archeologist Arthur Barnard (John Saxon), was mysteriously murdered in Italy while searching an Etruscan tomb. Joan decides to travel to Italy, in the company of her colleague, who offers his support. Once in Italy, she starts having visions relative to an ancient people and maggots, many maggots. After shootings and weird events, Joan realizes that her father is an international drug dealer, there are drugs hidden in the tomb and her colleague is a detective of the narcotic department. The story ends back in New York, when Joan and her colleague decide to get married with each other, in a very romantic end. Yesterday I had the displeasure of wasting my time watching this crap. The story is so absurd, mixing thriller, crime, supernatural and horror (and even a romantic end) in a non-sense way. The acting is the worst possible, highlighting the horrible performance of the beautiful El

In [7]:
# Same shape check as the earlier cell; nothing has modified df in between.
df.shape

(50000, 2)

In [8]:
def clean_data(doc):
  """Normalise a raw review into lowercase, space-separated alphabetic words.

  Args:
    doc: Raw review text; may contain HTML line breaks, digits and punctuation.

  Returns:
    The text with ``<br>`` tags removed (any letter case, with or without a
    trailing ``/``), every non-letter character treated as a word separator,
    everything lower-cased, and the words joined by single spaces.
  """
  # Match the complete line-break tag only. The previous bare "<br" prefix also
  # ate the start of text such as "<brilliant>" and left a stray "br" token for
  # upper-case "<BR>".
  doc = re.sub(r"<br\s*/?>", " ", doc, flags=re.IGNORECASE)
  # Anything that is not an ASCII letter becomes a separator.
  doc = re.sub("[^A-Za-z]", " ", doc)
  # split() without arguments collapses whitespace runs and trims every word.
  return " ".join(doc.lower().split())
# Sanity-check the cleaner on a single raw review.
clean_data(df['review'][2])

'minor spoilers in new york joan barnard elvire audrey is informed that her husband the archeologist arthur barnard john saxon was mysteriously murdered in italy while searching an etruscan tomb joan decides to travel to italy in the company of her colleague who offers his support once in italy she starts having visions relative to an ancient people and maggots many maggots after shootings and weird events joan realizes that her father is an international drug dealer there are drugs hidden in the tomb and her colleague is a detective of the narcotic department the story ends back in new york when joan and her colleague decide to get married with each other in a very romantic end yesterday i had the displeasure of wasting my time watching this crap the story is so absurd mixing thriller crime supernatural and horror and even a romantic end in a non sense way the acting is the worst possible highlighting the horrible performance of the beautiful elvire audrey john saxon just gives his na

In [10]:
# Replace every raw review by its cleaned form.
df['review'] = df['review'].map(clean_data)

# Building the word2vec model

In [12]:
# Tokenise each cleaned review into a list of words (one list per review) for
# Word2Vec. split() without arguments never produces empty-string tokens, so an
# empty review becomes [] instead of [''].
doc_words = [doc.split() for doc in df['review']]

In [13]:
# Tokens of the first review, to confirm the tokenisation looks right.
print(doc_words[0])

['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [14]:
# Number of tokenised reviews; one entry was appended per row of df['review'].
len(doc_words)

50000

In [16]:
# Train word embeddings on the tokenised reviews.
model = gensim.models.Word2Vec(
    sentences=doc_words,
    vector_size=60,   # length of each word vector
    window=5,         # maximum distance between the current and a context word
    min_count=10,     # ignore words that occur fewer than 10 times
    workers=8,        # number of training threads
)

In [17]:
# Shape of one word vector; its length equals the vector_size passed to Word2Vec.
model.wv['time'].shape

(60,)

In [None]:
# Indexing model.wv with a word that is not in the trained vocabulary raises
# KeyError, which would abort a "Run All". Check membership first and show a
# message instead of failing.
query = 'anshu'
model.wv[query] if query in model.wv.key_to_index else f"'{query}' is not in the word2vec vocabulary"

In [19]:
# Shape of the full embedding matrix: (words kept in the vocabulary, vector size).
model.wv.vectors.shape

(28322, 60)

In [20]:
# Words whose vectors are most similar to "amazing", with their similarity scores.
model.wv.most_similar("amazing")

[('incredible', 0.8907842636108398),
 ('awesome', 0.8620311617851257),
 ('exceptional', 0.798582911491394),
 ('outstanding', 0.7963023781776428),
 ('excellent', 0.7875266671180725),
 ('fantastic', 0.7633962035179138),
 ('astonishing', 0.7520110607147217),
 ('astounding', 0.7471282482147217),
 ('wonderful', 0.7079030871391296),
 ('superb', 0.7029137015342712)]

In [23]:
# Words whose vectors are most similar to "actress", with their similarity scores.
model.wv.most_similar("actress")

[('actor', 0.7604784965515137),
 ('role', 0.7248468995094299),
 ('performer', 0.701571524143219),
 ('performance', 0.7012739181518555),
 ('singer', 0.6594709753990173),
 ('dancer', 0.6127150058746338),
 ('ms', 0.5878314971923828),
 ('garbo', 0.5842373967170715),
 ('meryl', 0.5822567343711853),
 ('kinski', 0.5750608444213867)]

In [24]:
# Persist the trained word2vec model to the current working directory; it is
# loaded again in the modelling section.
model.save("imdb-vector.vec")

# Sentiment Analysis

In [25]:
# Load the labelled reviews for the classifier. This rebinds df, so the
# unlabeled table is no longer reachable under that name.
df = pd.read_csv(
    "labeledTrainData.tsv",
    sep="\t",
    quoting=3,
)
df.shape

(25000, 3)

In [26]:
# Preview the labelled data: id, sentiment label and raw review text.
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [27]:
# Clean the labelled reviews with the same function that prepared the word2vec
# corpus, so both share one vocabulary format.
df['review'] = df['review'].map(clean_data)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,with all this stuff going down at the moment w...
1,"""2381_9""",1,the classic war of the worlds by timothy hines...
2,"""7759_3""",0,the film starts with a manager nicholas bell g...
3,"""3630_4""",0,it must be assumed that those who praised this...
4,"""9495_8""",1,superbly trashy and wondrously unpretentious s...


In [28]:
# Inspect one cleaned labelled review (row label 2).
df.loc[2, 'review']

'the film starts with a manager nicholas bell giving welcome investors robert carradine to primal park a secret project mutating a primal animal using fossilized dna like jurassik park and some scientists resurrect one of nature s most fearsome predators the sabretooth tiger or smilodon scientific ambition turns deadly however and when the high voltage fence is opened the creature escape and begins savagely stalking its prey the human visitors tourists and scientific meanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large pre historical animals which are deadlier and bigger in addition a security agent stacy haiduk and her mate brian wimmer fight hardly against the carnivorous smilodons the sabretooths themselves of course are the real star stars and they are astounding terrifyingly though not convincing the giant animals savagely are stalking its prey and the group run afoul and fight against one nature s most fearsome predator

## Split the dataset into train and test

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled reviews for testing; the fixed random_state
# makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=5,
)

#### Text preprocessing with TensorFlow

In [30]:
# tokenizing each review
from tensorflow.keras.preprocessing import text

# Create the tokenizer (num_words=15000 caps the vocabulary used when encoding)
# and learn the word index from the training reviews only.
train_texts = list(xtrain)
tok = text.Tokenizer(num_words=15000)
tok.fit_on_texts(train_texts)


In [31]:
# Encode every train and test review as the list of integer ids of its words.
train_texts, test_texts = xtrain.tolist(), xtest.tolist()
xtrain = tok.texts_to_sequences(train_texts)
xtest = tok.texts_to_sequences(test_texts)


In [32]:
# Token counts of the first five encoded training reviews.
for n_tokens in map(len, xtrain[:5]):
  print(n_tokens)

201
90
116
418
205


In [None]:
# Average encoded review length; used to judge the padding length chosen next.
np.mean(list(map(len, xtrain)))

229.2208

In [33]:
doc_length = 300
from tensorflow.keras.preprocessing import sequence

# Bring every review to exactly doc_length ids: shorter ones are padded at the
# end ('post'), longer ones are cut down to maxlen.
pad_options = dict(maxlen=doc_length, padding='post')
xtrain = sequence.pad_sequences(xtrain, **pad_options)
xtest = sequence.pad_sequences(xtest, **pad_options)

## Modelling part

In [34]:
# Reload the embeddings written earlier by model.save("imdb-vector.vec").
# Use the same relative path as the save call so the notebook also runs when
# the working directory is not Colab's /content.
word2vec = gensim.models.Word2Vec.load("imdb-vector.vec")

# embedding length for each word
vector_length = word2vec.wv.vector_size
vector_length

60

In [None]:
# word_index maps words to their integer ids. Preview the first few entries
# only; displaying the whole dictionary would flood the notebook output.
dict(list(tok.word_index.items())[:10])

In [None]:
# Creating a weight matrix for the words in our current local dictionary; the
# weights are copied from the pretrained word2vec model.

# Row i holds the vector of the word whose tokenizer id is i. Rows with no
# matching word2vec entry stay all-zero.
vocab_size = tok.num_words + 1  # 15001 rows, the size the Embedding layer expects
weight_matrix = np.zeros((vocab_size, vector_length))

# Visit words in increasing id order so the loop can stop at the vocabulary cap.
for word, i in sorted(tok.word_index.items(), key=lambda item: item[1]):
  if i > tok.num_words:
    break
  if word in word2vec.wv.key_to_index:
    weight_matrix[i] = word2vec.wv[word]

In [37]:
from tensorflow.keras import models,layers

In [38]:
# modelling
# Take the layer sizes from the objects they must agree with (the pretrained
# weight matrix and the padding length) instead of repeating 15001 / 60 / 300.
vocab_size, embed_dim = weight_matrix.shape

# NOTE: this rebinds `model`, which held the gensim Word2Vec model earlier in
# the notebook; from here on `model` is the Keras classifier.
model = models.Sequential()
# Embedding layer initialised with the word2vec vectors and frozen
# (trainable=False), so the pretrained vectors are not updated during training.
model.add(layers.Embedding(vocab_size, embed_dim, input_length=doc_length,
                           weights=[weight_matrix], trainable=False))

# Flatten the (doc_length, embed_dim) output into one vector per review.
model.add(layers.Flatten())
model.add(layers.Dense(500, activation='relu'))
model.add(layers.Dense(250, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
# Single sigmoid unit: a value in (0, 1) for the binary sentiment label.
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [39]:
# Train for 10 epochs in batches of 100; the held-out test split doubles as
# the validation set reported after each epoch.
model.fit(
    xtrain,
    ytrain,
    epochs=10,
    batch_size=100,
    validation_data=(xtest, ytest),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7de250ee0c70>

In [40]:
# Predict for the first test review; slicing with [:1] keeps the leading batch
# axis, giving the same (1, 300) input as reshaping row 0.
model.predict(xtest[:1])



array([[0.99960047]], dtype=float32)