# colab setup


In [1]:
# Define Constants and imports
import os
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
from string import punctuation
import src
from importlib import reload
import traintracker

reload(src)

# Dataset locations: the raw downloads and the directory that receives the cleaned text files.
raw_dataset_dir = "../dataset/raw"
prep_dataset_dir = "../dataset/preprocessed"

# Directory the skip-gram checkpoints are read from / handed to the training helper.
weights_dir="../model_weights/skipgram"
# NOTE(review): same value as train_data_dir below and not referenced by any cell
# visible in this part of the notebook -- confirm whether both names are needed.
train_tracker_path="../train_tracker"

# Passed to src.skipgram_train as its train_data_dir argument.
train_data_dir="../train_tracker"


# Introduction
In this notebook, I'll implement the [Word2Vec algorithm](https://en.wikipedia.org/wiki/Word2vec) using the skip-gram architecture as a first step towards sentiment analysis: first create the word embeddings, then use these embeddings for sentiment analysis.

# Data Loading
- load and view the reviews dataset

In [2]:
os.listdir(raw_dataset_dir)

['labels.txt', 'reviews.txt', 'text8']

In [3]:
reviews_df=pd.read_csv(os.path.join(raw_dataset_dir,'movie.csv'))
reviews_df.info()

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/raw\\movie.csv'

In [10]:
reviews_df.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,20019
1,19981


In [11]:
reviews_df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [12]:
# First row of the frame: column 0 is the review text, column 1 its label.
# Address the cell with one (row, column) pair; the chained form `.iloc[0][0]`
# falls back to positional `Series[int]` lookup, which pandas has deprecated.
print(reviews_df.iloc[0, 0])
print(reviews_df.iloc[0, 1])

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.'

0

In [13]:
# Flatten the reviews into newline-separated text: one review (and one label) per line.
# The two columns are walked directly instead of calling `reviews_df.iloc[i][j]` per row,
# which builds a temporary Series for every lookup and relies on the deprecated
# positional `Series[int]` access.
reviews_list=[]
labels_list=[]
size=len(reviews_df)
for review_text, label in zip(reviews_df.iloc[:, 0], reviews_df.iloc[:, 1]):
  if reviews_list:
    # the separator goes between entries only, so the text has no trailing newline
    reviews_list.append('\n')
    labels_list.append('\n')
  reviews_list.append(review_text)
  labels_list.append("positive" if label==1 else "negative")

# one entry per row plus one separator between each pair of neighbours
assert len(reviews_list)==len(labels_list)==max(2*size-1,0)


In [14]:
raw_reviews_txt="".join(reviews_list)
raw_labels_txt="".join(labels_list)
print(len(raw_reviews_txt.split('\n')))

40000


In [15]:
# persist the newline-separated labels in the preprocessed dataset directory
labels_path = os.path.join(prep_dataset_dir, "labels.txt")
with open(labels_path, 'w') as labels_file:
    labels_file.write(raw_labels_txt)

- reviews are line-separated, and so are the labels

## view word frequencies
- show word frequencies before any preprocessing

In [16]:
words_counter = Counter(raw_reviews_txt.split())


In [17]:
# one row per distinct token, ordered from most to least frequent
words_df = (
    pd.DataFrame(words_counter.items(), columns=['word', 'count'])
    .sort_values(by=['count'], ascending=False)
)
print(f"no of unique words={len(words_df)}\nno of words ={words_df['count'].sum()}")

no of unique words=381542
no of words =9253570


In [18]:
# get 50 from top , mid and end
src.data_preprocessing.get_head_mid_tail(words_df,50)

Unnamed: 0,word,count
8,the,455003
51,a,246023
6,and,242041
53,of,226965
27,to,209659
...,...,...
198462,"policewoman's,",1
198464,Palusky,1
198465,Kayle,1
198466,"Timler,",1


In [19]:
words_df.describe()

Unnamed: 0,count
count,381542.0
mean,24.253084
std,1240.617558
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,455003.0


- high standard deviation due to very frequent words like 'the', and because punctuation attached to a word makes it count as a different word
- 75% of the words have a count of 3 or less
- at least 50% of the words appear only once; these may be misspelled words

In [20]:
# Median of the numeric column only: without numeric_only=True the string 'word'
# column triggers a warning on older pandas and a TypeError on pandas 2.x.
words_df.median(numeric_only=True)

  words_df.median()


count    1.0
dtype: float64

- words with count = 1 run from the median to the end of the sorted list, due to the existence of punctuation and because the same words appear in both lower and upper case

# Data preprocessing
- remove punctuation, set all chars to lowercase
- remove noise words that appear only once, which might be misspellings
- remove highly frequent words that don't add anything to the neighboring words
- save the preprocessed data

**probability of word removal**
- Words that show up often such as "the", "of", and "for" don't provide much context to the nearby words. If we discard some of them, we can remove some of the noise from our data and in return get faster training and better representations. 
- This process is called subsampling by Mikolov. For each word $w_i$ in the training set,
- we'll discard it with probability given by 
- ![p_remove1.png](./assets/p_remove1.png)![p_remove2.png](./assets/p_remove2.png)
- p--> probability of discarding a word
- t--> threshold on the word count: the probability of discarding starts to increase once the frequency passes `T/t`
- f--> frequency of a word
- T--> total number of words
- `set t=100 and T=500000=5e5`: the probability of removal starts to be greater than zero when f=5000
- ![p_remove_curve.png](./assets/p_remove_curve.png)

## remove invalid chars

In [21]:
prep_txt = src.remove_punctuations(raw_reviews_txt)


In [22]:
words_counter = Counter(prep_txt.split())

words_df = pd.DataFrame(words_counter.items(), columns=['word', 'count'])
words_df.sort_values(by=['count'], ascending=False, inplace=True)
print(f"no of unique words={len(words_df)}\nno of words ={words_df['count'].sum()}")

no of unique words=160310
no of words =9217166


no of unique words decreased from `381542`  to `160310` because we removed punctuation characters so word like `The. ` and `the?` will be `the` and will be counted as one word

In [23]:
def cnt_ranges(df,step,col_name):
  """Count how many values of ``df[col_name]`` fall into consecutive buckets of width ``step``.

  Buckets start at 0 and are upper-inclusive: the first one collects every value
  ``<= step`` (including anything at or below 0), the next one the values in
  ``(step, 2*step]`` and so on.  Buckets that receive no value are left out.

  Args:
    df: DataFrame holding the column to bucket; it is not modified.
    step: positive bucket width.
    col_name: name of the numeric column to bucket.

  Returns:
    A list of ``[label, count]`` pairs in ascending bucket order, where
    ``label`` is the string ``"<lower> to <upper>"``.

  Raises:
    ValueError: if ``step`` is not positive (the buckets would never advance).
  """
  if step<=0:
    raise ValueError(f"step must be positive, got {step!r}")

  # Missing values can never satisfy `value <= upper`, so they are dropped up front
  # instead of stalling the scan.  Working on a plain sorted list also avoids one
  # `DataFrame.iloc` call per row.
  values=df[col_name].dropna().sort_values().tolist()
  total=len(values)

  ranges_list=[]
  lower,upper=0,step
  pos=0
  while pos<total:
    start=pos
    # consume every value that belongs to the current bucket
    while pos<total and values[pos]<=upper:
      pos+=1
    if pos>start:
      ranges_list.append([f"{lower} to {upper}",pos-start])
    lower+=step
    upper+=step
  return ranges_list


In [24]:
r=cnt_ranges(words_df,20,'count')

In [25]:
pd.DataFrame(r,columns=["range",'cnt'])

Unnamed: 0,range,cnt
0,0 to 20,142824
1,20 to 40,6260
2,40 to 60,2771
3,60 to 80,1526
4,80 to 100,1059
...,...,...
397,213640 to 213660,1
398,230740 to 230760,1
399,256820 to 256840,1
400,256900 to 256920,1


In [26]:
src.get_head_mid_tail(words_df,100)

Unnamed: 0,word,count
8,the,531191
6,and,256916
47,a,256838
49,of,230759
23,to,213654
...,...,...
90777,pointyfinger,1
90776,handymancarpenter,1
90775,paragonbr,1
90774,signorelliscreenplay,1


## remove words by appearance pos/neg ratio

In [None]:
prep_txt, words_pos_neg_ratio, removed_words = src.remove_common_words(raw_reviews_txt, labels=raw_labels_txt,
                                                                           threshold=0.2, min_freq=5)
# Range of the pos/neg ratios.  min/max are taken from the values themselves:
# seeding a running minimum with 0 and a running maximum with -1 reports the seed
# instead of the data whenever every ratio is positive (or every ratio is below -1).
ratio_values=words_pos_neg_ratio.values()
min_val=min(ratio_values,default=None)
max_val=max(ratio_values,default=None)
print(f"min_val {min_val}")
print(f"max_val {max_val}")

In [None]:
raw_reviews_txt[:1000]

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.\nWhen I put this movie in my DVD player, and sat down with a coke and some chips, I had some expectations. I was hoping that

In [None]:
prep_txt[:1000]

'i grew up b 1965 watching and loving the thunderbirds all my mates at school watched we played thunderbirds before school during lunch and after school we all wanted to be virgil or scott no one wanted to be alan counting down from 5 became an art form i took my children to see the movie hoping they would get a glimpse of what i loved as a child how bitterly disappointing the only high point was the snappy theme tune not that it could compare with the original score of the thunderbirds thankfully early saturday mornings one television channel still plays reruns of the series gerry anderson and his wife created jonatha frakes should hand in his directors chair his version was completely hopeless a waste of film utter rubbish a cgi remake may be acceptable but replacing marionettes with homo sapiens subsp sapiens was a huge error of judgment \n when i put this movie in my dvd player and sat down with a coke and some chips i had some expectations i was hoping that this movie would contai

In [None]:
words_p_n_ratio_df=pd.DataFrame(words_pos_neg_ratio.items(),columns=['word','ratio']).sort_values(by=['ratio'])

In [None]:
words_p_n_ratio_df.describe()

Unnamed: 0,ratio
count,49924.0
mean,0.098722
std,0.962885
min,-4.85203
25%,-0.453028
50%,0.105361
75%,0.659387
max,4.477337


**Deciding the Ratio for noise**

In [None]:
# Split the vocabulary by how strongly each word leans towards one sentiment.
ratio_threshold=0.25
word_ratio=words_p_n_ratio_df['ratio']
is_positive_word=(word_ratio>ratio_threshold)
is_negative_word=(word_ratio<-ratio_threshold)
# The common band includes its bounds, so a word sitting exactly on
# +/-ratio_threshold is still classified instead of falling outside all three groups.
is_common_word=(word_ratio>=-ratio_threshold)&(word_ratio<=ratio_threshold)


common_words=words_p_n_ratio_df[is_common_word].sort_values(by='ratio')

positive_words=words_p_n_ratio_df[is_positive_word].sort_values(by='ratio')

negative_words=words_p_n_ratio_df[is_negative_word].sort_values(by='ratio')


In [None]:
print("common words ",common_words['word'].count())
print("positive words ",positive_words['word'].count())
print("negative words ",negative_words['word'].count())

common words  11581
positive words  21480
negative words  16863


In [None]:
n_words=100
common_words_range=src.data_preprocessing.get_head_mid_tail(common_words,n_words)
positive_words_range=src.data_preprocessing.get_head_mid_tail(positive_words,n_words)
negative_words_range=src.data_preprocessing.get_head_mid_tail(negative_words,n_words)

print(common_words_range)

Unnamed: 0,word,ratio
4371,jump,-0.249942
1282,intention,-0.249942
7516,sub,-0.249942
4044,falls,-0.249870
2453,merely,-0.249405
...,...,...
1667,exposed,0.248180
9359,educated,0.248461
1782,seven,0.248767
129,story,0.248969


In [None]:
# Spot-check the ratio of a few hand-picked words per group.  The label and the
# lookup key come from the same value, so they cannot drift apart (the first
# line used to be labelled 'the' while actually showing the ratio of 'those').
ratio_examples=[
    ("common words",["those","him","her","is"]),
    ("\nPostive words",["wonderful","amazing"]),
    ("\nnegative words",["bad","worse"]),
]
for group_title,sample_words in ratio_examples:
    print(group_title)
    for sample_word in sample_words:
        print(f"pos and negative ratio for '{sample_word}' {words_pos_neg_ratio[sample_word]}")

common words
pos and negative ratio for 'the' 0.13665904472665205
pos and negative ratio for 'him' 0.23244094824163197
pos and negative ratio for 'her' 0.243471934268631
pos and negative ratio for 'is' 0.12654317655461095

Postive words
pos and negative ratio for 'wonderful' 1.5616646008218735
pos and negative ratio for 'amazing' 1.4000119973486898

negative words
pos and negative ratio for 'bad' -1.3747093868076836
pos and negative ratio for 'worse' -1.6793884090840114


## remove words with high freq and low freq

In [27]:
import src.data_preprocessing as prep
reload(prep)

<module 'src.data_preprocessing' from '/content/sentiment-analysis-RNN/src/data_preprocessing.py'>

In [28]:
filtered_prep_txt, noise_words,prob_drop_dist = prep.remove_noise(prep_txt, filtering_ratio=1e4, prob_threshold=0.8, min_freq=5,min_rev_freq=5)

In [29]:
prob_drop_word_df=pd.DataFrame(prob_drop_dist.items(),columns=["word",'probabilty of drop']).sort_values(by=['probabilty of drop'],ascending=False)

In [30]:
prob_drop_word_df.head(150)

Unnamed: 0,word,probabilty of drop
8,the,0.958344
6,and,0.940103
47,a,0.940094
49,of,0.936800
23,to,0.934318
...,...,...
426,doesnt,0.641158
296,thing,0.641058
1431,now,0.639187
216,didnt,0.638523


In [31]:
# preview only: the full list is far too long to display in the notebook output
noise_words[:75]

['absorbingly',
 'yared',
 'predaot',
 'backupbr',
 'geno',
 'awaycut',
 'castmatesbr',
 'lumetbr',
 'bloodalso',
 'gamesmanship',
 'spagnola',
 'foreverpolite',
 'lillianwho',
 'itmilo',
 'tannhauser',
 'levers',
 'foodculture',
 'ebonicsspeakers',
 'revie',
 'airspace',
 'ballpark',
 'lymi',
 'reenter',
 'grushenka',
 'giraldis',
 'paull',
 'trailbr',
 'benefice',
 'colson',
 'ideai',
 'tails”',
 'joblessness',
 'worserbr',
 'disrespectfully',
 'cowered',
 'heightsis',
 'grisbi',
 'demystified',
 'whatifing',
 'fortuate',
 'bresslaw',
 'tollywood',
 'weptbr',
 'sumptiously',
 '45hrslong',
 'dreadfull',
 'rackand',
 'believablebut',
 'lowpaying',
 'grooving',
 'defiled',
 'likedgreat',
 'gaf',
 'moviesis',
 'indomitability',
 'snoozeopera',
 'restspare',
 'tally',
 'edelmans',
 'matherson',
 'petticoats',
 '£50k',
 'muchwelcomed',
 'ehrr',
 'spinebusting',
 'meagan',
 'hominid',
 'okhe',
 'cardine',
 'salebr',
 'pastelinstead',
 'lahrs',
 'atkind',
 'carnivalesque',
 'decoff',
 'rodor

In [32]:
print(len(noise_words))
prep_txt=filtered_prep_txt

122153


In [33]:
words_counter = Counter(prep_txt.split())

words_df = pd.DataFrame(words_counter.items(), columns=['word', 'count'])
words_df.sort_values(by=['count'], ascending=False, inplace=True)
print(f"no of unique words={len(words_df)}\nno of words ={words_df['count'].sum()}")

no of unique words=38157
no of words =5162542


In [34]:
r=cnt_ranges(words_df,10,'count')

In [35]:
pd.DataFrame(r,columns=["range",'cnt']).head(100)

Unnamed: 0,range,cnt
0,0 to 10,12495
1,10 to 20,8236
2,20 to 30,3929
3,30 to 40,2319
4,40 to 50,1606
...,...,...
95,950 to 960,7
96,960 to 970,11
97,970 to 980,8
98,980 to 990,4


In [36]:
raw_reviews_txt[:250]

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 becam'

In [37]:
prep_txt[:250]

'grew up b 1965 watching loving thunderbirds my mates school watched we played thunderbirds before school during lunch after school we wanted virgil scott no wanted alan counting down 5 became art form took my children see hoping would get glimpse lov'

In [38]:
# save preprocessed reviews
# Explicit UTF-8: the vocabulary still contains non-ASCII words (e.g. 'sjöström', 'cliché'),
# and the platform default encoding is not the same on Windows and on Colab/Linux.
with open(os.path.join(prep_dataset_dir, "reviews.txt"), 'w', encoding='utf-8') as reviews_file:
    reviews_file.write(prep_txt)

In [39]:
prep_words_counter = Counter(prep_txt.split())
prep_words_df = pd.DataFrame(prep_words_counter.items(), columns=['word', 'count'])
print(f"no of unique words={len(prep_words_df)}\nno of words ={prep_words_df['count'].sum()}")
prep_words_df.sort_values(by=['count'], ascending=False, inplace=True)

no of unique words=38157
no of words =5162542


In [40]:
prep_words_df.head(100)

Unnamed: 0,word,count
98,good,22980
156,more,22197
85,when,22140
167,very,22016
7,my,19744
...,...,...
474,real,7270
4,watching,7210
366,doesnt,7158
240,thing,7154


- only `.` is removed with size `327192` , it was the second most frequent word , the std will decrease a little

In [41]:
prep_words_df.describe()

Unnamed: 0,count
count,38157.0
mean,135.297377
std,776.997663
min,5.0
25%,8.0
50%,18.0
75%,51.0
max,22980.0


In [42]:
prep_reviews_list = prep_txt.split('\n')
len(prep_reviews_list)

40000

# Skip-Gram Data loader
- the Dataset Loader will load the txt data
- count the frequency of each word and the total frequency, which are used in selecting the noise words that don't appear in the word's context
- save a map for word to index and index to word, and convert the txt to ints
- iterate over the data, returning at each iteration `(input,target,noise_words)`: the target is the words around the given word within a window of size `w`, and the input is the word repeated `w` times
    - ex if the txt is
    - ` They celebrated his birthday with a big party.`
    - and we are at the word `birthday` and the window size = 4
    - input `[birthday,birthday,birthday,birthday]` target `[celebrated,his,with,a] `

- **selecting `noise words`**
- we will sample n words from a probability distribution (the probability of selecting a word as a noise word)

- ![noise_removal_eq1.png](./assets/noise_removal_eq1.png)
- ![noise_removal_eq1_curve.png](./assets/noise_removal_eq1_curve.png)
- `f--> word frequency`
- `t--> total of words frequencies`
- raising to a power of `3/8` makes the probability of selecting low-frequency words a little higher
- ![noise_removal_eq2.png](./assets/noise_removal_eq2.png)
- ![noise_removal_eq2_curve.png](./assets/noise_removal_eq2_curve.png)

In [18]:
# Decode explicitly as UTF-8 rather than with the platform default, which is not
# UTF-8 on Windows; a mismatch garbles accented words such as 'sjöström'.
# NOTE(review): the non-ASCII words reported as missing from the dataset vocabulary
# further down look like a symptom of such a mismatch -- confirm after re-running.
with open(os.path.join(prep_dataset_dir, 'reviews.txt'), encoding='utf-8') as prep_file:
    prep_txt = prep_file.read()
prep_reviews_list = prep_txt.split('\n')
len(prep_reviews_list)

40000

In [21]:
word2intx=src.utils.load_json(os.path.join(prep_dataset_dir,"word2int.json"))


In [19]:
rev_skip_gram_data = src.Word2VecDataset(prep_txt, window_size=5, no_noise_outputs=25, batch_size=256)

In [22]:
# rev_skip_gram_data.save_word2int(prep_dataset_dir)
word2intx['sjöström']

23301

In [15]:
len(word2intx)

38158

In [None]:
print(f"int value for amazing {word2intx['amazing']}")
print(f"int value for good {word2intx['good']}")
print(f"int value for bad {word2intx['bad']}")

In [None]:
itrr = iter(rev_skip_gram_data)
word, target, noise = next(itrr)

In [None]:
# 2*window_size*batch_size
print(f"words_in shape {word.shape}")
print(f"target_words shape {target.shape}")
print(f"noise_words shape {noise.shape}")

In [11]:
rev_skip_gram_data.no_unique_words

38157

In [23]:
# list every word of the saved mapping that the freshly built dataset does not contain
for word in word2intx:
    if word in rev_skip_gram_data.word2int:
        continue
    print(word)


cliché
clichés
clichéd
–
fiancé
fiancée

it´s
josé
café
don´t
matinée
naïve
risqué
buñuel
à
ramón
clichéridden
that´s
½
astérix
naiveté
chávez
maléfique
i´ve
can´t
protégé
andré
françois
it’s
krabbé
renée
séance
£1
divorcée
château
gérard
sjöström
rené
doesn´t
carrère
i´m
díaz
brontës
clichésbr
poiré
soufflé
jürgen
amenábar
clichébr
aimée
ingénue
didn´t
françoise
i’m
carné
she´s
mérimée
maría
léaud
obélix
körkarlen
brodský
he’s
stéphane
irène
fiancés
£8000
i’ve
rüdiger
passé
gonzález
haven´t
wasn´t
césar
déjàvu
émigré
clichédbr
fiancées
£20
voilà
façade
luján
über
jirí
déjà
garcía
exfiancé
débutante
dueñas
élan
são
lagerlöf


In [18]:
import torch
from torch import optim
import numpy as np
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# instantiating the model
# NOTE(review): the model is sized from rev_skip_gram_data.no_unique_words (38157 in the
# recorded output) while the saved word2int.json has 38158 entries; the checkpoint load
# further down fails on exactly this one-row difference -- confirm which size is intended.
embedding_dim = 300
model = src.SkipGram(rev_skip_gram_data.no_unique_words, embedding_dim).to(device)

# Training


- for window size=3 , batch_size=2
- txt --> `[1,2,3,4,5,6,7,8,9,10]` after being changed from txt to int
- take input of an index of a word repeated with window_size*2
- example at word 4
```
        batch 1       batch 2
    [ 4,4,4,4,4,4, 5,5,5,5,5,5 ]
    [ 1,2,3,5,6,7, 2,3,4,6,7,8 ]
```
- loss function
- ![loss_function1.png](./assets/loss_function1.png)
<br><br>

- The dot product between the input word and the random (noise) words needs to be minimized, so that this loss term goes to zero
  - ![loss_function2.png](./assets/loss_function2.png)
<br><br>
- selection of the random words is done using the uni-gram distribution of the words
- The dot product between the input word and the output word needs to be maximized, so that this loss term goes to zero as well
<br><br>
  - ![loss_function3.png](./assets/loss_function3.png)

In [19]:

# using the loss that we defined
criterion = src.NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 25

In [21]:
# ask traintracker for a checkpoint path inside weights_dir (presumably the newest one -- confirm)
latest_weights_path=traintracker.latest_weights_path(weights_dir)
print(latest_weights_path)
# map_location='cpu' loads the checkpoint tensors onto the CPU regardless of where they were saved
state_dict=torch.load(latest_weights_path,map_location='cpu')
# NOTE(review): in the recorded run this raised a size-mismatch RuntimeError (checkpoint has
# 38158 embedding rows, current model 38157) -- the vocabulary must match the checkpoint's.
model.load_state_dict(state_dict)

../model_weights/skipgram/12_04 17_53 Train_(3.66341) .pt


RuntimeError: Error(s) in loading state_dict for SkipGram:
	size mismatch for input_embedding.weight: copying a param with shape torch.Size([38158, 300]) from checkpoint, the shape in current model is torch.Size([38157, 300]).
	size mismatch for output_embedding.weight: copying a param with shape torch.Size([38158, 300]) from checkpoint, the shape in current model is torch.Size([38157, 300]).

In [None]:
# train the skip-gram model, one keyword argument per line for readability
src.skipgram_train(
    model=model,
    epochs=epochs,
    skip_gram_data=rev_skip_gram_data,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
    train_data_dir=train_data_dir,
    weights_dir=weights_dir,
)

In [None]:
# train the skip-gram model, one keyword argument per line for readability
src.skipgram_train(
    model=model,
    epochs=epochs,
    skip_gram_data=rev_skip_gram_data,
    device=device,
    optimizer=optimizer,
    criterion=criterion,
    train_data_dir=train_data_dir,
    weights_dir=weights_dir,
)

 epoch 1[..........]time remaining (m) = 17.9 Avg Train_Loss=3.42570415Epoch: 1/15
Loss:  3.4633517265319824
your | want, yourself, please, then, our
best | worst, most, greatest, also, excellent
people | them, women, these, those, everyone
work | script, job, camera, works, director
something | anything, nothing, things, someone, least
still | made, though, years, while, also
actors | cast, actor, actresses, performances, acting
know | don, understand, think, do, say
century | th, twentieth, st, period, set
van | damme, dyke, sant, gus, helsing
accent | accents, english, southern, speaking, acting
tone | franchot, atmosphere, style, music, mood
starring | directed, plays, stars, starred, played
studio | mgm, studios, company, hollywood, disney
animated | animation, cartoon, disney, cartoons, bakshi
solid | excellent, fine, performances, great, supporting
...

 epoch 1[=.........]time remaining (m) = 15.73 Avg Train_Loss=3.43731387Epoch: 1/15
Loss:  3.4449784755706787
made | makes, mak