<a href="https://colab.research.google.com/github/amantayal44/CS779_Project/blob/main/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the CSV corpora, pickled datasets and embeddings can be read and written.
# NOTE(review): later cells open paths relative to "gdrive/MyDrive/...", which presumably
# relies on the working directory being /content -- confirm when running outside Colab defaults.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from collections  import Counter
import math
import random
import numpy as np
from tqdm.notebook import tqdm
import torch
from torch import nn
from torch.nn import LSTM,GRU,Linear,Embedding,Transformer
import torch.optim as optim
from torch.nn.functional import logsigmoid
import time
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import pickle
import csv

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

References:

*   https://arxiv.org/pdf/1310.4546.pdf
*   http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/


In [None]:
#class for creating vocabulary
#also calculates sub-sampling prob and negative sampling probability
class Vocab:
  """Word vocabulary with sub-sampling and negative-sampling statistics.

  Attributes:
    min_freq: minimum corpus count a word needs to enter the vocabulary.
    sub_sample: sub-sampling threshold t used for the keep probability.
    negative_size: target length of the negative-sampling table built by callers.
    index2word: list mapping word id -> word.
    word2index: dict mapping word -> word id.
    keep_word: per-id keep score sqrt(t/p) + t/p, where p is the word's relative
      frequency; values >= 1 mean the word is always kept.
    neg_prob: per-id unnormalised negative-sampling weight count**(3/4).
    words: Counter of raw word counts over every file read so far.
    total_freq: total number of whitespace tokens read so far.
  """

  def __init__(self,min_freq=3,sub_sample=0.00001,negative_size = 10**8):
    self.min_freq = min_freq
    self.sub_sample = sub_sample
    self.negative_size = negative_size
    self.index2word = []
    self.word2index = dict()
    self.neg_prob = []
    self.keep_word = []
    self.words = Counter()
    self.total_freq = 0

  def create_vocab(self,file_list,data_dir="gdrive/MyDrive/data"):
    """Count words in the given CSV files and (re)build the vocabulary tables.

    Args:
      file_list: file names relative to data_dir, without the ".csv" suffix.
        The first row of each file is treated as a header and the text is
        read from column index 3.
      data_dir: directory containing the CSV files.

    Counts accumulate across calls, while the index tables are rebuilt from
    scratch each time so that calling this method more than once never
    produces duplicate vocabulary entries.
    """
    for file in file_list:
      with open("{}/{}.csv".format(data_dir,file),encoding="utf-8") as f:
        csv_reader = csv.reader(f, delimiter=',')
        next(csv_reader, None)  # skip the header row (no-op on an empty file)
        for r in csv_reader:
          tokens = r[3].split()
          self.words.update(tokens)
          self.total_freq += len(tokens)
    print("done")

    # Reset the derived tables: they are a pure function of the accumulated counts.
    self.index2word = []
    self.word2index = dict()
    self.keep_word = []
    self.neg_prob = []

    # creating word dictionary and calculating sub-sample probability
    for w,f in tqdm(self.words.items()):
      if f >= self.min_freq:
        self.index2word.append(w)
        self.word2index[w] = len(self.index2word)-1
        p = f/self.total_freq          # relative frequency of the word
        p = self.sub_sample/p          # t / p
        self.keep_word.append(math.sqrt(p)+p)
        self.neg_prob.append(f**(3/4))
    print("done")

  def discard(self,token):
    """Sub-sampling decision for one token.

    Despite the name, the return value is True when the token should be KEPT
    (callers filter with `if vocab.discard(w)`). Out-of-vocabulary tokens
    always return False, i.e. they are dropped.
    """
    index = self.word2index.get(token,None)
    if index is None:
      return False
    return random.random() < self.keep_word[index]

  def __len__(self):
    """Number of words in the vocabulary."""
    return len(self.index2word)

  def __getitem__(self,word):
    """Return the id of `word`, or None when it is not in the vocabulary."""
    return self.word2index.get(word,None)
  

In [None]:
# Build the vocabulary (words seen at least 20 times) over all ten corpus shards:
# five "original" files and their five "swapped" counterparts.
original_files = ["original/o{}".format(shard) for shard in range(1,6)]
swapped_files = ["swapped/s{}".format(shard) for shard in range(1,6)]
vocab = Vocab(min_freq=20)
vocab.create_vocab(original_files+swapped_files)
len(vocab)

done


HBox(children=(FloatProgress(value=0.0, max=220319.0), HTML(value='')))


done


45050

In [None]:
# Build the negative-sampling table: every word id is repeated in proportion to
# its count**(3/4) weight, scaled so the table holds about vocab.negative_size entries.
total_neg = sum(vocab.neg_prob)
negative_samples = []
for word_id in tqdm(range(len(vocab))):
  # int() truncates, so words whose share is below one slot get no entry at all
  copies = int(vocab.neg_prob[word_id]*vocab.negative_size/(total_neg))
  negative_samples.extend([word_id]*copies)

HBox(children=(FloatProgress(value=0.0, max=45050.0), HTML(value='')))




In [None]:
#return negative samples
def get_neg_samples(token,size=5):
    """Draw `size` word ids from the global `negative_samples` table.

    Draws are uniform over table slots; any draw equal to `token` is rejected
    and redrawn, so the returned list never contains `token`.
    """
    picked = []
    while len(picked) != size:
      candidate = negative_samples[random.randint(0,len(negative_samples)-1)]
      if candidate != token:
        picked.append(candidate)
    return picked

def get_data(files,vocab,window_size=5,split=0.1,data_dir="gdrive/MyDrive/data"):
  """Build skip-gram training triples from CSV corpus files.

  Args:
    files: file names relative to data_dir, without the ".csv" suffix. The
      first row of each file is a header; text is read from column index 3.
    vocab: object providing `vocab[word] -> id` and `vocab.discard(word)`,
      which returns True when the word should be kept.
    window_size: maximum context radius; a radius is drawn uniformly from
      1..window_size for every centre token.
    split: unused; kept so existing callers keep working.
    data_dir: directory containing the CSV files.

  Returns:
    A list of (centre_id, context_id, negative_ids) tuples, with five
    negative ids per pair obtained from get_neg_samples.
  """
  dataset = []

  for file in files:
      with open("{}/{}.csv".format(data_dir,file),encoding="utf-8") as handle:
        csv_reader = csv.reader(handle, delimiter=',')
        # Skip the header once, up front. The previous flag-based skip reused the
        # name of the token loop index, so a row that kept exactly one token
        # caused the following data row to be dropped as if it were a header.
        next(csv_reader, None)
        for row in tqdm(csv_reader):
          # tokenizing sentence: keep only in-vocabulary words that survive sub-sampling
          tokens = [vocab[w] for w in row[3].split() if vocab.discard(w)]
          for position,token in enumerate(tokens):
            size = random.randint(1,window_size) #random window_size
            # slice bounds of the context window around `position`
            first = max(0,position-size)
            last = min(len(tokens),position+size+1)
            for predict in tokens[first:last]:
              # compared by id, so this skips the centre token itself and any
              # repeat of the same word inside the window
              if predict == token:
                continue
              dataset.append((token,predict,get_neg_samples(token,size=5)))

  return dataset
      
      

In [None]:
# Create the training triples for each corpus shard and store them as one pickle per shard.
total_original = 0
total_swapped = 0
for file in original_files+swapped_files:
  dataset = get_data([file],vocab)
  filename = file.split("/")[-1]
  with open(filename+'.pickle', 'wb') as f:
    pickle.dump(dataset, f)
  # len(dataset) is the number of (centre, context, negatives) triples
  print("{} : {} tokens".format(filename,len(dataset)))
  # Classify by list membership rather than by looking for the letter "s" in the
  # path, which only worked because "original/..." happens to contain no "s".
  if file in swapped_files:
    total_swapped += len(dataset)
  else:
    total_original += len(dataset)
total = total_original + total_swapped
print("total: {}M, original {}M, swapped {}M".format( int(total/10**6), int(total_original/10**6), int(total_swapped/10**6) ))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


o1 : 4682940 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


o2 : 4620898 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


o3 : 4618721 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


o4 : 4726577 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


o5 : 4638164 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


s1 : 4682680 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


s2 : 4610617 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


s3 : 4617237 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


s4 : 4732327 tokens


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


s5 : 4640561 tokens


NameError: ignored

In [None]:
# Report the number of training triples in millions (truncated), overall and per corpus.
total = total_original + total_swapped
millions = [int(count/10**6) for count in (total,total_original,total_swapped)]
print("total: {}M, original {}M, swapped {}M".format(*millions))

total: 46M, original 23M, swapped 23M


In [None]:
#saving dataset pickle in drive 
!cp *.pickle gdrive/MyDrive/data/dataset

In [None]:
#used in creating batches and transforming to torch tensor
def batch_data(data):
  """Collate (input, positive, negatives) triples into three long tensors.

  Returns (inputs, positives, negatives) in that order; used as the
  DataLoader collate function.
  """
  triples = list(data)
  to_long = lambda values: torch.tensor(values,dtype=torch.long)
  centers = [center for center,_,_ in triples]
  positives = [positive for _,positive,_ in triples]
  negatives = [negative for _,_,negative in triples]
  return to_long(centers),to_long(positives),to_long(negatives)

In [None]:
#Word2vec skip gram model
class Word2vec(nn.Module):
  """Skip-gram model trained with negative sampling.

  Holds two sparse embedding tables of shape (vocab_size, embed_size):
  `embedding` for centre words and `out` for context / negative words.
  """
  def __init__(self,device,vocab_size,embed_size=256):
    super().__init__()
    self.device, self.vocab_size, self.embed_size = device, vocab_size, embed_size
    self.embedding = Embedding(vocab_size,embed_size,sparse=True)
    self.out = Embedding(vocab_size,embed_size,sparse=True)

  def forward(self,input,pos_target,neg_target):
    """Return the mean negative-sampling loss of a batch.

    `input` and `pos_target` are 1-D id tensors of equal length; `neg_target`
    holds one row of negative ids per example.
    """
    center = self.embedding(input)
    positive = self.out(pos_target)
    negative = self.out(neg_target)
    # dot products, clamped to [-10, 10] to avoid saturation in the sigmoid
    pos_logit = (center*positive).sum(dim=1).clamp(min=-10,max=10)
    neg_logit = (center.unsqueeze(1)*negative).sum(dim=2).clamp(min=-10,max=10)
    pos_loss = -logsigmoid(pos_logit)
    # negatives are scored with the sign flipped, then summed per example
    neg_loss = -logsigmoid(-neg_logit).sum(dim=1)
    return (pos_loss+neg_loss).mean()

In [None]:
model = Word2vec(device,vocab_size=len(vocab)).to(device)

In [None]:
#counting no. of parameters in model
def parameters_count(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total
parameters_count(model)
#model has 23M parameters

23065600

In [None]:
def initialize_uniform(model):
  """Return a callback for `Module.apply` that re-initialises parameters.

  Parameters whose name contains "weight" are drawn uniformly from
  [-0.5/embed_size, 0.5/embed_size], where embed_size is read from `model`;
  every other parameter is set to 0.
  """
  u = 0.5/model.embed_size
  def initialize(module):
    # Module.apply already calls this once for every submodule, so only touch
    # the parameters owned directly by `module`. Recursing here re-initialised
    # each weight once per ancestor module.
    for name,parameter in module.named_parameters(recurse=False):
      if "weight" in name:
        nn.init.uniform_(parameter, -u,u)
      else:
        nn.init.constant_(parameter, 0)
  return initialize

In [None]:
#using uniform initialization
model.apply(initialize_uniform(model))

Word2vec(
  (embedding): Embedding(45050, 256, sparse=True)
  (out): Embedding(45050, 256, sparse=True)
)

In [None]:
#training
def train(model,dataset,optimizer,scheduler):
  """Run one optimisation pass over `dataset` and return the mean batch loss.

  `dataset` yields (input, pos, neg) tensor triples. The optimizer and the
  learning-rate scheduler are both stepped once per batch.
  """
  model.train()
  running_loss = 0
  for batch in tqdm(dataset):
    optimizer.zero_grad()  # clear gradients left over from the previous batch
    input,pos,neg = (tensor.to(device) for tensor in batch)
    loss = model(input,pos,neg)
    loss.backward()
    optimizer.step()   # apply the parameter update
    scheduler.step()   # advance the learning-rate schedule
    running_loss += loss.item()

  return running_loss/len(dataset)

def evaluate(model,dataset):
  """Return the mean batch loss over `dataset` without updating the model.

  The model is switched to eval mode and gradients are disabled.
  """
  model.eval()
  running_loss = 0
  with torch.no_grad():
    for batch in dataset:
      input,pos,neg = [tensor.to(device) for tensor in batch]
      running_loss += model(input,pos,neg).item()

  return running_loss/len(dataset)

In [None]:
# Mini-batch size used by fit() for both the training and the validation loader.
# NOTE(review): an earlier comment here described stopping when the validation loss
# increases, but fit() never terminates early -- it only checkpoints the best weights.
BATCH_SIZE = 256
def fit(model,files,optimizer,scheduler,name="model",EPOCHS=5,min_val=10000):
    """Train `model` for EPOCHS passes over the pickled datasets named in `files`.

    For every epoch and every file, the pickle
    "gdrive/MyDrive/data/dataset/<file>.pickle" is loaded, split 98/2 into
    train/validation with a fixed random_state, trained on for one pass and
    then evaluated.

    Checkpoints written to the working directory:
      name+'_best_1.pt' -- whenever a single file's validation loss beats the best so far
      name+'_best_2.pt' -- whenever the epoch-average validation loss beats the best so far
      name+'.pt'        -- the weights after the last epoch

    Args:
      model: module called as model(input, pos, neg) and returning a scalar loss.
      files: dataset pickle names without directory or ".pickle" suffix.
      optimizer, scheduler: passed through to train(), which steps both per batch.
      name: filename prefix for the saved checkpoints.
      EPOCHS: number of passes over all files.
      min_val: initial "best" validation loss for both checkpoint criteria.
    """
    min_val1 = min_val  # best per-file validation loss seen so far
    min_val2 = min_val  # best epoch-average validation loss seen so far
    for epoch in range(EPOCHS):
      epoch_time = 0
      epoch_train = 0
      epoch_val = 0
      #training on each file
      for file in files:
        start = time.time()
        print("loading data",end=" ")
        # NOTE(review): pickle.load can execute arbitrary code; these are presumably the
        # files written by the dataset-creation cell of this notebook -- do not point at untrusted data.
        with open("gdrive/MyDrive/data/dataset/"+file+".pickle", 'rb') as handle:
          dataset =  pickle.load(handle)
        print("data loaded")
        #creating split (fixed random_state, so the same split is reproduced every epoch)
        dataset,val_set = train_test_split(dataset,test_size=0.02,random_state=42)
        #creating batches
        train_data = DataLoader(dataset, batch_size=BATCH_SIZE,shuffle=True,collate_fn=batch_data)
        val_data = DataLoader(val_set, batch_size=BATCH_SIZE,shuffle=True,collate_fn=batch_data)

        train_loss = train(model,train_data,optimizer,scheduler)
        val_loss = evaluate(model,val_data)  
        end = time.time()
        epoch_train += train_loss
        epoch_val += val_loss
        print("train loss: {:.3f} val loss: {:.3f}".format(train_loss,val_loss))
        t = end - start
        epoch_time += t
        print("time taken by {} epoch on {} {} min {} s".format(epoch+1,file,int(t/60),int(t%60)))
        if val_loss<min_val1:
          min_val1 = val_loss
          #saving best model (per-file criterion)
          torch.save(model.state_dict(), name+'_best_1.pt')
      # turn the per-file sums into per-epoch averages
      epoch_train /= len(files)
      epoch_val /= len(files)
      t = epoch_time
      print("\nEPOCH{}\ntime taken by {} epoch {} min {} s".format(epoch+1,epoch+1,int(t/60),int(t%60)))
      print("train loss: {:.3f} val loss: {:.3f}".format(epoch_train,epoch_val))
      if epoch_val<min_val2:
        min_val2 = epoch_val
        #saving best model (epoch-average criterion)
        torch.save(model.state_dict(), name+'_best_2.pt')
  
    #saving model on last epoch
    torch.save(model.state_dict(), name+'.pt')
    

In [None]:
# Baseline run: train on the five "original" dataset pickles only.
files = ["o1","o2","o3","o4","o5"]
# SparseAdam is used because both embedding tables are created with sparse=True.
optimizer = optim.SparseAdam(model.parameters(), lr=0.001)
# NOTE(review): train() calls scheduler.step() once per batch, and the progress bars
# logged for this run show roughly 17.7k-18.1k batches per file, so the run takes far
# more than 15000*len(files) scheduler steps -- confirm that T_max matches the intended
# total step count (EPOCHS * batches per epoch).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,15000*len(files))

In [None]:
fit(model,files,optimizer,scheduler,EPOCHS=10)

loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 2.632 val loss: 2.457
time taken by 1 epoch on o1 1 min 53 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 2.467 val loss: 2.346
time taken by 1 epoch on o2 2 min 22 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 2.431 val loss: 2.358
time taken by 1 epoch on o3 2 min 18 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.523 val loss: 2.492
time taken by 1 epoch on o4 2 min 23 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.574 val loss: 2.546
time taken by 1 epoch on o5 2 min 12 s

EPOCH1
time taken by 1 epoch 11 min 11 s
train loss: 2.525 val loss: 2.440
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 2.381 val loss: 2.323
time taken by 2 epoch on o1 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 2.274 val loss: 2.236
time taken by 2 epoch on o2 2 min 9 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 2.264 val loss: 2.205
time taken by 2 epoch on o3 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.290 val loss: 2.207
time taken by 2 epoch on o4 2 min 12 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.283 val loss: 2.204
time taken by 2 epoch on o5 2 min 28 s

EPOCH2
time taken by 2 epoch 11 min 31 s
train loss: 2.298 val loss: 2.235
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 2.219 val loss: 2.179
time taken by 3 epoch on o1 2 min 15 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 2.229 val loss: 2.225
time taken by 3 epoch on o2 2 min 26 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 2.244 val loss: 2.278
time taken by 3 epoch on o3 2 min 14 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.226 val loss: 2.230
time taken by 3 epoch on o4 2 min 34 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.171 val loss: 2.170
time taken by 3 epoch on o5 2 min 9 s

EPOCH3
time taken by 3 epoch 11 min 40 s
train loss: 2.218 val loss: 2.216
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 2.121 val loss: 2.120
time taken by 4 epoch on o1 2 min 9 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 2.136 val loss: 2.113
time taken by 4 epoch on o2 2 min 18 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 2.131 val loss: 2.111
time taken by 4 epoch on o3 2 min 19 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.153 val loss: 2.147
time taken by 4 epoch on o4 2 min 12 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.167 val loss: 2.183
time taken by 4 epoch on o5 2 min 23 s

EPOCH4
time taken by 4 epoch 11 min 23 s
train loss: 2.142 val loss: 2.135
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 2.144 val loss: 2.217
time taken by 5 epoch on o1 2 min 22 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 2.123 val loss: 2.195
time taken by 5 epoch on o2 2 min 22 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 2.048 val loss: 2.111
time taken by 5 epoch on o3 2 min 10 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.061 val loss: 2.118
time taken by 5 epoch on o4 2 min 23 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.098 val loss: 2.118
time taken by 5 epoch on o5 2 min 9 s

EPOCH5
time taken by 5 epoch 11 min 28 s
train loss: 2.095 val loss: 2.152
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 2.058 val loss: 2.087
time taken by 6 epoch on o1 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 2.058 val loss: 2.098
time taken by 6 epoch on o2 2 min 9 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 2.049 val loss: 2.116
time taken by 6 epoch on o3 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.089 val loss: 2.192
time taken by 6 epoch on o4 2 min 13 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.107 val loss: 2.228
time taken by 6 epoch on o5 2 min 21 s

EPOCH6
time taken by 6 epoch 11 min 26 s
train loss: 2.072 val loss: 2.144
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 1.994 val loss: 2.120
time taken by 7 epoch on o1 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 1.957 val loss: 2.086
time taken by 7 epoch on o2 2 min 22 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 1.979 val loss: 2.082
time taken by 7 epoch on o3 2 min 21 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 2.034 val loss: 2.112
time taken by 7 epoch on o4 2 min 13 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 2.052 val loss: 2.120
time taken by 7 epoch on o5 2 min 22 s

EPOCH7
time taken by 7 epoch 11 min 39 s
train loss: 2.003 val loss: 2.104
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 1.987 val loss: 2.102
time taken by 8 epoch on o1 2 min 12 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 1.992 val loss: 2.139
time taken by 8 epoch on o2 2 min 21 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 1.998 val loss: 2.181
time taken by 8 epoch on o3 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 1.997 val loss: 2.179
time taken by 8 epoch on o4 2 min 24 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 1.960 val loss: 2.126
time taken by 8 epoch on o5 2 min 24 s

EPOCH8
time taken by 8 epoch 11 min 44 s
train loss: 1.987 val loss: 2.145
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 1.901 val loss: 2.082
time taken by 9 epoch on o1 2 min 29 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 1.940 val loss: 2.086
time taken by 9 epoch on o2 2 min 28 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 1.957 val loss: 2.092
time taken by 9 epoch on o3 2 min 27 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 1.984 val loss: 2.133
time taken by 9 epoch on o4 2 min 30 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 1.994 val loss: 2.162
time taken by 9 epoch on o5 2 min 23 s

EPOCH9
time taken by 9 epoch 12 min 19 s
train loss: 1.955 val loss: 2.111
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17913.0), HTML(value='')))


train loss: 1.940 val loss: 2.174
time taken by 10 epoch on o1 2 min 12 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17692.0), HTML(value='')))


train loss: 1.931 val loss: 2.180
time taken by 10 epoch on o2 2 min 20 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17682.0), HTML(value='')))


train loss: 1.879 val loss: 2.112
time taken by 10 epoch on o3 2 min 19 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=18118.0), HTML(value='')))


train loss: 1.889 val loss: 2.122
time taken by 10 epoch on o4 2 min 22 s
loading data data loaded


HBox(children=(FloatProgress(value=0.0, max=17780.0), HTML(value='')))


train loss: 1.937 val loss: 2.126
time taken by 10 epoch on o5 2 min 12 s

EPOCH10
time taken by 10 epoch 11 min 27 s
train loss: 1.915 val loss: 2.143


In [None]:
#to get embedding from model
def get_embedding(model,vocab):
  """Return a dict mapping every vocabulary word to its row of `model.embedding`.

  Rows are taken from the input-embedding weight matrix as a NumPy array on the CPU.
  """
  weights = model.embedding.weight.cpu().data.numpy()
  return {vocab.index2word[idx]: weights[idx] for idx in range(len(vocab))}

In [None]:
#saving embedding as pickle
with open('noraml_embedding.pickle', 'wb') as f:
    pickle.dump(get_embedding(model,vocab), f)

In [None]:
!cp noraml_embedding.pickle gdrive/MyDrive/data/embedding/
!cp *.pt gdrive/MyDrive/data/model/

cp: cannot stat 'embedding.pickle': No such file or directory


Training the gender-neutral embedding (on both the original and the swapped files)

In [None]:
# Fresh model for the gender-neutral run.
model_gn = Word2vec(device,vocab_size=len(vocab)).to(device)
# Derive the init range from model_gn itself, so this cell does not depend on the
# unrelated baseline `model` object still being defined with the same embed_size.
model_gn.apply(initialize_uniform(model_gn))

In [None]:
# Gender-neutral run: interleave each "original" dataset pickle with its "swapped" counterpart.
files = ["o1","s1","o2","s2","o3","s3","o4","s4","o5","s5"]
# SparseAdam is used because both embedding tables are created with sparse=True.
optimizer = optim.SparseAdam(model_gn.parameters(), lr=0.001)
# NOTE(review): train() calls scheduler.step() once per batch; the "o" files alone were
# logged at roughly 17.7k-18.1k batches each in the baseline run, so this run is likely to
# take more than 15000*len(files) scheduler steps -- confirm that T_max matches the
# intended total step count.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,15000*len(files))

In [None]:
#training gender neutral embedding
fit(model_gn,files,optimizer,scheduler,name="model_gn")

In [None]:
with open('GN_embedding.pickle', 'wb') as f:
    pickle.dump(get_embedding(model_gn,vocab), f)

In [None]:
!cp GN_embedding.pickle gdrive/MyDrive/data/embedding/
!cp *.pt gdrive/MyDrive/data/model/

In [None]:
import numpy as np

In [None]:
# Collect the GloVe vectors (used in results) for every word that is in our vocabulary.
glove_embedding = dict()
with open("gdrive/MyDrive/glove.6B.100d.txt",encoding="utf-8") as f:
  lines = f.readlines()
  print(len(lines))
  for line in tqdm(lines):
    # each line is "<word> <v1> <v2> ..." separated by single spaces
    fields = line.rstrip().split(" ")
    word = fields[0]
    # dict lookup instead of scanning the vocab.index2word list for every line
    if word in vocab.word2index:
      glove_embedding[word] = np.array([float(v) for v in fields[1:]])

400000


HBox(children=(FloatProgress(value=0.0, max=400000.0), HTML(value='')))




In [None]:
len(glove_embedding)

44186

In [None]:
with open('glove_embedding.pickle', 'wb') as f:
    pickle.dump(glove_embedding, f)

In [None]:
!cp glove_embedding.pickle gdrive/MyDrive/data/embedding/

In [None]:
# Collect the gender-neutral GloVe vectors (used in results) for every word in our vocabulary.
gn_glove_embedding = dict()
with open("gdrive/MyDrive/1b-vectors300-0.8-0.8.txt",encoding="utf-8") as f:
  lines = f.readlines()
  print(len(lines))
  for line in tqdm(lines):
    # each line is "<word> <v1> <v2> ..." separated by single spaces
    fields = line.rstrip().split(" ")
    word = fields[0]
    # dict lookup instead of scanning the vocab.index2word list for every line
    if word in vocab.word2index:
      gn_glove_embedding[word] = np.array([float(v) for v in fields[1:]])

142527


HBox(children=(FloatProgress(value=0.0, max=142527.0), HTML(value='')))




In [None]:
len(gn_glove_embedding)

27464

In [None]:
with open('gn_glove_embedding.pickle', 'wb') as f:
    pickle.dump(gn_glove_embedding, f)

In [None]:
!cp gn_glove_embedding.pickle gdrive/MyDrive/data/embedding/