<a href="https://colab.research.google.com/github/bhardwaj1230/NMT/blob/master/replace_with_nearest_neighbor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!/usr/bin/env python
# coding: utf-8

import re
import os
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
from pyfasttext import FastText as FT
import nltk
from nltk.corpus import stopwords
import random
import math
import string


In [0]:
# The pretrained fastText vectors can be fetched with:
# wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz
from itertools import islice

print('Loading English fastText pretrained model')

# Pretrained model; queried later for the nearest neighbours of each word.
model_en = FT('/data/rali5/sans-bkp/shiven/classification/data/cc.en.300.bin')


print('\n\n Reading europarl from directory')
train_ep_en = []

en = '/data/rali5/sans-bkp/shiven/classification/data/train/train_en'

# Only the first 1,000,000 lines are tokenized, so stop reading there instead
# of materialising the whole file first.  The encoding is passed explicitly so
# decoding does not depend on the machine's locale.
# NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.
with open(en, encoding='utf-8') as f:
    eng = [line.strip('\n') for line in tqdm(islice(f, 1000000))]

# Tokenize after the file has been closed; the handle is no longer needed.
eng = [nltk.word_tokenize(line) for line in tqdm(eng)]

# Stored as a one-element list: consumers read the sentences from
# train_ep_en[0].
train_ep_en.append(eng)


In [0]:
print('Cleaning data for unique words')

from operator import itemgetter
import random

print('\n\n Cleaning')

# Build the stop-word set once.  It does not change between sentences, so
# rebuilding it inside the loop only repeats the same work for every line.
stop_words = set(stopwords.words('english'))

# Per sentence, keep the purely alphabetic tokens that are not stop words.
# NOTE(review): tokens are compared to the stop-word list as-is; if the corpus
# is not lowercased at this point, capitalised stop words pass through --
# confirm whether that is intended.
clean_en = []
for line in tqdm(train_ep_en[0]):
    clean_en.append([w for w in line if w.isalpha() and w not in stop_words])

# Flatten the per-sentence token lists into a single list of tokens.
flat_list = [item for sublist in clean_en for item in sublist]

print('\n\n Calculating Zipf-Law')

# Count how many times each token occurs.
frequency = {}
for word in tqdm(flat_list):
    frequency[word] = frequency.get(word, 0) + 1

# Re-insert the counts ordered from most to least frequent.  The ascending
# sort is reversed (rather than sorted with reverse=True) so that the order of
# equally frequent words stays exactly as before.
zipf_data = {}
for key, value in reversed(sorted(frequency.items(), key=itemgetter(1))):
    zipf_data[key] = value

print('\n\n Size of Vocab: ', len(zipf_data))

# The string literal below is disabled code kept verbatim; it is never run.
'''
freq_data = {key: value for key, value in zipf_data.items() if value > 100 }

print('\n\n Number of words with frequency > 100: ', len(freq_data))

words_to_nn = random.choices(list(freq_data.keys()), k = round(len(freq_data) * 0.3 ))

print('\n\n Number of words to be replaced: ', len(words_to_nn))
'''


In [0]:

print('\n\n Started finding NN')

# Ask the fastText model for the 5 nearest neighbours of every vocabulary
# word and keep the answers keyed by the word itself.
vocabulary = list(zipf_data.keys())
nn_dict = {
    token: model_en.nearest_neighbors(token, k=5)
    for token in tqdm(vocabulary)
}

print('\n\n Writing words to directory')

# Pickle the neighbour table so the replacement step can load it from disk.
nn_path = '/u/bhardwas/en_fr_data/nn_dict_en'
with open(nn_path, 'wb') as out_file:
    pickle.dump(nn_dict, out_file)


In [0]:
print('\n\n Reading English europarl from directory')
train_ep_en = []

en = '/u/bhardwas/en_fr_data/europarl/fr-en/europarl-v7.fr-en.en'

# Normalise each line: drop every character that is neither a word character
# nor whitespace, trim surrounding spaces/newlines and lowercase the result.
# The encoding is passed explicitly so decoding does not depend on the
# machine's locale.
# NOTE(review): assumes the Europarl file is UTF-8 encoded -- confirm.
with open(en, encoding='utf-8') as f:
    eng = [re.sub(r'[^\w\s]', '', line).strip(' \n').lower() for line in tqdm(f)]

# Tokenize after the file has been closed; the handle is no longer needed.
eng = [nltk.word_tokenize(line) for line in tqdm(eng)]

# Stored as a one-element list: consumers read the sentences from
# train_ep_en[0].
train_ep_en.append(eng)


print ("\n\n Reading English Dictionary")

# Word -> nearest-neighbour list, as pickled earlier in this file.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open ('/u/bhardwas/en_fr_data/nn_dict_en', 'rb') as fp:
    dict_en = pickle.load(fp)
    

In [0]:
import string
print('\n\n Start replacing with Nearest Neighbors for Englsih data')

print ("Random number with seed 101")
random.seed( 101 )

# Build one "negative" sentence per input sentence by swapping its longer
# words for entries drawn from the nearest-neighbour table.
neg_data_en = []
for tokens in tqdm(train_ep_en[0]):

    # Only tokens longer than four characters are candidates.
    long_tokens = [tok for tok in tokens if len(tok) > 4]
    n_pick = round(len(long_tokens) * 1) #100% replacement

    # Sampling all candidates returns them in a random order.
    picked = random.sample(long_tokens, n_pick)

    substitutes = []
    for source in picked:
        # Placeholder meaning "leave this word unchanged"; it mimics the
        # indexable shape of a neighbour entry so that [0] yields the word.
        keep_original = (source, '1')

        if source not in dict_en:
            substitutes.append(keep_original)
            continue

        # Two draws (with replacement) from the word's neighbour list.
        # Entries are indexed with [0] to obtain the replacement text --
        # presumably (word, score) pairs; verify against the model output.
        first, second = random.choices(dict_en[source], k=2)

        if len(first[0]) > 3 * len(source):
            # First draw is more than three times as long as the word.
            chosen = keep_original
        elif source == first[0].lower():
            # First draw is the same word up to case: use the second draw.
            chosen = second
        elif '.' in first[0]:
            # First draw contains a dot: use the second unless it has one too.
            chosen = keep_original if '.' in second[0] else second
        else:
            chosen = first
        substitutes.append(chosen)

    # Re-join the tokens with single spaces, attaching tokens that start with
    # an apostrophe or are punctuation directly to the preceding token.
    sentence = "".join(
        tok if tok.startswith("'") or tok in string.punctuation else " " + tok
        for tok in tokens
    ).strip()

    # NOTE(review): str.replace works on substrings of the whole sentence, so
    # a picked word can also be rewritten inside longer words and inside
    # earlier replacements -- confirm this is intended.
    for source, substitute in zip(picked, substitutes):
        sentence = sentence.replace(source, substitute[0])

    neg_data_en.append(sentence)



In [0]:

print('\n\n Writing English Negative Samples to directory')

# Pickle the generated English negative samples.
neg_path_en = '/u/bhardwas/en_fr_data/neg_data_fr_en_en'
with open(neg_path_en, 'wb') as sink:
    pickle.dump(neg_data_en, sink)

# Rebind the English corpus, samples and dictionary to 0 -- presumably to
# let the large objects be garbage-collected before the French data loads.
train_ep_en = neg_data_en = dict_en = 0



print('\n\n Reading French europarl from directory')
train_ep_fr = []

fr = '/u/bhardwas/en_fr_data/europarl/fr-en/europarl-v7.fr-en.fr'

# Normalise each line: drop every character that is neither a word character
# nor whitespace, trim surrounding spaces/newlines and lowercase the result.
# The encoding is passed explicitly so that accented characters are decoded
# the same way regardless of the machine's locale.
# NOTE(review): assumes the Europarl file is UTF-8 encoded -- confirm.
with open(fr, encoding='utf-8') as f:
    frc = [re.sub(r'[^\w\s]', '', line).strip(' \n').lower() for line in tqdm(f)]

# Tokenize after the file has been closed; the handle is no longer needed.
# NOTE(review): word_tokenize is called with its default language setting --
# confirm that is what is wanted for French text.
frc = [nltk.word_tokenize(line) for line in tqdm(frc)]

# Stored as a one-element list: consumers read the sentences from
# train_ep_fr[0].
train_ep_fr.append(frc)

print ("\n\n Reading French Dictionary")

# French word -> nearest-neighbour list, loaded from a pickle file.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open ('/u/bhardwas/en_fr_data/nn_dict_fr', 'rb') as fp:
    dict_fr = pickle.load(fp)
    


In [0]:

import string
print('\n\n Start replacing with Nearest Neighbors for French data')

print ("Random number with seed 101")
random.seed( 101 )

# Build one "negative" sentence per French input sentence by swapping its
# longer words for entries drawn from the French nearest-neighbour table.
neg_data_fr = []
for line in tqdm(train_ep_fr[0]):

    # Only tokens longer than four characters are candidates.
    big_word = [word for word in line if len(word) > 4]
    sz = round(len(big_word) * 1) #100% replacement

    # Sampling all candidates returns them in a random order.
    rand = random.sample(big_word, sz)

    replaced_words = []
    for word_replace in rand:
        # Placeholder meaning "leave this word unchanged"; it mimics the
        # indexable shape of a neighbour entry so that [0] yields the word.
        unchanged = (word_replace, '1')

        if word_replace not in dict_fr:
            replaced_words.append(unchanged)
            continue

        # Draw from the FRENCH table.  The previous code indexed dict_en
        # here, which has been rebound to 0 by this point (TypeError) and
        # would hold English neighbours in any case.
        # Entries are indexed with [0] to obtain the replacement text --
        # presumably (word, score) pairs; verify against the model output.
        test_word = random.choices(dict_fr[word_replace], k=2)

        if len(test_word[0][0]) > 3 * len(word_replace):
            # First draw is more than three times as long as the word.
            replaced_words.append(unchanged)
        elif word_replace == test_word[0][0].lower():
            # First draw is the same word up to case: use the second draw.
            replaced_words.append(test_word[1])
        elif '.' in test_word[0][0]:
            # First draw contains a dot: use the second unless it has one too.
            if '.' in test_word[1][0]:
                replaced_words.append(unchanged)
            else:
                replaced_words.append(test_word[1])
        else:
            replaced_words.append(test_word[0])

    # Re-join the tokens with single spaces, attaching tokens that start with
    # an apostrophe or are punctuation directly to the preceding token.
    line = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in line]).strip()

    # NOTE(review): str.replace works on substrings of the whole sentence, so
    # a picked word can also be rewritten inside longer words and inside
    # earlier replacements -- confirm this is intended.
    for old, nn in zip(rand, replaced_words):
        line = line.replace(old, nn[0])

    neg_data_fr.append(line)


