# Speech Recognition | Create word embeddings
### Aditya Hajare | https://github.com/aditya43

In [1]:
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
import re

In [2]:
# Fetch the NLTK stop-word corpus so that stopwords.words() below can load it.
import nltk
nltk.download('stopwords')

# Build the English stop-word collection once as a set; get_file_data() tests
# membership against it for every word when stop-word removal is enabled.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adiinviter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Read data from file.

In [3]:
def get_file_data(stop_word_removal='no',
                  file_path='/Users/adiinviter/work/AdityaHajare/Assignments/w2v.txt'):
    """Read a text file and return its sentences as cleaned strings.

    The file content is split on '.', and each piece is reduced to its
    alphabetic words (runs of A-Z / a-z). Words of length 1 are dropped.

    Args:
        stop_word_removal: Pass 'yes' to also drop words found in the
            module-level ``stop_words`` set. The comparison is done on the
            lower-cased word, so capitalised stop words such as "The" are
            removed too. Any other value keeps stop words.
        file_path: Path of the text file to read. Defaults to the original
            hard-coded location.

    Returns:
        A list with one string per '.'-separated piece. Each kept word is
        prefixed with a single space (e.g. ' word embeddings'); a piece with
        no kept words yields an empty string.
    """
    with open(file_path) as f:
        file_contents = f.read()

    remove_stop_words = stop_word_removal == 'yes'
    text = []
    for sentence in file_contents.split('.'):
        kept_words = []
        for word in re.findall("[A-Za-z]+", sentence):
            if len(word) <= 1:
                continue
            # Lower-case before the lookup: the stop-word list is lower-case,
            # while sentence-initial words in the text are capitalised.
            if remove_stop_words and word.lower() in stop_words:
                continue
            kept_words.append(word)
        # Keep the historical output format: every word preceded by one space.
        text.append(''.join(' ' + word for word in kept_words))
    return text

## Generate variables

##### word_to_index: A dictionary mapping each word to an integer value {'aditya': 0, 'hajare': 1}
##### index_to_word: A dictionary mapping each integer value back to its word {0: 'aditya', 1: 'hajare'}
##### corpus: The entire data consisting of all the words
##### vocab_size: Number of unique words in the corpus

In [4]:
def generate_dictinoary_data(text):
    """Build the vocabulary lookups and the flat corpus from cleaned sentences.

    Args:
        text: Iterable of strings; each string is split on whitespace and
            every token is lower-cased.

    Returns:
        A tuple ``(word_to_index, index_to_word, corpus, vocab_size,
        length_of_corpus)`` where indices are assigned in order of first
        appearance, ``corpus`` is the list of all lower-cased tokens,
        ``vocab_size`` is the number of distinct tokens and
        ``length_of_corpus`` is the total token count.
    """
    # Flatten all sentences into a single lower-cased token stream.
    corpus = [token.lower() for sentence in text for token in sentence.split()]

    # Assign the next free index to each token the first time it is seen.
    word_to_index = {}
    for token in corpus:
        if token not in word_to_index:
            word_to_index[token] = len(word_to_index)

    # The reverse lookup is the exact inverse of the forward one.
    index_to_word = {idx: token for token, idx in word_to_index.items()}

    return word_to_index, index_to_word, corpus, len(word_to_index), len(corpus)

## Testing generate_dictinoary_data() definition.

In [18]:
# Load the sentences and derive the vocabulary structures from them.
text = get_file_data()
(word_to_index,
 index_to_word,
 corpus,
 vocab_size,
 length_of_corpus) = generate_dictinoary_data(text)

# Dump every derived value next to its label for a quick visual check.
for label, value in (
    ('Number of unique words:', vocab_size),
    ('word_to_index : ', word_to_index),
    ('index_to_word : ', index_to_word),
    ('corpus:', corpus),
    ('Length of corpus :', length_of_corpus),
):
    print(label, value)

Number of unique words: 261
word_to_index :  {'in': 0, 'linguistics': 1, 'word': 2, 'embeddings': 3, 'were': 4, 'discussed': 5, 'the': 6, 'research': 7, 'area': 8, 'of': 9, 'distributional': 10, 'semantics': 11, 'it': 12, 'aims': 13, 'to': 14, 'quantify': 15, 'and': 16, 'categorize': 17, 'semantic': 18, 'similarities': 19, 'between': 20, 'linguistic': 21, 'items': 22, 'based': 23, 'on': 24, 'their': 25, 'properties': 26, 'large': 27, 'samples': 28, 'language': 29, 'data': 30, 'underlying': 31, 'idea': 32, 'that': 33, 'is': 34, 'characterized': 35, 'by': 36, 'company': 37, 'keeps': 38, 'was': 39, 'popularized': 40, 'firth': 41, 'technique': 42, 'representing': 43, 'words': 44, 'as': 45, 'vectors': 46, 'has': 47, 'roots': 48, 'with': 49, 'development': 50, 'vector': 51, 'space': 52, 'model': 53, 'for': 54, 'information': 55, 'retrieval': 56, 'reducing': 57, 'number': 58, 'dimensions': 59, 'using': 60, 'singular': 61, 'value': 62, 'decomposition': 63, 'then': 64, 'led': 65, 'introduction'


## Generate training data

#### The training data is in the following format :

#####   Example:

#####   Window size = 2, Vocab size = 9


#####   We set the indices to 1 according to the word_to_index dict, e.g. best: 0, so we set the 0th index to 1 to denote "best"

#####   Target word = best    
#####   Context words = (way,to)
#####   Target_word_one_hot_vector = [1, 0, 0, 0, 0, 0, 0, 0, 0]
#####   Context_word_one_hot_vector = [0, 1, 1, 0, 0, 0, 0, 0, 0]

#####   Target word = way    
#####   Context words = (best,to,success)
#####   Target_word_one_hot_vector = [0, 1, 0, 0, 0, 0, 0, 0, 0]
#####   Context_word_one_hot_vector= [1, 0, 1, 1, 0, 0, 0, 0, 0]


In [19]:
def get_one_hot_vectors(target_word,context_words,vocab_size,word_to_index):
    """Encode a target word and its context words as indicator vectors.

    Args:
        target_word: Word whose position is set to 1 in the target vector.
        context_words: Iterable of words whose positions are set to 1 in the
            context vector (a multi-hot vector; repeated words set the same
            position once).
        vocab_size: Length of both returned vectors.
        word_to_index: Dict mapping each word to its position in the vectors.

    Returns:
        A tuple ``(trgt_word_vector, ctxt_word_vector)`` of NumPy arrays of
        length ``vocab_size`` filled with 0 except at the encoded positions.

    Raises:
        KeyError: If ``target_word`` or any context word is missing from
            ``word_to_index``.
    """
    # Direct lookup instead of .get(): .get() returns None for an unknown
    # word, and NumPy interprets vec[None] = 1 as a newaxis broadcast that
    # silently sets EVERY position to 1.
    # Example: target_word = 'best' with word_to_index['best'] == 0 and
    # vocab_size == 9 gives [1, 0, 0, 0, 0, 0, 0, 0, 0].
    trgt_word_vector = np.zeros(vocab_size)
    trgt_word_vector[word_to_index[target_word]] = 1

    # Same encoding for the context, one position per context word.
    ctxt_word_vector = np.zeros(vocab_size)
    for word in context_words:
        ctxt_word_vector[word_to_index[word]] = 1

    return trgt_word_vector,ctxt_word_vector

In [20]:
def generate_training_data(corpus,window_size,vocab_size,word_to_index,length_of_corpus,sample=None):
    """Build (target, context) indicator-vector pairs for every corpus word.

    For the word at position ``i`` the context is made of up to
    ``window_size`` words before it (nearest first) followed by up to
    ``window_size`` words after it (nearest first). The window is clamped to
    the corpus bounds, so words near either end simply get fewer context
    words.

    Example with window_size = 2 and a corpus of 10 words:
        target index 0 -> context indices (1, 2)
        target index 4 -> context indices (3, 2, 5, 6)
        target index 9 -> context indices (8, 7)

    Args:
        corpus: Sequence of words in reading order.
        window_size: Maximum number of context words taken on each side.
        vocab_size: Length of the vectors produced by get_one_hot_vectors.
        word_to_index: Dict mapping each word to its vector position.
        length_of_corpus: Kept for backward compatibility; the window is
            always clamped with ``len(corpus)`` so all positions use the same
            bound.
        sample: When not None, the raw words of every pair are collected too.

    Returns:
        A tuple ``(training_data, training_sample_words)``. ``training_data``
        holds one ``[target_vector, context_vector]`` entry per corpus word.
        ``training_sample_words`` holds the matching
        ``[target_word, context_words]`` entries, or is empty when ``sample``
        is None.
    """
    training_data =  []
    training_sample_words =  []
    corpus_len = len(corpus)

    for i,target_word in enumerate(corpus):

        # Walk backwards from the word just before the target, stopping at
        # the start of the corpus so no negative index can wrap around.
        first_before = max(i - window_size, 0)
        context_words = [corpus[x] for x in range(i - 1, first_before - 1, -1)]

        # Walk forwards from the word just after the target, stopping at the
        # end of the corpus so short corpora never raise IndexError.
        last_after = min(i + window_size, corpus_len - 1)
        context_words.extend(corpus[x] for x in range(i + 1, last_after + 1))

        trgt_word_vector,ctxt_word_vector = get_one_hot_vectors(target_word,context_words,vocab_size,word_to_index)
        training_data.append([trgt_word_vector,ctxt_word_vector])

        if sample is not None:
            training_sample_words.append([target_word,context_words])

    return training_data,training_sample_words

In [21]:
# Number of context words taken on each side of a target word.
window_size = 2
# Pass the variable (not a repeated literal) so changing window_size above
# actually changes the generated data; 'yes' also collects the raw words.
training_data,training_sample_words = generate_training_data(corpus,window_size,vocab_size,word_to_index,length_of_corpus,'yes')

In [23]:
# Show the raw words of every training pair; the matching indicator vectors
# live at the same position in training_data.
for target_word, context_words in training_sample_words:
    print('*' * 50)
    # One-element tuples keep %-formatting safe whatever the value's type.
    print('Target word:%s' % (target_word,))
    print('Context word:%s' % (context_words,))

**************************************************
Target word:in
Context word:['linguistics', 'word']
**************************************************
Target word:linguistics
Context word:['in', 'word', 'embeddings']
**************************************************
Target word:word
Context word:['linguistics', 'in', 'embeddings', 'were']
**************************************************
Target word:embeddings
Context word:['word', 'linguistics', 'were', 'discussed']
**************************************************
Target word:were
Context word:['embeddings', 'word', 'discussed', 'in']
**************************************************
Target word:discussed
Context word:['were', 'embeddings', 'in', 'the']
**************************************************
Target word:in
Context word:['discussed', 'were', 'the', 'research']
**************************************************
Target word:the
Context word:['in', 'discussed', 'research', 'area']
**************************************