In [1]:
import io
import nltk
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
import re
import numpy as np
import numpy.linalg as lin
import pandas as pd
import unicodedata
import pickle
import warnings 
# English stop words from NLTK's stopwords corpus, stored as a set because
# sentence_embedding tests every token for membership in it.
stop_words = set(stopwords.words('english'))
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')


In [2]:
# Read the text-format vectors into a dict mapping token -> float64 vector.
# Each line is split on single spaces: the first field is the key, the
# remaining fields are parsed as floats.
# NOTE(review): nothing here skips a leading "<count> <dim>" header line, so
# such a line would be stored like any other entry — confirm against the file.
fasttext={}
# The context manager closes the handle even if a line fails to parse.
with open('fasttext.vec','r',encoding='utf-8') as file:
    for line in file:
        fields=line.strip("\n").strip(" ").split(" ")
        fasttext[fields[0]]=np.array([float(value) for value in fields[1:]])

In [3]:
# Remove the '999994' key from the embedding table.
# NOTE(review): presumably this is the vocabulary count from the .vec header
# line that the loader stored as a token — confirm against the file.
# pop() with a default keeps the cell idempotent: re-running it after the key
# is already gone no longer raises KeyError.
fasttext.pop('999994', None)

In [4]:
def strip_accents(s):
    """Return ``s`` without combining marks.

    The string is NFD-normalised and every character whose Unicode category
    is ``Mn`` is dropped; everything else is kept in its original order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)

In [5]:
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with each offending character replaced by one space, so the
        length of the string is unchanged.
    """
    # Raw string: in a normal string literal "\s" is an invalid escape
    # sequence, which current Python versions warn about.
    return re.sub(r'[^a-zA-Z\s]', ' ', text)

In [6]:
def word_embedding(word):
    """Look up the vector for ``word`` in the ``fasttext`` table.

    The lookup key is the lower-cased word. A known word yields a float64
    copy of its stored vector; an unknown word yields 300 zeros.
    """
    vector = fasttext.get(word.lower())
    if vector is None:
        return np.zeros(300)
    return np.array(vector, dtype=np.float64)

In [7]:
def sentence_embedding(sentence):
    """Embed ``sentence`` as the mean of the vectors of its non-stop-word tokens.

    The text is accent-stripped, non-letter characters are replaced by
    spaces, and the result is tokenised with ``word_tokenize``. Tokens found
    in ``stop_words`` are discarded and the remaining tokens are embedded
    with ``word_embedding`` and averaged element-wise.

    Parameters
    ----------
    sentence : str
        Raw sentence (or longer text) to embed.

    Returns
    -------
    numpy.ndarray
        Float vector of length 300. A zero vector is returned when no token
        survives cleaning and stop-word removal.
    """
    cleaned = remove_special_characters(strip_accents(sentence))
    # Compare lower-cased tokens: word_embedding lower-cases before its lookup,
    # so a capitalised stop word such as "The" would otherwise slip past a
    # filter whose entries are expected to be lower-case (NLTK's English list).
    tokens = [token for token in word_tokenize(cleaned) if token.lower() not in stop_words]
    # Check AFTER filtering: a sentence made only of stop words must still
    # produce a full-width vector, not a zero-length array.
    if not tokens:
        return np.zeros(300)
    return np.mean([word_embedding(token) for token in tokens], axis=0)

In [8]:
# Load the source table; the cells below read its 'Commentary' column
# (presumably one row per verse — confirm against the CSV).
data=pd.read_csv('../English.csv')

# Sentence By Sentence

In [9]:
# Width of the embedding vectors; matches the 300-element zero vector that
# word_embedding returns for unknown words.
EMBEDDING_DIM = 300

# Three parallel lists, one entry per row of `data`:
#   verse_embeddings_sentence - list of per-sentence embeddings
#   verse_embeddings_max      - the sentence embedding with the largest L2 norm
#   verse_embeddings_mean     - element-wise mean of the sentence embeddings
verse_embeddings_sentence = []
verse_embeddings_max = []
verse_embeddings_mean = []
# Iterate the column directly instead of data.loc[i]['Commentary'], which
# builds a whole row Series per step and assumes the index labels are 0..n-1.
for commentary in data['Commentary']:
    sentences = sent_tokenize(strip_accents(commentary))
    # A zero-length sentence embedding would make the vectors ragged and wreck
    # the pooling below, so it is replaced by a zero vector of full width.
    embeddings = [
        vector if vector.size else np.zeros(EMBEDDING_DIM)
        for vector in (sentence_embedding(sentence) for sentence in sentences)
    ]
    verse_embeddings_sentence.append(embeddings)

    if not embeddings:
        # No sentences at all: keep the three lists aligned with `data`
        # rather than failing on max() of an empty sequence.
        verse_embeddings_max.append(np.zeros(EMBEDDING_DIM))
        verse_embeddings_mean.append(np.zeros(EMBEDDING_DIM))
        continue

    # "Max pooling" here selects one whole sentence vector: the one with the
    # largest L2 norm (first one on ties), not an element-wise maximum.
    norms = [lin.norm(vector) for vector in embeddings]
    verse_embeddings_max.append(embeddings[int(np.argmax(norms))])

    # Mean pooling: element-wise average over the sentence vectors.
    verse_embeddings_mean.append(np.mean(embeddings, axis=0))

# Whole Commentary (embedded as a single sentence)

In [11]:
# One embedding per row of `data`: the entire commentary text is passed to
# sentence_embedding as if it were a single sentence.
# The column is iterated directly instead of data.loc[i]['Commentary'], which
# builds a whole row Series per step and assumes the index labels are 0..n-1.
verse_embeddings_whole = [
    sentence_embedding(strip_accents(commentary))
    for commentary in data['Commentary']
]

In [12]:
# Persist the per-sentence embeddings; the context manager closes the file
# even if pickling raises.
with open('sentence.pkl','wb') as file:
    pickle.dump(verse_embeddings_sentence,file)

In [13]:
# Persist the whole-commentary embeddings; the context manager closes the
# file even if pickling raises.
with open('whole.pkl','wb') as file:
    pickle.dump(verse_embeddings_whole,file)

In [14]:
# Persist the two pooled variants, 'max.pkl' first and then 'mean.pkl'.
# Each file is written inside its own context manager so the handle is
# closed even if pickling raises.
for path, payload in (('max.pkl', verse_embeddings_max), ('mean.pkl', verse_embeddings_mean)):
    with open(path,'wb') as file:
        pickle.dump(payload,file)

In [15]:
# Persist the in-memory embedding table as a pickle; the context manager
# closes the file even if pickling raises.
with open('fasttext.pkl','wb') as file:
    pickle.dump(fasttext,file)