In [1]:
import io
import re
import numpy as np
import pandas as pd
import numpy.linalg as lin
import warnings 
import pickle
warnings.filterwarnings('ignore')

In [2]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

def t2h(text):
    """Transliterate ``text`` from Telugu script into Devanagari script.

    Thin wrapper around ``transliterate`` that fixes the source scheme to
    ``sanscript.TELUGU`` and the target scheme to ``sanscript.DEVANAGARI``.
    """
    source_script = sanscript.TELUGU
    target_script = sanscript.DEVANAGARI
    return transliterate(text, source_script, target_script)

In [3]:
# Telugu stop words dropped before averaging word vectors.
# Tokens are produced by whitespace splitting, so the entries that contain a
# space can never equal a single token.
stopwords = [
    'కూడా', 'ఉన్నారు', 'ఎవరైనా', 'ఎప్పుడు', 'ఎక్కడైనా',
    'అందుబాటులో', 'ఒక ప్రక్కన', 'సంబంధం', 'మాత్రమే', 'అడగండి',
    'గా', 'మధ్య', 'ప్రకారం', 'అనుమతించు', 'మెచ్చుకో',
    'వద్ద', 'తగిన', 'అడ్డంగా', 'అడగడం', 'ఇప్పటికే',
    'కనిపిస్తాయి', 'కాదు', 'మరియు', 'మరొక', 'అనుగుణంగా',
    'అయితే', 'చేయగలిగింది', 'దాదాపు', 'గురించి', 'పై',
    'వెంట', 'వేరుగా', 'చుట్టూ', 'దూరంగా', 'ఏ',
    'నిజంగా', 'అనుమతిస్తుంది', 'అందరూ', 'ఎవరో ఒకరు', 'వ్యతిరేకంగా',
    'ఏదైనా', 'తర్వాత', 'మళ్ళీ', 'ఏమైనప్పటికి', 'ఒక',
]

In [4]:
# Telugu independent vowel letters, stored as a set of strings obtained by
# splitting on whitespace. The last two entries are two-code-point strings
# (a vowel followed by a sign), not single characters.
vowels=set("అ ఆ ఇ ఈ ఉ ఊ ఋ ౠ ఌ ౡ ఎ ఏ ఐ ఒ ఓ ఔ అం అః".split())

# Tab-separated consonant letters; still one string here, split on "\t" in a
# later cell.
conso="క	ఖ	గ	ఘ	ఙ	చ	ఛ	జ	ఝ	ఞ	ట	ఠ	డ	ఢ	ణ	త	థ	ద	ధ	న	ప	ఫ	బ	భ	మ	య	ర	ల	వ	శ	ష	స	హ	ళ	క్ష	ఱ"

# Tab-separated dependent signs (vowel signs and other marks); also still one
# string here, split on "\t" in a later cell.
matra = "ఀ	ఁ	ం	ః	ఄ	ఽ	ా	ి	ీ	ు	ూ	ృ	ౄ	ె	ే	ై	ొ	ో	ౌ	్	ౕ	ౖ	ౢ	ౣ"

# The virama sign on its own; not referenced elsewhere in the visible code.
halantha='్'

In [5]:
# Turn the character inventories into de-duplicated lists.
# `matra` enters this cell as a tab-separated string but leaves it as a list,
# so it is only split while it is still a string; without the guard, running
# this cell a second time raises AttributeError on `.split`.
consonants = list(set(conso.split("\t")))
if isinstance(matra, str):
    matra = matra.split("\t")
matra = list(set(matra))
vowels = list(vowels)

# Base letters only (vowels followed by consonants); the dependent signs stay
# in `matra`.
alphabets = vowels + consonants

In [6]:
# Characters treated as sentence terminators.
delimiters = [
    "।",  # danda
    "?",
    "!",
    ".",
]

In [7]:
# Load the embedding table into a dict mapping token -> 1-D float array.
# Each usable line is parsed as "<token> <float> <float> ...".
# The file is read as bytes and decoded with errors='replace' so that invalid
# byte sequences do not abort the load; the context manager guarantees the
# handle is closed.
with open('glove/te-d100-glove.txt','rb') as glove_file:
    txt = glove_file.read().decode(errors='replace')

glove = {}
for line in txt.split("\n"):
    fields = line.split()
    if not fields:
        # Blank line (including the one after a trailing newline).
        continue
    try:
        glove[fields[0]] = np.array([float(value) for value in fields[1:]])
    except ValueError:
        # A component is not a parseable float: skip the malformed row.
        continue

In [8]:
def remove_special_characters(text):
    """Replace every character outside the known inventory with a space.

    A character is kept when it appears in ``alphabets``, ``matra`` or
    ``delimiters``; any other character becomes a space. Runs of whitespace
    are then collapsed to single spaces and both ends are trimmed.

    Args:
        text: String to clean.

    Returns:
        The cleaned, whitespace-normalised string.
    """
    # Build the lookup once per call rather than concatenating the three
    # lists (and scanning the result linearly) for every single character.
    allowed = set(alphabets).union(matra, delimiters)
    kept = "".join(char if char in allowed else " " for char in text)
    return " ".join(kept.split())
def sent_tokenize(text):
    """Split ``text`` into sentences at any character in ``delimiters``.

    The text is first cleaned with ``remove_special_characters``. Each piece
    between delimiters is stripped, and empty pieces are discarded.

    Args:
        text: String to split.

    Returns:
        List of non-empty, stripped sentence strings (delimiters removed).
    """
    cleaned = remove_special_characters(text)
    # Alternation of the escaped delimiters. Joining them with '.' instead
    # builds a pattern that only matches all delimiters in sequence, each
    # separated by one arbitrary character, so the text is never split.
    pattern = '|'.join(map(re.escape, delimiters))
    pieces = (piece.strip() for piece in re.split(pattern, cleaned))
    return [piece for piece in pieces if piece]

In [9]:
def word_embedding(word):
    """Return the GloVe vector for ``word``, or a zero vector if unavailable.

    The word is passed through ``t2h`` and the result is used as the key
    into ``glove``.

    Args:
        word: Token to look up.

    Returns:
        The stored vector for the word, or a 100-element float zero vector
        when the lookup or the transliteration fails.
    """
    try:
        return glove[t2h(word)]
    except Exception:
        # Deliberate best-effort: unknown words and transliteration errors
        # map to zeros. `Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        return np.zeros(100)

In [10]:
def sentence_embedding(sentence):
    """Average the word vectors of ``sentence`` after stop-word removal.

    The sentence is cleaned with ``remove_special_characters``, split on
    whitespace, and tokens present in ``stopwords`` are dropped. The
    remaining tokens are embedded with ``word_embedding`` and averaged
    component by component.

    Args:
        sentence: Text to embed.

    Returns:
        The component-wise mean of the word vectors, or a 100-element float
        zero vector when no token remains.
    """
    tokens = remove_special_characters(sentence).split()
    content_words = [token for token in tokens if token not in stopwords]
    if not content_words:
        return np.zeros(100)
    vectors = [word_embedding(token) for token in content_words]
    # zip(*vectors) yields one tuple per component across all word vectors.
    return np.array([sum(column) / len(column) for column in zip(*vectors)])

In [11]:
# Load the source table; later cells read its 'Commentary' column row by row.
data=pd.read_csv('../telugu.csv')

# Sentence By Sentence

In [12]:
# Three parallel lists, one entry per row of `data`:
#   verse_embeddings_sentence - list of per-sentence vectors
#   verse_embeddings_max      - the sentence vector with the largest L2 norm
#   verse_embeddings_mean     - component-wise mean of the sentence vectors
verse_embeddings_sentence=[]
verse_embeddings_max=[]
verse_embeddings_mean=[]
# Iterate the column directly instead of the chained data.loc[i]['Commentary']
# lookup, which builds a whole row Series for every access.
for text in data['Commentary']:
    sentences = sent_tokenize(text)
    embeddings = [sentence_embedding(sentence) for sentence in sentences]
    verse_embeddings_sentence.append(embeddings)

    if not embeddings:
        # No sentence survived tokenisation: max() on an empty list would
        # raise ValueError. Use the same 100-element zero vector that
        # sentence_embedding returns for empty input.
        verse_embeddings_max.append(np.zeros(100))
        verse_embeddings_mean.append(np.zeros(100))
        continue

    # "Max pooling" here selects the first sentence vector with maximal norm.
    norms = [lin.norm(vector) for vector in embeddings]
    verse_embeddings_max.append(embeddings[norms.index(max(norms))])

    # Mean pooling: average each component across the sentence vectors.
    verse_embeddings_mean.append(
        np.array([sum(column) / len(column) for column in zip(*embeddings)])
    )

# Whole

In [13]:
# One vector per row, embedding the full commentary text as a single unit.
verse_embeddings_whole = [
    sentence_embedding(data.loc[row]['Commentary'])
    for row in range(len(data))
]

In [14]:
# Persist the three per-sentence pooling results. The context manager closes
# each file even if pickle.dump raises.
for path, payload in (
    ('sentence.pkl', verse_embeddings_sentence),
    ('max.pkl', verse_embeddings_max),
    ('mean.pkl', verse_embeddings_mean),
):
    with open(path, 'wb') as file:
        pickle.dump(payload, file)

In [15]:
# Persist the whole-text embeddings; the context manager closes the file even
# if pickle.dump raises.
with open('whole.pkl','wb') as file:
    pickle.dump(verse_embeddings_whole,file)

In [16]:
# Persist the parsed embedding table; the context manager closes the file even
# if pickle.dump raises.
with open('glove.pkl','wb') as file:
    pickle.dump(glove,file)