# Import Required Modules

In [1]:
import json
import tqdm
import os
import pandas as pd
import re
import math

# List Files Available

In [2]:
# Directory that holds the input TSV files; the path is relative to the
# working directory the notebook is run from.
data_dir = "./data/"

In [3]:
# Display the names of the entries found in data_dir.
os.listdir(data_dir)

['english-hindi-all-linked.tsv',
 'english-hindi-assamese-linked.tsv',
 'english-hindi-bengali-gujarati-kannada-malayalam-punjabi-telugu-tamil-urdu-linked.tsv',
 'english-hindi-bengali-linked.tsv',
 'english-hindi-bodo-linked.tsv',
 'english-hindi-gujarati-linked.tsv',
 'english-hindi-kannada-linked.tsv',
 'english-hindi-kashmiri-linked.tsv',
 'english-hindi-konkani-linked.tsv',
 'english-hindi-linked.tsv',
 'english-hindi-malayalam-linked.tsv',
 'english-hindi-manipuri-linked.tsv',
 'english-hindi-marathi-linked.tsv',
 'english-hindi-nepali-linked.tsv',
 'english-hindi-oriya-linked.tsv',
 'english-hindi-punjabi-linked.tsv',
 'english-hindi-sanskrit-linked.tsv',
 'english-hindi-tamil-linked.tsv',
 'english-hindi-telugu-linked.tsv',
 'english-hindi-urdu-linked.tsv']

# Pre-Processing Functions

In [4]:
def getSynsetColumns(df):
    """Return the column names of ``df`` that contain the substring "_synset".

    The names are returned in the same order as they appear in ``df.columns``.
    """
    return [name for name in list(df.columns) if "_synset" in name]

def getGlossColumns(df):
    """Return the column names of ``df`` that contain the substring "_gloss".

    The names are returned in the same order as they appear in ``df.columns``.
    """
    return [name for name in list(df.columns) if "_gloss" in name]

In [6]:
def splitWord(word):
    """Turn an underscore-joined term into a space-separated one.

    Words without an underscore are returned unchanged.
    """
    if "_" not in word:
        return word
    pieces = word.split("_")
    return " ".join(pieces)

In [7]:
def removeTags(word):
    """Strip the literal markers "(a)", "(p)" and "(ip)" from ``word``.

    The markers are removed one after another in that order, so text that
    only becomes a marker after an earlier removal is stripped as well.
    """
    for marker in ("(a)", "(p)", "(ip)"):
        word = word.replace(marker, "")
    return word

In [8]:
def isAlpha(word):
    """Return the result of ``word.isalpha()``.

    That is True only for a non-empty string made up entirely of
    alphabetic characters.
    """
    onlyLetters = word.isalpha()
    return onlyLetters

In [9]:
def hasDigit(s):
    """Return True if at least one character of ``s`` is a digit (``str.isdigit``)."""
    for character in s:
        if character.isdigit():
            return True
    return False

In [10]:
def isRomanNumeral(word):
    """Return True if ``word`` is a Roman numeral written all in upper or all in lower case.

    Both patterns are always evaluated and the result is True only when
    exactly one of them matches. A string that satisfies both patterns
    (such as the empty string) is therefore reported as not a numeral.
    """
    upperPattern = "^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$"
    lowerPattern = "^m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$"
    matchesUpper = re.search(upperPattern, word) is not None
    matchesLower = re.search(lowerPattern, word) is not None
    # Exactly one of the two may match (same truth table as XOR).
    return matchesUpper != matchesLower

In [11]:
def processWord(word):
    """Clean a single synset word and decide whether to keep it.

    Underscores are turned into spaces and the "(a)"/"(p)"/"(ip)" markers
    are removed. The cleaned word is returned unless it is at most one
    character long, is numeric, contains a digit, or looks like a Roman
    numeral; in those cases ``None`` is returned.

    Note: the Roman-numeral check also rejects ordinary words that happen
    to be valid numerals (for example "mix").
    """
    word = removeTags(splitWord(word))
    if len(word) <= 1:
        return None
    if word.isnumeric() or hasDigit(word) or isRomanNumeral(word):
        return None
    return word

In [12]:
def isNan(word):
    """Return True if ``word`` converts to a floating-point NaN.

    Args:
        word: Any value. Strings such as "nan" (the result of ``str()`` on a
            missing pandas cell) convert to NaN and yield True.

    Returns:
        bool: True when ``float(word)`` succeeds and the result is NaN;
        False when the value is a regular number or cannot be converted.
    """
    try:
        return math.isnan(float(word))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not NaN". A bare except would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return False

# Load the TSVs into Dataframes

In [13]:
# Read every entry of data_dir as a tab-separated file into a DataFrame,
# keyed by the part of the file name that precedes the first ".".
data_dict = {}
for file in tqdm.tqdm(os.listdir(data_dir)):
    filename = file.partition(".")[0]
    source_path = os.path.join(data_dir, file)
    data_dict[filename] = pd.read_csv(source_path, sep="\t")

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.27it/s]


# Processing function for Dataframe

In [14]:
def _cleanSynsetCell(cell):
    """Normalise one raw synset cell into a ';'-joined string of cleaned words."""
    cleaned = []
    for raw in str(cell).strip().split(","):
        word = processWord(raw.strip())
        # processWord returns None for rejected words; a missing cell
        # stringifies to "nan" and is dropped by the isNan check.
        if word is None or isNan(word):
            continue
        word = word.lower()
        for junk in ('"', "\\", ".", ";"):
            word = word.replace(junk, "")
        # Filter empties AFTER stripping punctuation so that words made up
        # only of the stripped characters do not survive as empty tokens.
        if word != "":
            cleaned.append(word)
    return ";".join(cleaned)


def _cleanGlossCell(cell):
    """Normalise one raw gloss cell into a ';'-joined string of cleaned fragments."""
    text = str(cell).replace(",", ";").replace("|", ";")
    fragments = []
    for fragment in text.split(";"):
        for junk in ('"', "।", "."):
            fragment = fragment.replace(junk, "")
        fragment = fragment.strip()
        if fragment != "" and not isNan(fragment):
            fragments.append(fragment)
    return ";".join(fragments)


def processIndoDf(df):
    """Clean a linked synset/gloss DataFrame and reshape it to one row per English word.

    Steps:
      1. Keep only the "*_synset*" and "*_gloss*" columns plus the English
         category column ("english_category", or "english_category_x" when
         the former is absent).
      2. Clean every synset cell and every gloss cell into a ';'-joined string.
      3. Group by the cleaned English synset string and the category, joining
         all other columns with ';'.
      4. Rename "english_synset_words" -> "english_word" and
         "english_category" -> "pos", then explode "english_word" so that each
         English word gets its own row.

    Args:
        df: DataFrame that contains at least "english_synset_words",
            "english_gloss" and one of the category columns. It is not modified.

    Returns:
        A new DataFrame with columns "english_word", "pos", "english_gloss",
        followed by the remaining synset/gloss columns in sorted order.

    Raises:
        KeyError: if neither category column is present.
        ValueError: if "english_synset_words" or "english_gloss" is missing.
    """
    synsetColumns = getSynsetColumns(df)
    glossColumns = getGlossColumns(df)
    requiredColumns = synsetColumns + glossColumns

    # Some inputs carry the category as "english_category_x" instead.
    if "english_category" in df.columns:
        categoryColumn = "english_category"
    else:
        categoryColumn = "english_category_x"

    # Selecting, renaming and copying yields a frame independent of ``df``,
    # so the column assignments below neither touch the input nor trigger
    # chained-assignment warnings.
    new_df = (
        df[requiredColumns + [categoryColumn]]
        .rename(columns={categoryColumn: "english_category"})
        .copy()
    )

    # One pass per column instead of one pass per cleaning step.
    for synsetColumn in synsetColumns:
        new_df[synsetColumn] = new_df[synsetColumn].apply(_cleanSynsetCell)

    for glossColumn in glossColumns:
        new_df[glossColumn] = new_df[glossColumn].apply(_cleanGlossCell)

    # The English synset string is a grouping key, so it is not aggregated.
    requiredColumns.remove("english_synset_words")

    # "english_gloss" is still in the list here on purpose: it must be aggregated.
    aggMap = {column: ";".join for column in requiredColumns}

    # Remove it afterwards so it can be placed explicitly in the column order.
    requiredColumns.remove("english_gloss")
    requiredColumns.sort()

    # NOTE(review): rows whose English synset cleans down to "" are grouped
    # together under an empty key and end up with an empty english_word;
    # confirm whether such rows should be dropped.
    new_df = new_df.groupby(["english_synset_words","english_category"]).agg(aggMap).reset_index()
    new_df = new_df[["english_synset_words","english_category","english_gloss"]+requiredColumns]
    new_df = new_df.rename(columns={"english_synset_words":"english_word"})
    new_df = new_df.rename(columns={"english_category":"pos"})
    new_df["english_word"] = new_df["english_word"].apply(lambda x:x.split(';'))
    new_df = new_df.explode(["english_word"])

    return new_df

# Process each dataframe in the main dictionary

In [15]:
# Replace each raw DataFrame stored in data_dict with its processed version.
for key in tqdm.tqdm(list(data_dict)):
    data_dict[key] = processIndoDf(data_dict[key])

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:17<00:00,  3.86s/it]


# Save the processed Dataframes as TSVs

In [16]:
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs("processed/tsv", exist_ok=True)
for key in tqdm.tqdm(data_dict.keys()):
    data_dict[key].to_csv(f"processed/tsv/{key}.tsv",sep='\t',index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:08<00:00,  2.24it/s]


# Save the Dataframes as json files

In [17]:
def dfToJson(df,filename):
    """Write ``df`` to ``filename`` as JSON, grouping rows by index value.

    The resulting object maps every index value to a list holding one
    dict (column name -> value) per row with that index value, in row
    order. The file is written as UTF-8 with an indent of 4 and without
    escaping non-ASCII characters.
    """
    entriesByWord = {}
    for word, row in df.iterrows():
        entriesByWord.setdefault(word, []).append(row.to_dict())
    with open(filename, 'w', encoding="utf-8") as handle:
        json.dump(entriesByWord, handle, indent=4, ensure_ascii=False)

In [18]:
# open() does not create missing directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
os.makedirs("processed/json", exist_ok=True)
for key in tqdm.tqdm(data_dict.keys()):
    # Index by English word only once, so that re-running this cell does not
    # fail with a KeyError on the already-consumed "english_word" column.
    if data_dict[key].index.name != "english_word":
        data_dict[key]=data_dict[key].set_index("english_word")
    dfToJson(data_dict[key],f"processed/json/{key}.json")

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:36<00:00,  1.82s/it]
