In [1]:
import pandas as pd
from tqdm import tqdm
import re
import string
from pickle import dump

In [2]:
data = pd.read_csv("./data.csv", encoding='utf-8')
data.head()

Unnamed: 0,Nepali,English
0,“मानौ एउटी स्त्रीसँग दशवटा चाँदीका सिक्काहरू छ...,"Or what woman, if she had ten drachma coins, i..."
1,ती दुष्ट मानिसहरू हिंस्रक सिंहहरू जस्तै अन्य प...,"He is like a lion that is greedy of his prey, ..."
2,प्रक्रिया दृश्य क्रम स्तम्भ,Process view sort column
3,जहा ट्याबहरु देखाइन सकिन्थ्यो वा सकिन्नथ्यो,Whether tooltips should be shown on widgets
4,अनुष्ठान अनुसार जहां केटि र महिलाहरु पूजाहार...,Ritual servitude where girls and women are ple...


# Removing Columns with NaN values

In [3]:
pd.isnull(data).values.any()

True

In [4]:
pd.isnull(data).sum()

Nepali     17
English     9
dtype: int64

In [5]:
data.isna().any()

Nepali     True
English    True
dtype: bool

In [6]:
np_nan = pd.isnull(data["Nepali"])
data[np_nan]

Unnamed: 0,Nepali,English
3017,,
24275,,It is necessary to recognize that the farmers ...
24800,,
29455,,While the number of lower secondary schools wa...
30774,,
37756,,D. level.
38730,,4. Mule tracks:
51280,,
55781,,B.
58450,,"Radio Broadcasting , Television Transmission a..."


In [7]:
en_nan = pd.isnull(data["English"])
data[en_nan]

Unnamed: 0,Nepali,English
3017,,
24800,,
30621,खर्चको व्यवस्था :,
30774,,
51280,,
60377,,
74068,,
113614,,
128349,,


In [8]:
data = data.dropna()
data.shape

(160241, 2)

In [9]:
data.isna().any()

Nepali     False
English    False
dtype: bool

In [10]:
pd.isnull(data).sum()

Nepali     0
English    0
dtype: int64

# Defining Functions for Preprocessing Data

In [11]:
def to_lower(text):
    return text.lower()

In [12]:
def remove_quotes(text):
    result = re.sub("'", '', text)
    return result

In [13]:
def remove_vertical_bar(text):
    result = re.sub("।", '', text)
    return result

In [14]:
def clean_punctuations(text):
    word_list = []
    # Defining a set of all the punctuations
    punctuations = set(string.punctuation)
    for word in text:
        if word not in punctuations:
            word_list.append(word)
        else:
            continue
    result = "".join(word_list)
    return result

In [15]:
def remove_extra_spaces(text):
    stripped_text = text.strip()
    result = re.sub(" +", " ", stripped_text)
    return result

In [16]:
def preprocess_data(language_column):
    result = []
    for sentence in tqdm(data[language_column].values):
        lowered_text = to_lower(sentence)
        non_quoted_text = remove_quotes(lowered_text)
        v_text = remove_vertical_bar(non_quoted_text)
        cleaned_text = clean_punctuations(v_text)
        final_text = remove_extra_spaces(cleaned_text)
        result.append(final_text)
    return result

# Function calling for Preprocessing Data

In [17]:
data["Nepali"] = preprocess_data("Nepali")
data["English"] = preprocess_data("English")
data.head()

100%|██████████| 160241/160241 [00:03<00:00, 45786.32it/s]
100%|██████████| 160241/160241 [00:02<00:00, 60366.40it/s]


Unnamed: 0,Nepali,English
0,“मानौ एउटी स्त्रीसँग दशवटा चाँदीका सिक्काहरू छ...,or what woman if she had ten drachma coins if ...
1,ती दुष्ट मानिसहरू हिंस्रक सिंहहरू जस्तै अन्य प...,he is like a lion that is greedy of his prey a...
2,प्रक्रिया दृश्य क्रम स्तम्भ,process view sort column
3,जहा ट्याबहरु देखाइन सकिन्थ्यो वा सकिन्नथ्यो,whether tooltips should be shown on widgets
4,अनुष्ठान अनुसार जहां केटि र महिलाहरु पूजाहारीह...,ritual servitude where girls and women are ple...


In [18]:
data["Nepali"] = data["Nepali"].apply(lambda x : 'START_TOKEN ' + x + ' END_TOKEN')
data.head()

Unnamed: 0,Nepali,English
0,START_TOKEN “मानौ एउटी स्त्रीसँग दशवटा चाँदीका...,or what woman if she had ten drachma coins if ...
1,START_TOKEN ती दुष्ट मानिसहरू हिंस्रक सिंहहरू ...,he is like a lion that is greedy of his prey a...
2,START_TOKEN प्रक्रिया दृश्य क्रम स्तम्भ END_TOKEN,process view sort column
3,START_TOKEN जहा ट्याबहरु देखाइन सकिन्थ्यो वा स...,whether tooltips should be shown on widgets
4,START_TOKEN अनुष्ठान अनुसार जहां केटि र महिलाह...,ritual servitude where girls and women are ple...


# Saving Preprocessed Data to csv file

In [19]:
data.to_csv("./cleaned_data.csv", index=False)

# Tokenize the text

In [20]:
def create_tokens(language_column):
    tokens_list = list()
    for sentence in tqdm(data[language_column].values):
        for word in sentence.split():
            tokens_list.append(word)
            
    print("The total words in", language_column, "text is:", len(tokens_list))
    
    tokens_set = set(tokens_list)
    print("The length of", language_column, "tokens is:", len(tokens_set))
    
    return tokens_set

In [21]:
np_tokens = create_tokens("Nepali")

100%|██████████| 160241/160241 [00:00<00:00, 346438.56it/s]


The total words in Nepali text is: 2505000
The length of Nepali tokens is: 186399


In [22]:
en_tokens = create_tokens("English")

100%|██████████| 160241/160241 [00:00<00:00, 464557.16it/s]


The total words in English text is: 2502007
The length of English tokens is: 64316


In [23]:
dump(np_tokens, open('./np_tokens.pkl', 'wb'))
dump(en_tokens, open('./en_tokens.pkl', 'wb'))