In [1]:
#library import
import pandas as pd # for data manipulation and analysis
import numpy as np # for scientific computing with Python

In [2]:
# Load the raw Nepali dataset from the project's Data/Raw folder.
df = pd.read_csv("../Data/Raw/Nepali_dataset.csv")

In [3]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_copy = df.copy(deep=True)

In [4]:
# Custom helper script that turns the dataset's "Sentiment" and "Polarity" output
# columns into a single "Target" class column.
from create_target_label import create_target_column

# Feed a fresh copy of the raw frame instead of `df_copy` itself: the returned frame no
# longer carries the "Sentiment"/"Polarity" columns, so passing `df_copy` back in would
# make this cell fail (or double-transform) when it is re-run.
# `label_encoder.classes_` holds the combined class names behind the "Target" codes.
df_copy, label_encoder = create_target_column(df.copy(), "Sentiment", "Polarity")

In [5]:
# Display the class names known to the label encoder as a plain list.
[*label_encoder.classes_]

['FEEDBACK_0',
 'FEEDBACK_1',
 'GENERAL_0',
 'GENERAL_1',
 'PROFANITY_0',
 'PROFANITY_1',
 'VIOLENCE_0',
 'VIOLENCE_1']

In [6]:
# Preview the first five rows to check the new "Target" column.
df_copy.head(n=5)

Unnamed: 0,Text,Aspect Term,Target
0,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...,जोगाउन को लागि,2
1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...,लखेटनु पछ,3
2,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !,ससकृती ध्वस्त पार्ने,3
3,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...,भुमाफिया,3
4,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...,बेची सके,3


## We created a single target label for each Sentiment/Polarity combination to simplify model training

In [7]:
# Drop the "Aspect Term" feature.
# The frame is reassigned rather than mutated with inplace=True, and errors="ignore"
# turns the drop into a no-op once the column is gone, so re-running this cell no longer
# raises KeyError.
df_copy = df_copy.drop(columns="Aspect Term", errors="ignore")

## Now we import a custom Python script that adds the embeddings to the dataframe


The script imported below supports generating embeddings for the dataset with word2vec, fastText, and GloVe, and also supports loading a model directly from pre-trained embeddings.

<i>TODO: The LASER and NepBERTa embeddings under consideration should also be generated through this script; add that functionality to the script.</i>

In [8]:
# Helpers from the project's embedding script: one loads a word2vec model from a
# pre-trained embeddings file, the other adds embeddings for a text column to a frame.
from generate_embeddings import (
    generate_word2vec_embeddings,
    load_word2vec_model,
)

# Load the pre-trained Nepali word2vec embeddings from Data/Embeddings.
pre_trained_word2vec = load_word2vec_model(
    "../Data/Embeddings/nepali_embeddings_word2vec.txt"
)

In [9]:
# Build a separate frame holding the word2vec embeddings of the "Text" column.
# A copy of `df_copy` is passed in so the labelled frame itself is not handed to the helper.
df_pretrained_word2vec = generate_word2vec_embeddings(
    df_copy.copy(),
    "Text",
    pre_trained_word2vec,
)

In [10]:
# Preview the first five rows of the embedded frame (embedding columns plus "Target").
df_pretrained_word2vec.head(n=5)

Unnamed: 0,Text,0,1,2,3,4,5,6,7,8,...,291,292,293,294,295,296,297,298,299,Target
0,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...,-0.353896,-0.362858,-0.423552,1.104782,-0.010077,0.141672,0.289903,0.133494,-0.097301,...,-1.099061,0.555198,-0.914626,0.487582,-0.510756,0.030198,0.281444,-0.032571,-0.526055,2
1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...,0.024839,-0.27619,-0.076623,0.665478,0.05623,0.578328,-0.23649,0.222671,-0.147569,...,-0.03652,0.926974,0.483738,0.741491,-0.771113,-0.622755,0.375458,0.073563,0.495352,3
2,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !,-0.714236,-1.034701,0.636945,1.421791,-0.247544,-0.153742,0.468619,-1.279188,0.328922,...,-0.715127,0.374132,-0.018418,0.93712,-0.91152,-0.015114,-0.662453,0.295455,0.262537,3
3,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...,-0.147426,0.211941,-0.482885,0.63793,0.10793,-0.186813,0.514862,0.093724,0.079478,...,-0.783017,-0.011488,-0.30262,0.428254,-0.037419,0.036183,0.024025,-0.149977,-0.402901,3
4,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...,-0.554735,0.216233,-0.209741,0.497239,-0.159377,0.180602,-0.095353,-0.048488,0.356057,...,-1.09241,-0.020246,0.343114,-0.085956,-0.928195,-0.355542,0.296502,0.076301,0.043207,3


In [12]:
# Save the embedded frame as a CSV under Data/Preprocessed so it is ready to use when
# training the classifiers.
#
# The raw "Text" column is removed first. The frame is reassigned rather than mutated
# with inplace=True, and errors="ignore" makes the drop a no-op when the column is
# already gone, so this cell can be re-run without raising KeyError.
df_pretrained_word2vec = df_pretrained_word2vec.drop(columns="Text", errors="ignore")

# NOTE(review): to_csv writes the row index as an extra leading column by default;
# confirm what downstream loaders expect before switching to index=False.
df_pretrained_word2vec.to_csv("../Data/Preprocessed/pretrained_word2vec_df.csv")