In [1]:
#library import
import pandas as pd # for data manipulation and analysis
import numpy as np # for scientific computing with Python

In [2]:
df= pd.read_csv('../Data/Raw/Nepali_dataset.csv')

In [3]:
df_copy= df.copy()

In [4]:
# Importing custom python script to transform Hate sentiment dataset's Output classes to single target class
from create_target_label import create_target_column

df_copy,label_encoder = create_target_column(df_copy, "Sentiment", "Polarity")

In [5]:
list(label_encoder.classes_)

['FEEDBACK_0',
 'FEEDBACK_1',
 'GENERAL_0',
 'GENERAL_1',
 'PROFANITY_0',
 'PROFANITY_1',
 'VIOLENCE_0',
 'VIOLENCE_1']

In [6]:
df_copy.head()

Unnamed: 0,Text,Aspect Term,Target
0,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...,जोगाउन को लागि,2
1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...,लखेटनु पछ,3
2,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !,ससकृती ध्वस्त पार्ने,3
3,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...,भुमाफिया,3
4,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...,बेची सके,3


## We created a single target for each Sentiment polarity to simplify Model training 

In [7]:
# Dropping the Aspect Term feature

df_copy.drop("Aspect Term", axis=1, inplace= True)

## Now we import a custom python script for returning the embeddings to the dataframe


The script imported below supports dataset's embeddings generation by word2vec, fasttext, glove while also supports loading the model by simply passing pre-trained embeddings. 

<i>LASER, NepBERTa embeddings that are being considered should support embeddings generation through the script, add the functionality in the script </i>

In [None]:
# Transforming the dataset to vectors generated by pre-trained Nepali word2vec

from generate_embeddings import load_word2vec_model, generate_word2vec_embeddings

# Loading the model with pre-trained embeddings
pre_trained_word2vec = load_word2vec_model("../Data/Embeddings/nepali_embeddings_word2vec.txt")

In [None]:
# Now creating a new df and generating embeddings for text data 

df_pretrained_word2vec = df_copy.copy()
df_pretrained_word2vec= generate_word2vec_embeddings(df_pretrained_word2vec, "Text", pre_trained_word2vec)

In [None]:
# Saving the df as csv inside /Data/Preprocessed for ready to use df while classifiers training 

df_pretrained_word2vec.drop("Text", axis=1, inplace= True)
df_pretrained_word2vec.to_csv("../Data/Preprocessed/pretrained_word2vec_df.csv", index= False)

Loading and Generate embedding for our own word2vec model

In [10]:

from generate_embeddings import load_word2vec_model, generate_word2vec_embeddings

word2vec = load_word2vec_model("../Data/Embeddings/word2vec_embeddings.txt")

df_word2vec= df_copy.copy()
df_word2vec = generate_word2vec_embeddings(df_word2vec, "Text", word2vec)

df_word2vec.drop("Text", axis=1, inplace= True)
df_word2vec.to_csv("../Data/Preprocessed/word2vec_df.csv", index= False)

## Fine-tuning pre-trained word2vec model

In [8]:
from generate_embeddings import load_word2vec_model, generate_word2vec_embeddings
from gensim.models import Word2Vec, KeyedVectors

# Loading the model with pre-trained embeddings
finetune_word2vec = load_word2vec_model("../Data/Embeddings/nepali_embeddings_word2vec.txt") 


In [9]:
# load_word2vec_model returns keyedVectors objects which needs to converted back to Word2Vec

finetune_word2vec_model = Word2Vec(vector_size=finetune_word2vec.vector_size, min_count=1)
finetune_word2vec_model.build_vocab([list(finetune_word2vec.key_to_index.keys())], update= False)  

finetune_word2vec_model.wv.vectors= finetune_word2vec.vectors
finetune_word2vec_model.wv.key_to_index = finetune_word2vec.key_to_index
finetune_word2vec_model.wv.index_to_key = finetune_word2vec.index_to_key

# initialize vectors_lockf for training
finetune_word2vec_model.vectors_lockf =  np.ones(len(finetune_word2vec_model.wv), dtype= np.float32)

In [10]:
# Tokenizing using nltk

import nltk
from nltk.tokenize import word_tokenize
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')

df_finetune_word2vec= df_copy.copy()

# Preprocess the text data
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    # Remove Nepali stop words
    # tokens = [token for token in tokens if token not in nepali_stopwords]
    # Apply stemming
    #tokens = [nepali_stemmer.stemWord(token) for token in tokens]
    return tokens

df_finetune_word2vec['Tokens']= df_finetune_word2vec['Text'].apply(preprocess_text)
df_finetune_word2vec.head()


[nltk_data] Downloading package punkt to /home/angel-
[nltk_data]     tamang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/angel-
[nltk_data]     tamang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,Text,Target,Tokens
0,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...,2,"[गुठी, विधेक, ल्याएर, ठमेल, मा, राज, गुठि, को,..."
1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...,3,"[दले, ले, देश, सकेछन, सबै, बेचे, र, खान, सुरू,..."
2,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !,3,"[नेपाल, को, ससकृती, ध्वस्त, पार्ने, योजना, हो,..."
3,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...,3,"[मठ, मन्दिर, गुम्बा, का, जग्गा, हरु, मा, भुमाफ..."
4,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...,3,"[नेपाल, का, कल, कर्खाना, र, नदि, नाला, बेची, स..."


In [11]:
sentences = df_finetune_word2vec['Tokens'].tolist()


In [12]:
# Updating vocalbulary and fine-tuning the model

finetune_word2vec_model.build_vocab(sentences, update= True)  # update True to handle out of vocab words in pre-trained model
finetune_word2vec_model.train(sentences, total_examples=len(sentences), epochs=5)

# Saving the model keyedVectors

finetune_word2vec_model.wv.save_word2vec_format("../Data/Embeddings/fintuned_word2vec_embeddings.txt", binary= False)

Actually did a typo up there, while saving embeddings. I will fix manually, moving on :)

In [15]:
# Now we resume to the original intention of this notebook, i.e. generating embeddings for the dataset

ft_word2vec = load_word2vec_model("../Data/Embeddings/finetuned_word2vec_embeddings.txt")

df_finetune_word2vec.drop("Tokens", axis=1, inplace= True)

df_ft_word2vec= generate_word2vec_embeddings(df_finetune_word2vec, "Text", ft_word2vec)

In [18]:
df_ft_word2vec.drop("Text", axis=1, inplace= True)
df_ft_word2vec.to_csv("../Data/Preprocessed/finetuned_word2vec_df.csv", index= False)

# Embeddings for new unbalanced raw dataset with finetuned word2vec

In [37]:


df_unbal= pd.read_csv("../Data/Raw/Nepali_dataset_unbal.csv")
df_unbal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2859 entries, 0 to 2858
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         2859 non-null   object
 1   Aspect Term  2859 non-null   object
 2   Sentiment    2859 non-null   object
 3   Polarity     2859 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 89.5+ KB


In [38]:
df_unbal.loc[:,"Sentiment":"Polarity"].value_counts()

Sentiment  Polarity
GENERAL    1           1358
           0            901
PROFANITY  0            250
VIOLENCE   1            159
           0            108
PROFANITY  1             83
Name: count, dtype: int64

In [39]:
# Importing custom python script to transform Hate sentiment dataset's Output classes to single target class
from create_target_label import create_target_column

df_unbal,label_encoder = create_target_column(df_unbal, "Sentiment", "Polarity")
df_unbal.head()

Unnamed: 0,Text,Aspect Term,Target
0,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...,जोगाउन को लागि,0
1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...,लखेटनु पछ,1
2,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !,ससकृती ध्वस्त पार्ने,1
3,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...,भुमाफिया,1
4,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...,बेची सके,1


In [49]:
list(label_encoder.classes_)

['GENERAL_0',
 'GENERAL_1',
 'PROFANITY_0',
 'PROFANITY_1',
 'VIOLENCE_0',
 'VIOLENCE_1']

In [50]:
# The classes and there mapping are

for class_name, encoded_value in zip(label_encoder.classes_, range(len(label_encoder.classes_))):
    print(f"{class_name}: {encoded_value}")

GENERAL_0: 0
GENERAL_1: 1
PROFANITY_0: 2
PROFANITY_1: 3
VIOLENCE_0: 4
VIOLENCE_1: 5


In [40]:
df_unbal.drop("Aspect Term", axis=1, inplace=True)
df_unbal_word2vec= generate_word2vec_embeddings(df_unbal, "Text", ft_word2vec)



In [41]:
df_unbal_word2vec.drop("Text", axis=1, inplace= True)
df_unbal_word2vec.to_csv("../Data/Preprocessed/unbal_word2vec.csv", index= False)

# Remove stopwords from this dataset


In [42]:
df_stopword= df_unbal.copy()
df_stopword.drop("word2vec_embeddings", axis=1, inplace= True)
df_stopword.head()

Unnamed: 0,Text,Target
0,गुठी विधेक ल्याएर ठमेल मा राज गुठि को जग्गा मा...,0
1,दले ले देश सकेछन सबै बेचे र खान सुरू गरेछन अब ...,1
2,नेपाल को ससकृती ध्वस्त पार्ने योजना हो यो !,1
3,मठ मन्दिर गुम्बा का जग्गा हरु मा भुमाफिया को न...,1
4,नेपाल का कल कर्खाना र नदि नाला बेची सके अब मठ ...,1


In [43]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
nepali_stopwords = stopwords.words('nepali')

[nltk_data] Downloading package punkt to /home/angel-
[nltk_data]     tamang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/angel-
[nltk_data]     tamang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Apply preprocessing: remove punctuation, stopwords removal and lowering to the 'Text' column
df_stopword['Text'] = (
    df_stopword['Text']
    .str.lower()               # Convert to lowercase
    .str.replace(f"[{string.punctuation}]", "")  # Remove punctuation
    .apply(lambda x: ' '.join([word for word in x.split() if word not in nepali_stopwords]))  # Remove Nepali stopwords
)

In [45]:
df_stopword.head()

Unnamed: 0,Text,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,0
1,दले देश सकेछन बेचे खान सुरू गरेछन दले लखेटनु पछ ।,1
2,नेपाल ससकृती ध्वस्त पार्ने योजना !,1
3,मठ मन्दिर गुम्बा जग्गा हरु भुमाफिया नजर परे हु...,1
4,नेपाल कल कर्खाना नदि नाला बेची सके मठ मन्दीर ब...,1


In [46]:
# Since tokenization, flattening is handled by our custom script, we'll do that through word2vec

df_stopword_word2vec= generate_word2vec_embeddings(df_stopword, "Text", ft_word2vec)

In [47]:
df_stopword_word2vec.head(1)

Unnamed: 0,Text,0,1,2,3,4,5,6,7,8,...,291,292,293,294,295,296,297,298,299,Target
0,गुठी विधेक ल्याएर ठमेल राज गुठि जग्गा छाया सेन...,-0.397011,-0.332576,-0.236242,0.253269,0.419496,0.379075,-0.523131,0.053369,-0.241984,...,-0.792762,0.933628,-0.525954,-0.087645,-0.115768,0.036328,0.341761,0.148474,-0.50575,0


In [48]:
df_stopword_word2vec.drop("Text", axis=1, inplace= True)
df_stopword_word2vec.to_csv("../Data/Preprocessed/unbal_word2vec.csv", index= False)