In [1]:
import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from transformers import BertTokenizer, BertModel



In [2]:
# Read the comment corpus (Malayalam and Malayalam-English code-mixed text) into a DataFrame.
data = pd.read_csv("Dataset.csv")


In [3]:
# Preview the raw frame as loaded from the CSV.
data

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,Not_offensive
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
2,Cinema kanditum idakku idakku keri trailer kanund,Not_offensive
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
...,...,...
4492,```HIGH VOLTAGE INSIDE.. UNAUTHORIZED PERSONS ...,not-malayalam
4493,ഇത് നയൻതാരയ്ക്ക് വെച്ച റോൾ ആയിരുന്നില്ലേ Any...,Not_offensive
4494,open for online job for students,not-malayalam
4495,Like a tamil and Telugu movie,not-malayalam


In [4]:
# Check which columns the frame exposes.
data.columns

Index(['comments', 'labels'], dtype='object')

In [5]:
# Same preview as before; nothing has modified `data` since it was loaded.
data

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,Not_offensive
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
2,Cinema kanditum idakku idakku keri trailer kanund,Not_offensive
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
...,...,...
4492,```HIGH VOLTAGE INSIDE.. UNAUTHORIZED PERSONS ...,not-malayalam
4493,ഇത് നയൻതാരയ്ക്ക് വെച്ച റോൾ ആയിരുന്നില്ലേ Any...,Not_offensive
4494,open for online job for students,not-malayalam
4495,Like a tamil and Telugu movie,not-malayalam


In [6]:
# List the distinct label values; unique() also reports NaN if any row is unlabeled.
data['labels'].unique()

array(['Not_offensive', 'Offensive_Targeted_Insult_Individual',
       'Offensive_Targeted_Insult_Group', 'not-malayalam',
       'Offensive_Untargetede', nan], dtype=object)

In [7]:
# Class frequencies. value_counts() skips NaN by default, so unlabeled rows are not counted here.
data['labels'].value_counts()

Not_offensive                           1904
not-malayalam                           1572
Offensive_Targeted_Insult_Individual     499
Offensive_Untargetede                    305
Offensive_Targeted_Insult_Group          216
Name: labels, dtype: int64

In [8]:
# Count missing values per column before any cleaning.
data.isnull().sum()


comments    1
labels      1
dtype: int64

In [9]:
# Column check repeated; the frame is still unmodified at this point.
data.columns

Index(['comments', 'labels'], dtype='object')

In [10]:
# Mark non-Malayalam rows as missing so the following dropna() removes them together
# with the genuinely unlabeled row. np.nan is the supported spelling; the np.NaN alias
# was removed in NumPy 2.0 and raises AttributeError there.
data['labels'] = data['labels'].replace({"not-malayalam": np.nan})


In [11]:
# Re-count missing values: 'labels' now also includes the former 'not-malayalam' rows.
data.isnull().sum()

comments       1
labels      1573
dtype: int64

In [12]:
# Drop every row that has a missing comment or a missing label. This mutates `data`
# in place and does not reset the index, so original row numbers are preserved.
data.dropna(axis=0,how='any', inplace=True)

In [13]:
# Inspect the cleaned frame; the index keeps the original row numbers, so gaps are expected.
data

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,Not_offensive
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
2,Cinema kanditum idakku idakku keri trailer kanund,Not_offensive
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
...,...,...
4488,Dislike adicha thaayolikale adikendavark like ...,Offensive_Untargetede
4489,പ്രിത്വി ഞെട്ടിക്കൽ പരിപാടി നിർത്തുന്നില്ല അല്...,Not_offensive
4491,Ikkha Mess aanu #mamookkha #dq,Not_offensive
4493,ഇത് നയൻതാരയ്ക്ക് വെച്ച റോൾ ആയിരുന്നില്ലേ Any...,Not_offensive


In [14]:
# Collapse the labels to a binary target: 0 for "Not_offensive", 1 for every other label.
# Not idempotent: running this cell a second time would turn every label into 1.
data['labels'] = data['labels'].ne("Not_offensive").astype('int64')

In [15]:
# Confirm the labels column now holds the 0/1 target.
data.head()

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,0
1,Jobi jorge padam ano. Potti mone,1
2,Cinema kanditum idakku idakku keri trailer kanund,0
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,1
4,Muriyandikal pannikuutatam pole thurimezhukuva...,1


In [16]:
# Features are the raw comment strings; the target is the binary offensive flag.
X, y = data['comments'], data['labels']

In [17]:
# Hold out 20% of the comments for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [18]:
# Load the multilingual cased BERT checkpoint: the tokenizer and the matching encoder.
checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

In [19]:
# Fresh, independent accumulators for the per-comment mBERT vectors (filled by the loops below).
X_train_embeddings, X_test_embeddings = [], []

In [20]:
# Encode one training comment at a time and keep the token-averaged last hidden state
# as its feature vector. Re-running this cell appends duplicates, so re-create
# X_train_embeddings as an empty list first.
for comment in X_train:
    encoded = tokenizer(comment, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only; no gradients are needed
        hidden = model(**encoded)['last_hidden_state']
    X_train_embeddings.append(hidden.mean(dim=1).squeeze().numpy())

In [21]:
# Same pooling as for the training split: one test comment per forward pass, averaged
# over the token axis. Re-running this cell appends duplicates to X_test_embeddings.
for comment in X_test:
    encoded = tokenizer(comment, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only; no gradients are needed
        hidden = model(**encoded)['last_hidden_state']
    X_test_embeddings.append(hidden.mean(dim=1).squeeze().numpy())

In [22]:
# Turn the lists of per-comment vectors into 2-D NumPy feature matrices.
X_train_embeddings = np.asarray(X_train_embeddings)
X_test_embeddings = np.asarray(X_test_embeddings)

In [23]:
# Sanity check: the feature matrix should have exactly one row per training label.
print(X_train_embeddings.shape, y_train.shape)


(2339, 768) (2339,)


In [24]:
# Fit a decision tree on the token-averaged mBERT features for the binary task.
# random_state is pinned so the fitted tree is reproducible across runs.
d_classifier = DecisionTreeClassifier(random_state=42)
d_classifier.fit(X_train_embeddings, y_train)

DecisionTreeClassifier(random_state=42)

In [25]:
# Predict the binary label (0 = not offensive, 1 = offensive) for each held-out embedding.
y_pred = d_classifier.predict(X_test_embeddings)


In [26]:
# Score the held-out predictions: per-class precision/recall/F1 plus overall accuracy.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [27]:
# Show accuracy as a percentage, followed by the per-class breakdown.
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))
print("Classification Report:\n", report)

Accuracy: 66.84%
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74       375
           1       0.54      0.54      0.54       210

    accuracy                           0.67       585
   macro avg       0.64      0.64      0.64       585
weighted avg       0.67      0.67      0.67       585



In [28]:
# Persist the fitted binary tree to disk. Only the classifier is written; the mBERT
# tokenizer/encoder that produce its input features are not part of this file.
joblib.dump(d_classifier, 'decision_tree_model.joblib')


['decision_tree_model.joblib']

In [29]:
# Smoke test: embed one code-mixed comment the same way as the training data
# (token-averaged last hidden state) and ask the binary tree for its class.
note = ['ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി  അസുരൻ loading..']
encoded = tokenizer(note, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    hidden = model(**encoded)['last_hidden_state']

# Averaging over the token axis of a one-item batch leaves a single-row 2-D array,
# which is the layout predict() expects for one sample.
input_embedding = hidden.mean(dim=1).numpy()
predicted_label = d_classifier.predict(input_embedding)

print("Predicted Label:", predicted_label)

Predicted Label: [1]


In [30]:
# Side-by-side table of true vs. predicted classes, with the 0/1 codes spelled out.
binary_names = {0: 'not offensive', 1: 'offensive'}
results_df = pd.DataFrame({
    'Actual_Labels': pd.Series(y_test.values).map(binary_names),
    'Predicted_Labels': pd.Series(y_pred).map(binary_names),
})


In [31]:
# Compare actual and predicted classes for the first ten test rows.
results_df.head(10)


Unnamed: 0,Actual_Labels,Predicted_Labels
0,offensive,not offensive
1,not offensive,not offensive
2,offensive,offensive
3,offensive,not offensive
4,offensive,offensive
5,not offensive,not offensive
6,offensive,not offensive
7,not offensive,not offensive
8,not offensive,offensive
9,not offensive,offensive


# Classifying offensive comments by category of offensiveness

In [32]:
# Reload the raw CSV for the multi-class task: `data` was filtered and had its labels
# binarized in place above, so the original label strings are no longer available from it.
df = pd.read_csv(r"Dataset.csv")


In [33]:
# Preview the freshly loaded frame.
df

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,Not_offensive
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
2,Cinema kanditum idakku idakku keri trailer kanund,Not_offensive
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
...,...,...
4492,```HIGH VOLTAGE INSIDE.. UNAUTHORIZED PERSONS ...,not-malayalam
4493,ഇത് നയൻതാരയ്ക്ക് വെച്ച റോൾ ആയിരുന്നില്ലേ Any...,Not_offensive
4494,open for online job for students,not-malayalam
4495,Like a tamil and Telugu movie,not-malayalam


In [34]:
# List the distinct label values present in the reloaded frame.
print(df['labels'].unique())


['Not_offensive' 'Offensive_Targeted_Insult_Individual'
 'Offensive_Targeted_Insult_Group' 'not-malayalam' 'Offensive_Untargetede'
 nan]


In [35]:
# Drop rows with a missing comment or label (in place; the index is not reset).
df.dropna(axis=0,how='any', inplace=True)

In [36]:
# Only rows with missing values are gone so far; 'not-malayalam' rows are still present.
df

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,Not_offensive
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
2,Cinema kanditum idakku idakku keri trailer kanund,Not_offensive
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
...,...,...
4492,```HIGH VOLTAGE INSIDE.. UNAUTHORIZED PERSONS ...,not-malayalam
4493,ഇത് നയൻതാരയ്ക്ക് വെച്ച റോൾ ആയിരുന്നില്ലേ Any...,Not_offensive
4494,open for online job for students,not-malayalam
4495,Like a tamil and Telugu movie,not-malayalam


In [37]:
# Mark non-Malayalam rows as missing so the following dropna() removes them.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
df['labels'] = df['labels'].replace({"not-malayalam": np.nan})

In [38]:
# Remove the rows whose label was just set to NaN (the non-Malayalam comments).
df.dropna(axis=0,how='any', inplace=True)

In [39]:
# Frame after removing the non-Malayalam rows.
df

Unnamed: 0,comments,labels
0,December 12 inn katta waitingullar like poad,Not_offensive
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
2,Cinema kanditum idakku idakku keri trailer kanund,Not_offensive
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
...,...,...
4488,Dislike adicha thaayolikale adikendavark like ...,Offensive_Untargetede
4489,പ്രിത്വി ഞെട്ടിക്കൽ പരിപാടി നിർത്തുന്നില്ല അല്...,Not_offensive
4491,Ikkha Mess aanu #mamookkha #dq,Not_offensive
4493,ഇത് നയൻതാരയ്ക്ക് വെച്ച റോൾ ആയിരുന്നില്ലേ Any...,Not_offensive


In [40]:
# This section models only offensive comments, so mark "Not_offensive" rows as missing
# for the following dropna(). np.nan replaces the np.NaN alias removed in NumPy 2.0.
df['labels'] = df['labels'].replace({"Not_offensive": np.nan})

In [41]:
# Remove the rows whose label was just set to NaN (the non-offensive comments).
df.dropna(axis=0,how='any', inplace=True)

In [42]:
# Only offensive comments should remain at this point.
df

Unnamed: 0,comments,labels
1,Jobi jorge padam ano. Potti mone,Offensive_Targeted_Insult_Individual
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,Offensive_Targeted_Insult_Group
4,Muriyandikal pannikuutatam pole thurimezhukuva...,Offensive_Targeted_Insult_Group
5,Nth bhashayado ith vech ketti malabr bhasha sa...,Offensive_Targeted_Insult_Individual
9,ആരും കണ്ടേക്കരുതേ,Offensive_Targeted_Insult_Group
...,...,...
4451,Oompitharam alle mr. Producer.,Offensive_Targeted_Insult_Individual
4475,Mammuni thendikal ethra dislikes adichalum Raj...,Offensive_Targeted_Insult_Group
4476,Ennalum manassilakathadh eadh oolakal aane eee...,Offensive_Untargetede
4484,ഇവനൊന്നും ഇങ്ങനെയുള്ള പഠത്തിന് പറ്റൂല ഓഞ്ഞ ഒരു...,Offensive_Targeted_Insult_Individual


In [43]:
# Trim surrounding whitespace from the label strings before encoding them.
# NOTE(review): this runs after the exact-match filtering above, so a label such as
# " Not_offensive" (with stray whitespace) would not have been filtered out — confirm
# the raw file contains no such variants, or strip before filtering.
df['labels'] = df['labels'].str.strip()


In [44]:
# Encode the remaining label strings as integer class codes. Keep `lb` around so that
# predictions can be mapped back to their label strings with lb.inverse_transform().
# Not idempotent: re-running this cell would re-fit the encoder on the integer codes.
lb = LabelEncoder()
df["labels"] = lb.fit_transform(df["labels"])


In [45]:
# Class balance after encoding (counts per integer class code).
df['labels'].value_counts()

1    499
2    305
0    216
Name: labels, dtype: int64

In [46]:
# Inspect the frame with integer-encoded labels.
df

Unnamed: 0,comments,labels
1,Jobi jorge padam ano. Potti mone,1
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,0
4,Muriyandikal pannikuutatam pole thurimezhukuva...,0
5,Nth bhashayado ith vech ketti malabr bhasha sa...,1
9,ആരും കണ്ടേക്കരുതേ,0
...,...,...
4451,Oompitharam alle mr. Producer.,1
4475,Mammuni thendikal ethra dislikes adichalum Raj...,0
4476,Ennalum manassilakathadh eadh oolakal aane eee...,2
4484,ഇവനൊന്നും ഇങ്ങനെയുള്ള പഠത്തിന് പറ്റൂല ഓഞ്ഞ ഒരു...,1


In [47]:
# Look at a larger sample of the encoded rows.
df.head(30)

Unnamed: 0,comments,labels
1,Jobi jorge padam ano. Potti mone,1
3,ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി അ...,0
4,Muriyandikal pannikuutatam pole thurimezhukuva...,0
5,Nth bhashayado ith vech ketti malabr bhasha sa...,1
9,ആരും കണ്ടേക്കരുതേ,0
13,Iyyalkke rest cheyyanulla time aayilla.. kanda...,1
23,eda rithwik nine njan theriparjilaloo ennitt n...,2
31,Trailer Releasinu munne 6k dislike itta ella p...,2
48,Itinum dislike adikkan aalundallo...nallatine ...,2
51,Nalla assal moonjjal thank you lola,2


In [48]:
# Features are the raw comment strings; the target is the encoded offensive category.
X, y = df['comments'], df['labels']

In [49]:
# Hold out 20% of the offensive comments for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [50]:
# Load the multilingual cased BERT checkpoint (tokenizer + encoder) for the multi-class run.
checkpoint = 'bert-base-multilingual-cased'
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
bert_model = BertModel.from_pretrained(checkpoint)

In [51]:
# Fresh, independent accumulators for the per-comment mBERT vectors of the multi-class split.
X_train_embeddings, X_test_embeddings = [], []

In [52]:
# Encode one training comment at a time and keep the token-averaged last hidden state
# as its feature vector. Re-running this cell appends duplicates, so re-create
# X_train_embeddings as an empty list first.
for comment in X_train:
    encoded = bert_tokenizer(comment, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only; no gradients are needed
        hidden = bert_model(**encoded)['last_hidden_state']
    X_train_embeddings.append(hidden.mean(dim=1).squeeze().numpy())

In [53]:
# Same pooling as for the training split: one test comment per forward pass, averaged
# over the token axis. Re-running this cell appends duplicates to X_test_embeddings.
for comment in X_test:
    encoded = bert_tokenizer(comment, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():  # inference only; no gradients are needed
        hidden = bert_model(**encoded)['last_hidden_state']
    X_test_embeddings.append(hidden.mean(dim=1).squeeze().numpy())

In [54]:
# Turn the lists of per-comment vectors into 2-D NumPy feature matrices; the source
# lists are left untouched under their original names.
X_train_bert_embeddings = np.asarray(X_train_embeddings)
X_test_bert_embeddings = np.asarray(X_test_embeddings)

In [55]:
# Sanity check: the feature matrix should have exactly one row per training label.
print(X_train_bert_embeddings.shape, y_train.shape)


(816, 768) (816,)


In [56]:
# Fit the multi-class tree on the stacked NumPy matrix built and shape-checked above,
# rather than on the raw Python list it was derived from, so the validated object is
# the one the model is trained on. DecisionTreeClassifier is already imported earlier
# in the notebook, so the repeated import is dropped.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train_bert_embeddings, y_train)

DecisionTreeClassifier(random_state=42)

In [57]:
# Predict the encoded offensive category for each held-out comment, using the stacked
# NumPy test matrix (not the raw list) to stay consistent with the array conversion above.
y_pred = classifier.predict(X_test_bert_embeddings)


In [58]:
# Score the held-out predictions: per-class precision/recall/F1 plus overall accuracy.
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [59]:
# Show accuracy as a percentage, followed by the per-class breakdown.
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))
print("Classification Report:\n", report)

Accuracy: 60.78%
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.48      0.44        44
           1       0.71      0.70      0.71       100
           2       0.60      0.55      0.57        60

    accuracy                           0.61       204
   macro avg       0.58      0.58      0.57       204
weighted avg       0.62      0.61      0.61       204



In [60]:
# Smoke test: embed one code-mixed comment the same way as the training data
# (token-averaged last hidden state) and ask the multi-class tree for its class code.
note = ['ജിമിട്ടൊളികൾ ഇപ്പോളെ unlike അടിച്ചു തുടങ്ങി  അസുരൻ loading..']
encoded = bert_tokenizer(note, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    hidden = bert_model(**encoded)['last_hidden_state']

# Averaging over the token axis of a one-item batch leaves a single-row 2-D array,
# which is the layout predict() expects for one sample.
input_embedding = hidden.mean(dim=1).numpy()
predicted_label = classifier.predict(input_embedding)

print("Predicted Label:", predicted_label)

Predicted Label: [0]


In [61]:
# Side-by-side table of true vs. predicted categories. The label strings come from the
# fitted LabelEncoder (`lb`) instead of a hand-copied code->name dict, so the table
# cannot drift out of sync with the encoding if the label set ever changes.
results_df = pd.DataFrame({
    'Actual_Labels': lb.inverse_transform(y_test.values),
    'Predicted_Labels': lb.inverse_transform(y_pred),
})


In [62]:
# Compare actual and predicted categories for the first ten test rows.
results_df.head(10)

Unnamed: 0,Actual_Labels,Predicted_Labels
0,Offensive_Targeted_Insult_Individual,Offensive_Untargetede
1,Offensive_Targeted_Insult_Individual,Offensive_Targeted_Insult_Individual
2,Offensive_Targeted_Insult_Group,Offensive_Targeted_Insult_Group
3,Offensive_Targeted_Insult_Group,Offensive_Targeted_Insult_Individual
4,Offensive_Targeted_Insult_Individual,Offensive_Targeted_Insult_Individual
5,Offensive_Targeted_Insult_Individual,Offensive_Targeted_Insult_Individual
6,Offensive_Untargetede,Offensive_Untargetede
7,Offensive_Targeted_Insult_Individual,Offensive_Targeted_Insult_Group
8,Offensive_Targeted_Insult_Individual,Offensive_Targeted_Insult_Individual
9,Offensive_Targeted_Insult_Individual,Offensive_Targeted_Insult_Individual


In [63]:
# Persist the fitted multi-class tree to disk.
# NOTE(review): only the classifier is saved; the LabelEncoder `lb` is not persisted in
# this notebook, so anything loading this file needs the code->label mapping from elsewhere.
joblib.dump(classifier, 'decision_model.joblib')


['decision_model.joblib']