In [1]:
# The wildcard import stays first so that the explicit imports below win on any name clash.
from data_preprocessing import *

import os
from time import process_time

import numpy as np
from joblib import dump
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from model import ml_classifier_model, classification_report

In [2]:
# Load the Marathi stop-word list from the repository root.
# NOTE(review): stopword_list is not referenced by any later visible cell — confirm it is still needed.
stopword_list = get_stopwords("../marathi-stopwords.txt")

# Loading Data

In [2]:
# Read the training split via the project helper read_data; the recorded output shows
# `text` and `label` columns.
training_path = "../dataset/original-dataset/marathi-training-data.csv"
train_data = read_data(training_path)
# Bare name as the last expression renders the frame as the cell output.
train_data

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy
...,...,...
41992,"जरी आपण डेटा कूटबद्ध केला , तरीही हा मुख्य व्य...",cse
41993,"ते म्हणतात - "" ज्याला पाहण्यासाठी डोळे , ऎकण्य...",com_tech
41994,"प्रथम क्रोनोलॉजिकल , क्रॉनोलॉजी म्हणजे आपल्याल...",com_tech
41995,"त्या थोड्या तपशीलावर येईल , जेणेकरून संपूर्ण ग...",bioche


In [3]:
# Class distribution of the training split (number of rows per label).
train_data["label"].value_counts()

com_tech    17995
phy          9656
cse          9344
bioche       5002
Name: label, dtype: int64

In [4]:
# Read the validation split with the same project helper used for the training data.
val_path = "../dataset/original-dataset/marathi-validation-data.csv"
val_data = read_data(val_path)
# Bare name as the last expression renders the frame as the cell output.
val_data

Unnamed: 0,text,label
0,1 ची ओळ .,cse
1,"तर , ही एक टॉवर आहे जी टॉवरवर निश्चित केली जात...",phy
2,"तर , थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी...",cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शक...,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,"नंतर वृद्धत्व , व्हॉट मोठ्या प्रमाणात फ्रॉन 12...",bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,"तर , आपण ELF शीर्षलेखासह प्रारंभ करू .",cse
3778,"तर , त्या क्रॉस्टची जाडी आहे .",phy


In [5]:
# Class distribution of the validation split (number of rows per label).
val_data["label"].value_counts()

com_tech    1505
phy          970
cse          885
bioche       420
Name: label, dtype: int64

In [7]:
# Run clean_text over every sentence of both splits and extract texts/labels as plain lists.
x_train = train_data["text"].apply(clean_text).tolist()
y_train = train_data["label"].tolist()
x_val = val_data["text"].apply(clean_text).tolist()
y_val = val_data["label"].tolist()
# Sanity check: one line per list; texts and labels of a split should have equal length.
print(len(x_train), len(y_train), len(x_val), len(y_val), sep="\n")

41997
41997
3780
3780


In [8]:
# Encode the string labels of both splits with the project helper label_encoder.
# NOTE(review): y_train / y_val are rebound in place, so this cell is not idempotent —
# re-running it feeds the already-encoded labels back into label_encoder.
y_train, y_val = label_encoder(y_train, y_val)

# Computing the sentence representation

In [9]:
# Word-level count (bag-of-words) features: the helper returns the vectorizer plus the
# train and validation feature matrices (presumably fitted on x_train only — confirm).
bow_vectorizer, bow_x_train, bow_x_val = bow_vectorize(x_train, x_val, min_df=1)
# Vocabulary size, shown as the cell output.
len(bow_vectorizer.vocabulary_)

52566

In [10]:
# Character-level count features for both splits, built by the project helper.
char_bow_vectorizer, char_bow_x_train, char_bow_x_val = char_bow_vectorize(x_train, x_val, min_df=1)
# Vocabulary size, shown as the cell output.
len(char_bow_vectorizer.vocabulary_)

28981

In [11]:
# Word-level TF-IDF features for both splits, built by the project helper.
tfidf_vectorizer, tfidf_x_train, tfidf_x_val = tfidf_vectorize(x_train, x_val, min_df=1)
# Vocabulary size, shown as the cell output.
len(tfidf_vectorizer.vocabulary_)

52566

In [12]:
# N-gram TF-IDF features for both splits (the n-gram range is set inside the helper).
n_gram_tfidf_vectorizer, n_gram_tfidf_x_train, n_gram_tfidf_x_val = n_gram_tfidf_vectorize(x_train, x_val, min_df=1)
# Vocabulary size, shown as the cell output.
len(n_gram_tfidf_vectorizer.vocabulary_)

383099

In [13]:
# Character-level TF-IDF features for both splits, built by the project helper.
# NOTE(review): unlike the other four vectorizer calls, min_df is not passed here, so this
# relies on char_tfidf_vectorize's default — confirm it is equivalent to min_df=1.
char_tfidf_vectorizer, char_tfidf_x_train, char_tfidf_x_val = char_tfidf_vectorize(x_train, x_val)
# Vocabulary size, shown as the cell output.
len(char_tfidf_vectorizer.vocabulary_)

28981

In [14]:
# Persist the five vectorizers with joblib.
# joblib.dump does not create missing directories, so make sure the target exists first;
# otherwise this cell fails on a fresh checkout.
os.makedirs("../tokenizers", exist_ok=True)
dump(bow_vectorizer, "../tokenizers/bow-vectorizer.pkl")
dump(char_bow_vectorizer, "../tokenizers/char-bow-vectorizer.pkl")
dump(tfidf_vectorizer, "../tokenizers/tfidf-vectorizer.pkl")
dump(n_gram_tfidf_vectorizer, "../tokenizers/ngram-tfidf-vectorizer.pkl")
# Last expression: dump returns the list of written file names, shown as the cell output.
dump(char_tfidf_vectorizer, "../tokenizers/char-tfidf-vectorizer.pkl")

['../tokenizers/char-tfidf-vectorizer.pkl']

In [15]:
# Vocabulary of the word-level BoW vectorizer; it is passed to get_embedding_matrix
# when the pretrained embedding matrices are built.
vocab = bow_vectorizer.vocabulary_

# Ai4Bharat Indic-Fasttext Marathi Word Embeddings

In [16]:
# Build the embedding matrix for `vocab` from the Indic fastText Marathi vectors.
# The vector directory can be overridden with the WORD_EMBEDDINGS_DIR environment variable
# so the notebook is not tied to one machine; the default is the original location.
embedding_dir = os.environ.get("WORD_EMBEDDINGS_DIR", "/home/eastwind/word-embeddings/fasttext")
start = process_time()  # CPU time of this process, not wall-clock time
embedding_path1 = os.path.join(embedding_dir, "indicnlp.ft.mr.300.vec")
# 300 is passed as the vector size (matches the second axis of the recorded output shape).
embedding_matrix1 = get_embedding_matrix(embedding_path1, vocab, 300)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix1.shape

29680
Total time taken:  19.138324173999997


(52567, 300)

In [17]:
# Sentence embeddings from the Indic fastText matrix, using the count-vector features
# ('bow' mode of get_sentence_embedding) for the train and validation splits.
ft_bow_x_train, ft_bow_x_val = [
    get_sentence_embedding(embedding_matrix1, features, 'bow')
    for features in (bow_x_train, bow_x_val)
]
print(ft_bow_x_train.shape, ft_bow_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


In [18]:
# Sentence embeddings from the Indic fastText matrix, using the TF-IDF features
# ('tfidf' mode of get_sentence_embedding) for the train and validation splits.
ft_tfidf_x_train, ft_tfidf_x_val = [
    get_sentence_embedding(embedding_matrix1, features, 'tfidf')
    for features in (tfidf_x_train, tfidf_x_val)
]
print(ft_tfidf_x_train.shape, ft_tfidf_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


# Domain Specific fasttext Word Embeddings

In [36]:
# Build the embedding matrix for `vocab` from the domain-specific (TechDofication) fastText vectors.
# The vector directory can be overridden with the WORD_EMBEDDINGS_DIR environment variable
# so the notebook is not tied to one machine; the default is the original location.
embedding_dir = os.environ.get("WORD_EMBEDDINGS_DIR", "/home/eastwind/word-embeddings/fasttext")
start = process_time()  # CPU time of this process, not wall-clock time
embedding_path2 = os.path.join(embedding_dir, "TechDofication.mr.raw.ft.skipgram.d300.vec")
# 300 is passed as the vector size (matches the second axis of the recorded output shape).
embedding_matrix2 = get_embedding_matrix(embedding_path2, vocab, 300)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix2.shape

51246
Total time taken:  6.155927305000091


(52567, 300)

In [37]:
# Sentence embeddings from the domain-specific matrix, using the count-vector features
# ('bow' mode of get_sentence_embedding) for the train and validation splits.
ds_bow_x_train, ds_bow_x_val = [
    get_sentence_embedding(embedding_matrix2, features, 'bow')
    for features in (bow_x_train, bow_x_val)
]
print(ds_bow_x_train.shape, ds_bow_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


In [38]:
# Sentence embeddings from the domain-specific matrix, using the TF-IDF features
# ('tfidf' mode of get_sentence_embedding) for the train and validation splits.
ds_tfidf_x_train, ds_tfidf_x_val = [
    get_sentence_embedding(embedding_matrix2, features, 'tfidf')
    for features in (tfidf_x_train, tfidf_x_val)
]
print(ds_tfidf_x_train.shape, ds_tfidf_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


# Multinomial Naive Bayes

In [22]:
# Naive Bayes on Count Vectors
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
NB_bow, NB_bow_predictions = ml_classifier_model(MultinomialNB(),
                                                  bow_x_train, bow_x_val,
                                                  y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, NB_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8674603174603175

Precision:  [0.92038217 0.84232868 0.91625616 0.85273632]
Average Precision:  0.8829258314248659

Recall:  [0.68809524 0.92292359 0.84067797 0.88350515]
Average nRecall:  0.8338004867189939

F1-Score:  [0.78746594 0.8807863  0.87684148 0.8678481 ]
Average F1-Score:  0.8532354573502416


In [23]:
# Naive Bayes on character level Count Vectors
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
NB_char_bow, NB_char_bow_predictions = ml_classifier_model(MultinomialNB(),
                                                           char_bow_x_train, char_bow_x_val,
                                                           y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, NB_char_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8161375661375662

Precision:  [0.73860911 0.83279115 0.8200692  0.81960375]
Average Precision:  0.8027683055916047

Recall:  [0.73333333 0.85049834 0.80338983 0.81030928]
Average nRecall:  0.7993826952656888

F1-Score:  [0.73596177 0.84155161 0.81164384 0.81493002]
Average F1-Score:  0.8010218075426877


In [24]:
# Naive Bayes on TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
NB_tfidf, NB_tfidf_predictions = ml_classifier_model(MultinomialNB(),
                                                      tfidf_x_train, tfidf_x_val,
                                                      y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, NB_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.7716931216931217

Precision:  [1.         0.66696997 0.95761381 0.86934673]
Average Precision:  0.8734826302818676

Recall:  [0.3547619  0.97408638 0.68926554 0.71340206]
Average nRecall:  0.6828789705195701

F1-Score:  [0.52372583 0.79179044 0.80157687 0.78369196]
Average F1-Score:  0.7251962766868902


In [25]:
# Naive Bayes on n-gram TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
NB_ngram_tfidf, NB_ngram_predictions = ml_classifier_model(MultinomialNB(),
                                                           n_gram_tfidf_x_train, n_gram_tfidf_x_val,
                                                           y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, NB_ngram_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.6198412698412699

Precision:  [1.         0.52018093 0.96632124 0.90546218]
Average Precision:  0.8479910902238814

Recall:  [0.1047619  0.99335548 0.42146893 0.4443299 ]
Average nRecall:  0.49097905248759205

F1-Score:  [0.18965517 0.68280429 0.58693942 0.59612725]
Average F1-Score:  0.5138815327480567


In [26]:
# Naive Bayes on character-level TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
NB_char_tfidf, NB_char_predictions = ml_classifier_model(MultinomialNB(),
                                                         char_tfidf_x_train, char_tfidf_x_val,
                                                         y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, NB_char_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.7693121693121693

Precision:  [0.95652174 0.68279828 0.89985486 0.83780488]
Average Precision:  0.8442449385835413

Recall:  [0.41904762 0.94684385 0.70056497 0.70824742]
Average nRecall:  0.6936759668250105

F1-Score:  [0.58278146 0.79342984 0.78780178 0.76759777]
Average F1-Score:  0.7329027113305022


In [27]:
# Persist the five Naive Bayes models with joblib.
# joblib.dump does not create missing directories, so make sure the target exists first;
# otherwise this cell fails on a fresh checkout.
os.makedirs("../models/naive-bayes", exist_ok=True)
dump(NB_bow, "../models/naive-bayes/NB-bow.pkl")
dump(NB_char_bow, "../models/naive-bayes/NB-char-bow.pkl")
dump(NB_tfidf, "../models/naive-bayes/NB-tfidf.pkl")
dump(NB_ngram_tfidf, "../models/naive-bayes/NB-ngram-tfidf.pkl")
# Last expression: dump returns the list of written file names, shown as the cell output.
dump(NB_char_tfidf, "../models/naive-bayes/NB-char-tfidf.pkl")

['../models/naive-bayes/NB-char-tfidf.pkl']

# Linear SVC

## Statistical word representation approach

In [28]:
# Linear SVC on Count Vectors
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_bow, LSVC_bow_predictions = ml_classifier_model(LinearSVC(max_iter=2000),
                                                     bow_x_train, bow_x_val,
                                                     y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8576719576719577

Precision:  [0.84254144 0.85759695 0.87514451 0.84795918]
Average Precision:  0.8558105193285293

Recall:  [0.72619048 0.89634551 0.85536723 0.85670103]
Average nRecall:  0.8336510634267239

F1-Score:  [0.78005115 0.87654321 0.86514286 0.85230769]
Average F1-Score:  0.8435112275555583


In [30]:
# Linear SVC on character-level Count Vectors
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_char_bow, LSVC_char_bow_predictions = ml_classifier_model(LinearSVC(max_iter=5000),
                                                               char_bow_x_train, char_bow_x_val,
                                                               y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_char_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.861904761904762

Precision:  [0.80916031 0.87702265 0.87514318 0.84726522]
Average Precision:  0.8521478413412386

Recall:  [0.75714286 0.90033223 0.86327684 0.84639175]
Average nRecall:  0.8417859179479975

F1-Score:  [0.78228782 0.88852459 0.86916951 0.84682826]
Average F1-Score:  0.8467025464601541


In [31]:
# Linear SVC on TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_tfidf, LSVC_tfidf_predictions = ml_classifier_model(LinearSVC(),
                                                         tfidf_x_train, tfidf_x_val,
                                                         y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8817460317460317

Precision:  [0.88135593 0.871875   0.90592334 0.87668394]
Average Precision:  0.8839595537437399

Recall:  [0.74285714 0.9269103  0.88135593 0.87216495]
Average nRecall:  0.8558220806293658

F1-Score:  [0.80620155 0.89855072 0.89347079 0.8744186 ]
Average F1-Score:  0.868160417513612


In [32]:
# Linear SVC on n-gram TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_ngram_tfidf, LSVC_ngram_tfidf_predictions = ml_classifier_model(LinearSVC(),
                                                                     n_gram_tfidf_x_train, n_gram_tfidf_x_val,
                                                                     y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8727513227513227

Precision:  [0.89367816 0.85067319 0.89647059 0.88185654]
Average Precision:  0.8806696209634165

Recall:  [0.74047619 0.92358804 0.86101695 0.86185567]
Average nRecall:  0.8467342123997338

F1-Score:  [0.80989583 0.88563237 0.87838617 0.8717414 ]
Average F1-Score:  0.861413941191252


In [33]:
# Linear SVC on character-level TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_char_tfidf, LSVC_char_tfidf_predictions = ml_classifier_model(LinearSVC(),
                                                                   char_tfidf_x_train, char_tfidf_x_val,
                                                                   y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8878306878306879

Precision:  [0.9154519  0.87352759 0.89874858 0.89206349]
Average Precision:  0.8949478883453471

Recall:  [0.74761905 0.93621262 0.89265537 0.86907216]
Average nRecall:  0.8613898010959643

F1-Score:  [0.82306684 0.90378448 0.89569161 0.88041775]
Average F1-Score:  0.8757401707977434


## Indic-Word Embedding based approach

In [34]:
# Linear SVC on Count Vectors based indic fasttext word embeddings
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_ft_bow, LSVC_ft_bow_predictions = ml_classifier_model(LinearSVC(max_iter=10000),
                                                           ft_bow_x_train, ft_bow_x_val,
                                                           y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ft_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.792063492063492

Precision:  [0.80063291 0.78950507 0.80260047 0.78427205]
Average Precision:  0.794252625947511

Recall:  [0.60238095 0.87973422 0.76723164 0.76082474]
Average nRecall:  0.7525428880840439

F1-Score:  [0.6875     0.83218102 0.78451762 0.77237049]
Average F1-Score:  0.7691422811891593


In [35]:
# Linear SVC on TF-IDF based indic fasttext word embeddings
start = process_time()  # CPU time of this process, not wall-clock time
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_ft_tfidf, LSVC_ft_tfidf_predictions = ml_classifier_model(LinearSVC(max_iter=10000),
                                                               ft_tfidf_x_train, ft_tfidf_x_val,
                                                               y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ft_tfidf_predictions)
end = process_time()  # the timed interval covers the model call and the metric computation
print("Total time taken: ", end-start)
print("\nValidation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Total time taken:  454.692336331

Validation Accuracy:  0.7767195767195767

Precision:  [0.76708075 0.77982196 0.77267373 0.77813853]
Average Precision:  0.7744287414353981

Recall:  [0.58809524 0.8730897  0.74124294 0.74123711]
Average nRecall:  0.7359162475867713

F1-Score:  [0.66576819 0.82382445 0.75663206 0.7592397 ]
Average F1-Score:  0.7513661036001857


## Domain Specific Word Embedding based approach

In [39]:
# Linear SVC on Count Vectors based domain specific word embeddings
start = process_time()  # CPU time of this process, not wall-clock time
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_ds_bow, LSVC_ds_bow_predictions = ml_classifier_model(LinearSVC(max_iter=10000),
                                                           ds_bow_x_train, ds_bow_x_val,
                                                           y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ds_bow_predictions)
end = process_time()  # the timed interval covers the model call and the metric computation
print("Total time taken: ", end-start)
print("\nValidation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Total time taken:  127.31809967800007

Validation Accuracy:  0.8544973544973545

Precision:  [0.89189189 0.84051724 0.88056206 0.84210526]
Average Precision:  0.8637691143297567

Recall:  [0.70714286 0.90697674 0.84971751 0.84123711]
Average nRecall:  0.8262685572138149

F1-Score:  [0.78884462 0.87248322 0.86486486 0.84167096]
Average F1-Score:  0.8419659180674914


In [40]:
# Linear SVC on TF-IDF based domain specific word embeddings
start = process_time()  # CPU time of this process, not wall-clock time
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
LSVC_ds_tfidf, LSVC_ds_tfidf_predictions = ml_classifier_model(LinearSVC(max_iter=10000),
                                                               ds_tfidf_x_train, ds_tfidf_x_val,
                                                               y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ds_tfidf_predictions)
end = process_time()  # the timed interval covers the model call and the metric computation
print("Total time taken: ", end-start)
print("\nValidation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Total time taken:  663.0182165110001

Validation Accuracy:  0.8542328042328042

Precision:  [0.89877301 0.82891566 0.88690476 0.85429769]
Average Precision:  0.8672227811526673

Recall:  [0.69761905 0.91428571 0.84180791 0.84020619]
Average nRecall:  0.8234797142690731

F1-Score:  [0.78552279 0.86951027 0.86376812 0.84719335]
Average F1-Score:  0.8414986299753827




In [41]:
# Persist the nine Linear SVC models with joblib.
# joblib.dump does not create missing directories, so make sure the target exists first;
# otherwise this cell fails on a fresh checkout.
os.makedirs("../models/linear-svc", exist_ok=True)
dump(LSVC_bow, "../models/linear-svc/LSVC-bow.pkl")
dump(LSVC_char_bow, "../models/linear-svc/LSVC-char-bow.pkl")
dump(LSVC_tfidf, "../models/linear-svc/LSVC-tfidf.pkl")
dump(LSVC_ngram_tfidf, "../models/linear-svc/LSVC-ngram-tfidf.pkl")
dump(LSVC_char_tfidf, "../models/linear-svc/LSVC-char-tfidf.pkl")
dump(LSVC_ft_bow, "../models/linear-svc/LSVC-indic-bow.pkl")
dump(LSVC_ft_tfidf, "../models/linear-svc/LSVC-indic-tfidf.pkl")
dump(LSVC_ds_bow, "../models/linear-svc/LSVC-ds-bow.pkl")
# Last expression: dump returns the list of written file names, shown as the cell output.
dump(LSVC_ds_tfidf, "../models/linear-svc/LSVC-ds-tfidf.pkl")

['../models/linear-svc/LSVC-ds-tfidf.pkl']

# K-Nearest Neighbors

## Statistical word representation

In [42]:
# K-Nearest Neighbors on Count Vectors
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_bow, knn_bow_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                   bow_x_train, bow_x_val,
                                                   y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.4846560846560847

Precision:  [0.25619835 0.55561829 0.43559322 0.47159091]
Average Precision:  0.42975019282151494

Recall:  [0.22142857 0.6538206  0.29039548 0.51340206]
Average nRecall:  0.4197616778792187

F1-Score:  [0.23754789 0.6007326  0.34847458 0.49160908]
Average F1-Score:  0.4195910379147352


In [43]:
# K-Nearest Neighbors on character level Count Vectors
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_char_bow, knn_char_bow_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                             char_bow_x_train, char_bow_x_val,
                                                             y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_char_bow_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.6021164021164022

Precision:  [0.53149606 0.62104689 0.64050235 0.56303318]
Average Precision:  0.589019621293726

Recall:  [0.32142857 0.75681063 0.46101695 0.61237113]
Average nRecall:  0.537906821457742

F1-Score:  [0.40059347 0.68224019 0.53613666 0.58666667]
Average F1-Score:  0.5514092481093438


In [44]:
# K-Nearest Neighbors on TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_tfidf, knn_tfidf_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                       tfidf_x_train, tfidf_x_val,
                                                       y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.753968253968254

Precision:  [0.69920844 0.77891374 0.71384615 0.77816492]
Average Precision:  0.7425333149108697

Recall:  [0.63095238 0.80996678 0.78644068 0.69072165]
Average nRecall:  0.7295203714529142

F1-Score:  [0.66332916 0.79413681 0.7483871  0.73184052]
Average F1-Score:  0.7344233975868143


In [45]:
# K-Nearest Neighbors on n-gram TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_ngram_tfidf, knn_ngram_tfidf_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                                   n_gram_tfidf_x_train, n_gram_tfidf_x_val,
                                                                   y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.47433862433862434

Precision:  [0.79061372 0.91377091 0.31631154 0.75      ]
Average Precision:  0.692674041954507

Recall:  [0.52142857 0.4717608  0.97288136 0.00309278]
Average nRecall:  0.49229087705203056

F1-Score:  [0.62840746 0.62226117 0.47740505 0.00616016]
Average F1-Score:  0.43355846124226016


In [46]:
# K-Nearest Neighbors on character-level TF-IDF
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_char_tfidf, knn_char_tfidf_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                                 char_tfidf_x_train, char_tfidf_x_val,
                                                                 y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8113756613756614

Precision:  [0.81792717 0.82207792 0.81526549 0.78855975]
Average Precision:  0.8109575836309557

Recall:  [0.6952381  0.84119601 0.83276836 0.79587629]
Average nRecall:  0.7912696896922116

F1-Score:  [0.75160875 0.83152709 0.82392398 0.79220113]
Average F1-Score:  0.7998152384664572


## Indic-fasttext Word Embeddings based approach

In [47]:
# KNN on Indic fasttext embeddings (BoW)
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_bow_indic, knn_bow_indic_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                               ft_bow_x_train, ft_bow_x_val,
                                                               y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_bow_indic_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.7304232804232804

Precision:  [0.6313253  0.76489028 0.79207921 0.67638758]
Average Precision:  0.7161705933928695

Recall:  [0.62380952 0.81063123 0.63276836 0.74123711]
Average nRecall:  0.7021115570073467

F1-Score:  [0.62754491 0.78709677 0.70351759 0.70732907]
Average F1-Score:  0.7063720856630717


In [48]:
# KNN on Indic fasttext embeddings (TF-IDF)
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_tfidf_indic, knn_tfidf_indic_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=7),
                                                                   ft_tfidf_x_train, ft_tfidf_x_val,
                                                                   y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_indic_predictions)
print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.7444444444444445

Precision:  [0.63636364 0.80601093 0.6904277  0.75469613]
Average Precision:  0.7218745991241021

Recall:  [0.65       0.78405316 0.76610169 0.70412371]
Average nRecall:  0.72606964060041

F1-Score:  [0.64310954 0.79488043 0.72629888 0.72853333]
Average F1-Score:  0.7232055450729556


## Domain Specific Word Embeddings based approach

In [49]:
# KNN on Domain Specific fasttext embeddings (BoW)
# NOTE(review): n_neighbors=9 here, while every other KNN cell uses 7 — confirm this is
# intentional, since it confounds the comparison between embedding types.
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_bow_ds, knn_bow_ds_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=9),
                                                         ds_bow_x_train, ds_bow_x_val,
                                                         y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_bow_ds_predictions)

print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.796031746031746

Precision:  [0.81615599 0.79409957 0.9222395  0.72198089]
Average Precision:  0.8136189867842435

Recall:  [0.69761905 0.85847176 0.6700565  0.85670103]
Average nRecall:  0.7707120841298415

F1-Score:  [0.75224647 0.82503193 0.77617801 0.78359264]
Average F1-Score:  0.7842622634408278


In [50]:
# KNN on Domain Specific fasttext embeddings (TF-IDF)
# NOTE(review): n_neighbors=9 here, while every other KNN cell uses 7 — confirm this is
# intentional, since it confounds the comparison between embedding types.
# ml_classifier_model is the project helper; it is unpacked here as (model, validation predictions).
knn_tfidf_ds, knn_tfidf_ds_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=9),
                                                             ds_tfidf_x_train, ds_tfidf_x_val,
                                                             y_train, y_val)

# Project-level classification_report (from model), unpacked as accuracy + three per-class metrics.
acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_ds_predictions)

print("Validation Accuracy: ", acc)
# For each metric: the per-class values, then their unweighted mean over the classes.
for metric_name, per_class in (("Precision", precision), ("Recall", recall), ("F1-Score", f1)):
    print("\n" + metric_name + ": ", per_class)
    print("Average " + metric_name + ": ", np.mean(per_class))

Validation Accuracy:  0.8026455026455026

Precision:  [0.84294872 0.79176755 0.87664042 0.75426945]
Average Precision:  0.8164065355227534

Recall:  [0.62619048 0.86910299 0.75480226 0.81958763]
Average nRecall:  0.767420838744171

F1-Score:  [0.71857923 0.82863478 0.81117183 0.78557312]
Average F1-Score:  0.785989741230471
