In [1]:
from data_preprocessing import *
from model import ml_classifier_model, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from time import process_time
# Bound explicitly (and after the star import) so `np` does not depend on what data_preprocessing happens to export.
import numpy as np

# Loading Data

In [2]:
# Load the training split; the path is relative to the notebook's working directory.
training_path = "../dataset/marathi-dataset/marathi-training-data.csv"
train_data = read_data(training_path)
# Bare last expression so the notebook renders a preview of the loaded table.
train_data

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy
...,...,...
41992,"जरी आपण डेटा कूटबद्ध केला , तरीही हा मुख्य व्य...",cse
41993,"ते म्हणतात - "" ज्याला पाहण्यासाठी डोळे , ऎकण्य...",com_tech
41994,"प्रथम क्रोनोलॉजिकल , क्रॉनोलॉजी म्हणजे आपल्याल...",com_tech
41995,"त्या थोड्या तपशीलावर येईल , जेणेकरून संपूर्ण ग...",bioche


In [3]:
# Number of training samples per label (shows how unbalanced the classes are).
train_data["label"].value_counts()

com_tech    17995
phy          9656
cse          9344
bioche       5002
Name: label, dtype: int64

In [4]:
# Load the validation split; the path is relative to the notebook's working directory.
val_path = "../dataset/marathi-dataset/marathi-validation-data.csv"
val_data = read_data(val_path)
# Bare last expression so the notebook renders a preview of the loaded table.
val_data

Unnamed: 0,text,label
0,1 ची ओळ .,cse
1,"तर , ही एक टॉवर आहे जी टॉवरवर निश्चित केली जात...",phy
2,"तर , थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी...",cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शक...,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,"नंतर वृद्धत्व , व्हॉट मोठ्या प्रमाणात फ्रॉन 12...",bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,"तर , आपण ELF शीर्षलेखासह प्रारंभ करू .",cse
3778,"तर , त्या क्रॉस्टची जाडी आहे .",phy


In [5]:
# Number of validation samples per label.
val_data["label"].value_counts()

com_tech    1505
phy          970
cse          885
bioche       420
Name: label, dtype: int64

In [7]:
# Run clean_text over every document and pull texts/labels out of the frames as plain Python lists.
x_train = train_data["text"].apply(clean_text).tolist()
y_train = train_data["label"].tolist()
x_val = val_data["text"].apply(clean_text).tolist()
y_val = val_data["label"].tolist()

# Sanity check: within each split the text and label lists must have the same length.
for split_values in (x_train, y_train, x_val, y_val):
    print(len(split_values))

41997
41997
3780
3780


In [9]:
# label_encoder returns three values: `le` plus replacements for y_train and y_val
# (presumably a fitted encoder and the encoded labels — confirm in data_preprocessing).
# NOTE: this overwrites y_train / y_val, so re-running the cell feeds the previous outputs back in.
le, y_train, y_val = label_encoder(y_train, y_val)

# Computing the BoW and TF-IDF representation

In [10]:
# Bag-of-words features: keep the vectorizer plus the train/validation matrices it produced.
bow_vectorizer, bow_x_train, bow_x_val = bow_vectorize(
    x_train,
    x_val,
    min_df=1,
)
# Size of the learned vocabulary.
len(bow_vectorizer.vocabulary_)

52566

In [11]:
# Character-level bag-of-words features: vectorizer plus train/validation matrices.
char_bow_vectorizer, char_bow_x_train, char_bow_x_val = char_bow_vectorize(
    x_train,
    x_val,
    min_df=1,
)
# Size of the learned vocabulary.
len(char_bow_vectorizer.vocabulary_)

28981

In [12]:
# TF-IDF features: vectorizer plus train/validation matrices.
tfidf_vectorizer, tfidf_x_train, tfidf_x_val = tfidf_vectorize(
    x_train,
    x_val,
    min_df=1,
)
# Size of the learned vocabulary.
len(tfidf_vectorizer.vocabulary_)

52566

In [13]:
# n-gram TF-IDF features: vectorizer plus train/validation matrices.
n_gram_tfidf_vectorizer, n_gram_tfidf_x_train, n_gram_tfidf_x_val = n_gram_tfidf_vectorize(
    x_train,
    x_val,
    min_df=1,
)
# Size of the learned vocabulary.
len(n_gram_tfidf_vectorizer.vocabulary_)

383099

In [14]:
# Character-level TF-IDF features: vectorizer plus train/validation matrices.
# min_df is left at char_tfidf_vectorize's default here, unlike the sibling cells that pass min_df=1.
char_tfidf_vectorizer, char_tfidf_x_train, char_tfidf_x_val = char_tfidf_vectorize(
    x_train,
    x_val,
)
# Size of the learned vocabulary.
len(char_tfidf_vectorizer.vocabulary_)

28981

In [15]:
# Persist every fitted vectorizer; each entry pairs the object with its target file.
vectorizer_artifacts = (
    (bow_vectorizer, "../tokenizers/bow-vectorizer-raw.pkl"),
    (char_bow_vectorizer, "../tokenizers/char-bow-vectorizer-raw.pkl"),
    (tfidf_vectorizer, "../tokenizers/tfidf-vectorizer-raw.pkl"),
    (n_gram_tfidf_vectorizer, "../tokenizers/ngram-tfidf-vectorizer-raw.pkl"),
    (char_tfidf_vectorizer, "../tokenizers/char-tfidf-vectorizer-raw.pkl"),
)
vectorizer_saved = [dump(fitted, target) for fitted, target in vectorizer_artifacts]
# dump() returns the list of files it wrote; display the last one as the cell result.
vectorizer_saved[-1]

['../tokenizers/char-tfidf-vectorizer-raw.pkl']

In [16]:
# Vocabulary of the BoW vectorizer; passed to get_embedding_matrix in the embedding cells below.
vocab = bow_vectorizer.vocabulary_

# Ai4Bharat Indic-Fasttext Marathi Word Embeddings

In [16]:
# Time the embedding-matrix construction. process_time() counts CPU time of this process, not wall-clock time.
start = process_time()
# NOTE(review): absolute, machine-specific path — this cell will not run on another machine without editing it.
embedding_path1 = "/home/eastwind/word-embeddings/fasttext/indicnlp.ft.mr.300.vec"
# Third argument is 300 — presumably the embedding dimension (the recorded shape has 300 columns).
embedding_matrix1 = get_embedding_matrix(embedding_path1, vocab, 300)
end = process_time()
print("Total time taken: ", end-start)
# Bare last expression so the notebook displays the matrix shape.
embedding_matrix1.shape

29155
Total time taken:  11.362035446999997


(51800, 300)

In [17]:
# Sentence-level features from the Indic fastText matrix and the BoW matrices ('bow' mode), train then validation.
ft_bow_x_train, ft_bow_x_val = [
    get_sentence_embedding(embedding_matrix1, doc_term_matrix, 'bow')
    for doc_term_matrix in (bow_x_train, bow_x_val)
]
for sentence_vectors in (ft_bow_x_train, ft_bow_x_val):
    print(sentence_vectors.shape)

(41997, 300)
(3780, 300)


In [18]:
# Sentence-level features from the Indic fastText matrix and the TF-IDF matrices ('tfidf' mode), train then validation.
ft_tfidf_x_train, ft_tfidf_x_val = [
    get_sentence_embedding(embedding_matrix1, doc_term_matrix, 'tfidf')
    for doc_term_matrix in (tfidf_x_train, tfidf_x_val)
]
for sentence_vectors in (ft_tfidf_x_train, ft_tfidf_x_val):
    print(sentence_vectors.shape)

(41997, 300)
(3780, 300)


# Domain Specific fasttext Word Embeddings

In [19]:
# Time the embedding-matrix construction. process_time() counts CPU time of this process, not wall-clock time.
start = process_time()
# NOTE(review): absolute, machine-specific path — this cell will not run on another machine without editing it.
embedding_path2 = "/home/eastwind/word-embeddings/fasttext/TechDofication.mr.cleaned.ft.skipgram.d300.vec"
# Third argument is 300 — presumably the embedding dimension (the recorded shape has 300 columns).
embedding_matrix2 = get_embedding_matrix(embedding_path2, vocab, 300)
end = process_time()
print("Total time taken: ", end-start)
# Bare last expression so the notebook displays the matrix shape.
embedding_matrix2.shape

51795
Total time taken:  4.662203094000006


(51800, 300)

In [20]:
# Sentence-level features from the domain-specific matrix and the BoW matrices ('bow' mode), train then validation.
ds_bow_x_train, ds_bow_x_val = [
    get_sentence_embedding(embedding_matrix2, doc_term_matrix, 'bow')
    for doc_term_matrix in (bow_x_train, bow_x_val)
]
for sentence_vectors in (ds_bow_x_train, ds_bow_x_val):
    print(sentence_vectors.shape)

(41997, 300)
(3780, 300)


In [21]:
# Sentence-level features from the domain-specific matrix and the TF-IDF matrices ('tfidf' mode), train then validation.
ds_tfidf_x_train, ds_tfidf_x_val = [
    get_sentence_embedding(embedding_matrix2, doc_term_matrix, 'tfidf')
    for doc_term_matrix in (tfidf_x_train, tfidf_x_val)
]
for sentence_vectors in (ds_tfidf_x_train, ds_tfidf_x_val):
    print(sentence_vectors.shape)

(41997, 300)
(3780, 300)


# Multinomial Naive Bayes

In [17]:
# Multinomial Naive Bayes on count vectors.
NB_bow, NB_bow_predictions = ml_classifier_model(
    MultinomialNB(), bow_x_train, bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.8674603174603175

Precision:  [0.92038217 0.84232868 0.91625616 0.85273632]
Average Precision:  0.8829258314248659

Recall:  [0.68809524 0.92292359 0.84067797 0.88350515]
Average nRecall:  0.8338004867189939

F1-Score:  [0.78746594 0.8807863  0.87684148 0.8678481 ]
Average F1-Score:  0.8532354573502416


In [18]:
# Multinomial Naive Bayes on character-level count vectors.
NB_char_bow, NB_char_bow_predictions = ml_classifier_model(
    MultinomialNB(), char_bow_x_train, char_bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_char_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.8161375661375662

Precision:  [0.73860911 0.83279115 0.8200692  0.81960375]
Average Precision:  0.8027683055916047

Recall:  [0.73333333 0.85049834 0.80338983 0.81030928]
Average nRecall:  0.7993826952656888

F1-Score:  [0.73596177 0.84155161 0.81164384 0.81493002]
Average F1-Score:  0.8010218075426877


In [19]:
# Multinomial Naive Bayes on TF-IDF features.
NB_tfidf, NB_tfidf_predictions = ml_classifier_model(
    MultinomialNB(), tfidf_x_train, tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.7716931216931217

Precision:  [1.         0.66696997 0.95761381 0.86934673]
Average Precision:  0.8734826302818676

Recall:  [0.3547619  0.97408638 0.68926554 0.71340206]
Average nRecall:  0.6828789705195701

F1-Score:  [0.52372583 0.79179044 0.80157687 0.78369196]
Average F1-Score:  0.7251962766868902


In [20]:
# Multinomial Naive Bayes on n-gram TF-IDF features.
NB_ngram_tfidf, NB_ngram_predictions = ml_classifier_model(
    MultinomialNB(), n_gram_tfidf_x_train, n_gram_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_ngram_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.6198412698412699

Precision:  [1.         0.52018093 0.96632124 0.90546218]
Average Precision:  0.8479910902238814

Recall:  [0.1047619  0.99335548 0.42146893 0.4443299 ]
Average nRecall:  0.49097905248759205

F1-Score:  [0.18965517 0.68280429 0.58693942 0.59612725]
Average F1-Score:  0.5138815327480567


In [21]:
# Multinomial Naive Bayes on character-level TF-IDF features.
NB_char_tfidf, NB_char_predictions = ml_classifier_model(
    MultinomialNB(), char_tfidf_x_train, char_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_char_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.7693121693121693

Precision:  [0.95652174 0.68279828 0.89985486 0.83780488]
Average Precision:  0.8442449385835413

Recall:  [0.41904762 0.94684385 0.70056497 0.70824742]
Average nRecall:  0.6936759668250105

F1-Score:  [0.58278146 0.79342984 0.78780178 0.76759777]
Average F1-Score:  0.7329027113305022


In [22]:
# Persist every fitted Naive Bayes model; each entry pairs the model with its target file.
nb_artifacts = (
    (NB_bow, "../models/naive-bayes/NB-bow-raw.pkl"),
    (NB_char_bow, "../models/naive-bayes/NB-char-bow-raw.pkl"),
    (NB_tfidf, "../models/naive-bayes/NB-tfidf-raw.pkl"),
    (NB_ngram_tfidf, "../models/naive-bayes/NB-ngram-tfidf-raw.pkl"),
    (NB_char_tfidf, "../models/naive-bayes/NB-char-tfidf-raw.pkl"),
)
nb_saved = [dump(fitted, target) for fitted, target in nb_artifacts]
# dump() returns the list of files it wrote; display the last one as the cell result.
nb_saved[-1]

['../models/naive-bayes/NB-char-tfidf-raw.pkl']

# Linear SVC

## Statistical word representation approach

In [23]:
# Linear SVC (iteration cap raised to 2000) on count vectors.
LSVC_bow, LSVC_bow_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000), bow_x_train, bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.8576719576719577

Precision:  [0.84254144 0.85759695 0.87514451 0.84795918]
Average Precision:  0.8558105193285293

Recall:  [0.72619048 0.89634551 0.85536723 0.85670103]
Average nRecall:  0.8336510634267239

F1-Score:  [0.78005115 0.87654321 0.86514286 0.85230769]
Average F1-Score:  0.8435112275555583


In [24]:
# Linear SVC (iteration cap raised to 2000) on character-level count vectors.
LSVC_char_bow, LSVC_char_bow_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000), char_bow_x_train, char_bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_char_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.861904761904762

Precision:  [0.80916031 0.8771022  0.87514318 0.84710744]
Average Precision:  0.8521282817513447

Recall:  [0.75714286 0.90099668 0.86327684 0.84536082]
Average nRecall:  0.8416942989460453

F1-Score:  [0.78228782 0.88888889 0.86916951 0.84623323]
Average F1-Score:  0.8466448631772533




In [25]:
# Linear SVC (default settings) on TF-IDF features.
LSVC_tfidf, LSVC_tfidf_predictions = ml_classifier_model(
    LinearSVC(), tfidf_x_train, tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.8817460317460317

Precision:  [0.88135593 0.871875   0.90592334 0.87668394]
Average Precision:  0.8839595537437399

Recall:  [0.74285714 0.9269103  0.88135593 0.87216495]
Average nRecall:  0.8558220806293658

F1-Score:  [0.80620155 0.89855072 0.89347079 0.8744186 ]
Average F1-Score:  0.868160417513612


In [26]:
# Linear SVC (default settings) on n-gram TF-IDF features.
LSVC_ngram_tfidf, LSVC_ngram_tfidf_predictions = ml_classifier_model(
    LinearSVC(), n_gram_tfidf_x_train, n_gram_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.8727513227513227

Precision:  [0.89367816 0.85067319 0.89647059 0.88185654]
Average Precision:  0.8806696209634165

Recall:  [0.74047619 0.92358804 0.86101695 0.86185567]
Average nRecall:  0.8467342123997338

F1-Score:  [0.80989583 0.88563237 0.87838617 0.8717414 ]
Average F1-Score:  0.861413941191252


In [27]:
# Linear SVC (default settings) on character-level TF-IDF features.
LSVC_char_tfidf, LSVC_char_tfidf_predictions = ml_classifier_model(
    LinearSVC(), char_tfidf_x_train, char_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.8878306878306879

Precision:  [0.9154519  0.87352759 0.89874858 0.89206349]
Average Precision:  0.8949478883453471

Recall:  [0.74761905 0.93621262 0.89265537 0.86907216]
Average nRecall:  0.8613898010959643

F1-Score:  [0.82306684 0.90378448 0.89569161 0.88041775]
Average F1-Score:  0.8757401707977434


## Indic-Word Embedding based approach

In [34]:
# Linear SVC (iteration cap raised to 5000) on BoW-based Indic fastText sentence embeddings.
LSVC_ft_bow, LSVC_ft_bow_predictions = ml_classifier_model(
    LinearSVC(max_iter=5000), ft_bow_x_train, ft_bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ft_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))



Validation Accuracy:  0.7775132275132275

Precision:  [0.78594249 0.77227139 0.78103044 0.78104575]
Average Precision:  0.780072518760579

Recall:  [0.58571429 0.86976744 0.75367232 0.73917526]
Average nRecall:  0.7370823254227226

F1-Score:  [0.67121419 0.818125   0.76710753 0.7595339 ]
Average F1-Score:  0.7539951549093648


In [35]:
# Linear SVC (iteration cap raised to 2000) on TF-IDF-based Indic fastText sentence embeddings.
# The timer brackets both training and scoring; process_time() counts CPU time, not wall-clock time.
start = process_time()
LSVC_ft_tfidf, LSVC_ft_tfidf_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000), ft_tfidf_x_train, ft_tfidf_x_val, y_train, y_val
)
acc, precision, recall, f1 = classification_report(y_val, LSVC_ft_tfidf_predictions)
end = process_time()

print("Total time taken: ", end - start)
print("\nValidation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Total time taken:  199.24928511500002

Validation Accuracy:  0.7735449735449735

Precision:  [0.76380368 0.77008798 0.77842907 0.77901786]
Average Precision:  0.7728346471302543

Recall:  [0.59285714 0.87242525 0.75028249 0.71958763]
Average nRecall:  0.733788126692066

F1-Score:  [0.66756032 0.81806854 0.76409666 0.74812433]
Average F1-Score:  0.7494624626225255




## Domain Specific Word Embedding based approach

In [36]:
# Linear SVC (iteration cap raised to 2000) on BoW-based domain-specific sentence embeddings.
# The timer brackets both training and scoring; process_time() counts CPU time, not wall-clock time.
start = process_time()
LSVC_ds_bow, LSVC_ds_bow_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000), ds_bow_x_train, ds_bow_x_val, y_train, y_val
)
acc, precision, recall, f1 = classification_report(y_val, LSVC_ds_bow_predictions)
end = process_time()

print("Total time taken: ", end - start)
print("\nValidation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Total time taken:  144.21333912900002

Validation Accuracy:  0.8452380952380952

Precision:  [0.85087719 0.82685298 0.88809524 0.83718487]
Average Precision:  0.8507525704852511

Recall:  [0.69285714 0.90431894 0.84293785 0.82164948]
Average nRecall:  0.8154408543444116

F1-Score:  [0.76377953 0.86385275 0.86492754 0.82934443]
Average F1-Score:  0.8304760604584049




In [37]:
# Linear SVC (iteration cap raised to 2000) on TF-IDF-based domain-specific sentence embeddings.
# The timer brackets both training and scoring; process_time() counts CPU time, not wall-clock time.
start = process_time()
LSVC_ds_tfidf, LSVC_ds_tfidf_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000), ds_tfidf_x_train, ds_tfidf_x_val, y_train, y_val
)
acc, precision, recall, f1 = classification_report(y_val, LSVC_ds_tfidf_predictions)
end = process_time()

print("Total time taken: ", end - start)
print("\nValidation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Total time taken:  147.42722584299997

Validation Accuracy:  0.8494708994708995

Precision:  [0.88588589 0.81932021 0.89366786 0.85101822]
Average Precision:  0.8624730456894688

Recall:  [0.70238095 0.91295681 0.84519774 0.8185567 ]
Average nRecall:  0.819773051039026

F1-Score:  [0.78353254 0.86360779 0.86875726 0.83447189]
Average F1-Score:  0.8375923689642764




In [38]:
# Persist every fitted Linear SVC model; each entry pairs the model with its target file.
lsvc_artifacts = (
    (LSVC_bow, "../models/linear-svc/LSVC-bow-raw.pkl"),
    (LSVC_char_bow, "../models/linear-svc/LSVC-char-bow-raw.pkl"),
    (LSVC_tfidf, "../models/linear-svc/LSVC-tfidf-raw.pkl"),
    (LSVC_ngram_tfidf, "../models/linear-svc/LSVC-ngram-tfidf-raw.pkl"),
    (LSVC_char_tfidf, "../models/linear-svc/LSVC-char-tfidf-raw.pkl"),
    (LSVC_ft_bow, "../models/linear-svc/LSVC-indic-bow-raw.pkl"),
    (LSVC_ft_tfidf, "../models/linear-svc/LSVC-indic-tfidf-raw.pkl"),
    (LSVC_ds_bow, "../models/linear-svc/LSVC-ds-bow-raw.pkl"),
    (LSVC_ds_tfidf, "../models/linear-svc/LSVC-ds-tfidf-raw.pkl"),
)
lsvc_saved = [dump(fitted, target) for fitted, target in lsvc_artifacts]
# dump() returns the list of files it wrote; display the last one as the cell result.
lsvc_saved[-1]

['../models/linear-svc/LSVC-ds-tfidf.pkl']

# K-Nearest Neighbors

## Statistical word representation

In [44]:
# K-Nearest Neighbors (k=7) on count vectors.
knn_bow, knn_bow_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), bow_x_train, bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.4714285714285714

Precision:  [0.28853755 0.56741214 0.5511811  0.40512048]
Average Precision:  0.4530628185680275

Recall:  [0.34761905 0.59003322 0.23728814 0.55463918]
Average nRecall:  0.43239489526534053

F1-Score:  [0.31533477 0.57850163 0.33175355 0.46823325]
Average F1-Score:  0.42345580067153477


In [45]:
# K-Nearest Neighbors (k=7) on character-level count vectors.
knn_char_bow, knn_char_bow_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), char_bow_x_train, char_bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_char_bow_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.6060846560846561

Precision:  [0.55483871 0.61638633 0.68630573 0.55255255]
Average Precision:  0.6025208303388485

Recall:  [0.40952381 0.75481728 0.48700565 0.56907216]
Average nRecall:  0.5551047249843214

F1-Score:  [0.47123288 0.6786141  0.56972902 0.56069071]
Average F1-Score:  0.5700666739562386


In [46]:
# K-Nearest Neighbors (k=7) on TF-IDF features.
knn_tfidf, knn_tfidf_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), tfidf_x_train, tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.344973544973545

Precision:  [0.15348837 0.44532131 0.625      0.28912876]
Average Precision:  0.37823460963647915

Recall:  [0.23571429 0.52491694 0.04519774 0.38659794]
Average nRecall:  0.29810672687330114

F1-Score:  [0.18591549 0.48185422 0.08429926 0.3308337 ]
Average F1-Score:  0.27072567002856734


In [47]:
# K-Nearest Neighbors (k=7) on n-gram TF-IDF features.
knn_ngram_tfidf, knn_ngram_tfidf_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), n_gram_tfidf_x_train, n_gram_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.34973544973544973

Precision:  [0.09322034 0.41794514 0.4375     0.28434974]
Average Precision:  0.30825380590342877

Recall:  [0.05238095 0.59734219 0.01581921 0.39896907]
Average nRecall:  0.26612785656911975

F1-Score:  [0.06707317 0.49179431 0.03053435 0.33204633]
Average F1-Score:  0.23036204116129455


In [48]:
# K-Nearest Neighbors (k=7) on character-level TF-IDF features.
knn_char_tfidf, knn_char_tfidf_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), char_tfidf_x_train, char_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.40185185185185185

Precision:  [0.21035599 0.48212157 0.84137931 0.3420463 ]
Average Precision:  0.4689757934781863

Recall:  [0.30952381 0.53754153 0.13785311 0.47216495]
Average nRecall:  0.3642708483903133

F1-Score:  [0.2504817  0.50832548 0.2368932  0.39670853]
Average F1-Score:  0.34810222759790443


## Indic-fasttext Word Embeddings based approach

In [50]:
# K-Nearest Neighbors (k=7) on BoW-based Indic fastText sentence embeddings.
knn_bow_indic, knn_bow_indic_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), ft_bow_x_train, ft_bow_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_bow_indic_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.7431216931216931

Precision:  [0.66582915 0.76613886 0.7979798  0.70009551]
Average Precision:  0.7325108274367539

Recall:  [0.63095238 0.8358804  0.62485876 0.7556701 ]
Average nRecall:  0.7118404099446018

F1-Score:  [0.64792176 0.79949158 0.7008872  0.72682201]
Average F1-Score:  0.7187806378873858


In [52]:
# K-Nearest Neighbors (k=7) on TF-IDF-based Indic fastText sentence embeddings.
knn_tfidf_indic, knn_tfidf_indic_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=7), ft_tfidf_x_train, ft_tfidf_x_val, y_train, y_val
)

# Overall accuracy first, then each metric array followed by its np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_indic_predictions)
print("Validation Accuracy: ", acc)
for heading, mean_heading, scores in (
    ("\nPrecision: ", "Average Precision: ", precision),
    ("\nRecall: ", "Average nRecall: ", recall),
    ("\nF1-Score: ", "Average F1-Score: ", f1),
):
    print(heading, scores)
    print(mean_heading, np.mean(scores))

Validation Accuracy:  0.746031746031746

Precision:  [0.62276786 0.79260238 0.6884273  0.79925651]
Average Precision:  0.7257635100573656

Recall:  [0.66428571 0.79734219 0.78644068 0.66494845]
Average nRecall:  0.7282542596377733

F1-Score:  [0.64285714 0.79496522 0.73417722 0.7259426 ]
Average F1-Score:  0.7244855445515201


## Domain Specific Word Embeddings based approach

In [58]:
# KNN on domain-specific fastText embeddings (BoW): sweep the neighbourhood size k.
# For every k the fitted model, validation accuracy and F1 array are collected in parallel lists.
index = [3, 5, 7, 9, 11, 13, 15]
model, accuracy, f1_score = [], [], []
for i in index:
    knn_bow_ds, knn_bow_ds_predictions = ml_classifier_model(
        KNeighborsClassifier(n_neighbors=i), ds_bow_x_train, ds_bow_x_val, y_train, y_val
    )
    acc, precision, recall, f1 = classification_report(y_val, knn_bow_ds_predictions)

    model.append(knn_bow_ds)
    accuracy.append(acc)
    f1_score.append(f1)

In [62]:
# One summary line per swept k: validation accuracy and the mean of the F1 array.
sweep_line = "Neighbors: {}, accuracy: {}, f11-score: {}"
for neighbors, val_accuracy, f1_values in zip(index, accuracy, f1_score):
    print(sweep_line.format(neighbors, val_accuracy, np.mean(f1_values)))

Neighbors: 3, accuracy: 0.7706349206349207, f11-score: 0.7564729241458641
Neighbors: 5, accuracy: 0.785978835978836, f11-score: 0.779420937691388
Neighbors: 7, accuracy: 0.7854497354497354, f11-score: 0.7782878810166497
Neighbors: 9, accuracy: 0.7894179894179895, f11-score: 0.7821699529626662
Neighbors: 11, accuracy: 0.785978835978836, f11-score: 0.7791622929531717
Neighbors: 13, accuracy: 0.7828042328042328, f11-score: 0.7737338880112158
Neighbors: 15, accuracy: 0.782010582010582, f11-score: 0.7718467622580447


In [None]:
# KNN on Domain Specific fasttext embeddings (TF-IDF)
# Sweep the neighbourhood size k. For every k the fitted model, validation accuracy and
# F1 array are collected in parallel lists (same layout as `index`).
index = [3, 5, 7, 9, 11, 13, 15]
model = []
accuracy = []
f1_score = []
for i in index:
    # Use the swept value `i`; a fixed n_neighbors would train the same model on every iteration.
    knn_tfidf_ds, knn_tfidf_ds_predictions = ml_classifier_model(KNeighborsClassifier(n_neighbors=i),
                                                                 ds_tfidf_x_train, ds_tfidf_x_val,
                                                                 y_train, y_val)

    acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_ds_predictions)

    # Store the model fitted in this iteration (not the leftover model from the BoW sweep).
    model.append(knn_tfidf_ds)
    accuracy.append(acc)
    f1_score.append(f1)

    # Progress marker: each fit on the dense embeddings is slow.
    print("Done: ", i)

Done:  3
Done:  5
Done:  7
Done:  9
Done:  11


In [None]:
# One summary line per swept k: validation accuracy and the mean of the F1 array.
sweep_line = "Neighbors: {}, accuracy: {}, f11-score: {}"
for neighbors, val_accuracy, f1_values in zip(index, accuracy, f1_score):
    print(sweep_line.format(neighbors, val_accuracy, np.mean(f1_values)))