In [1]:
from time import process_time

import numpy as np
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC

# NOTE(review): wildcard import; helpers such as read_data, get_stopwords and
# the *_vectorize functions presumably come from here. Prefer explicit names.
from data_preprocessing import *
from model import ml_classifier_model, classification_report

In [2]:
# Load the stop-word list that is later passed to preprocess_data.
# The path is relative to the notebook's working directory.
stopword_list = get_stopwords("../marathi-stopwords.txt")

# Loading Data

In [3]:
# Read the training split through the project helper read_data
# (presumably returns a pandas DataFrame with `text` and `label` columns — confirm).
training_path = "../dataset/original-dataset/marathi-training-data.csv"
train_data = read_data(training_path)
# Bare last expression so the notebook renders the loaded data.
train_data

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy
...,...,...
41992,"जरी आपण डेटा कूटबद्ध केला , तरीही हा मुख्य व्य...",cse
41993,"ते म्हणतात - "" ज्याला पाहण्यासाठी डोळे , ऎकण्य...",com_tech
41994,"प्रथम क्रोनोलॉजिकल , क्रॉनोलॉजी म्हणजे आपल्याल...",com_tech
41995,"त्या थोड्या तपशीलावर येईल , जेणेकरून संपूर्ण ग...",bioche


In [4]:
# Number of training examples per label, to inspect class balance.
train_data.label.value_counts()

com_tech    17995
phy          9656
cse          9344
bioche       5002
Name: label, dtype: int64

In [5]:
# Read the validation split with the same helper used for the training split.
val_path = "../dataset/original-dataset/marathi-validation-data.csv"
val_data = read_data(val_path)
# Bare last expression so the notebook renders the loaded data.
val_data

Unnamed: 0,text,label
0,1 ची ओळ .,cse
1,"तर , ही एक टॉवर आहे जी टॉवरवर निश्चित केली जात...",phy
2,"तर , थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी...",cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शक...,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,"नंतर वृद्धत्व , व्हॉट मोठ्या प्रमाणात फ्रॉन 12...",bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,"तर , आपण ELF शीर्षलेखासह प्रारंभ करू .",cse
3778,"तर , त्या क्रॉस्टची जाडी आहे .",phy


In [6]:
# Number of validation examples per label, to compare with the training distribution.
val_data.label.value_counts()

com_tech    1505
phy          970
cse          885
bioche       420
Name: label, dtype: int64

In [7]:
def _clean_text(raw_text):
    """Run the project's preprocess_data on one text using the loaded stop-word list."""
    return preprocess_data(stopword_list, raw_text)


# Preprocess the text column of each split and pull the labels out as plain lists.
x_train = train_data["text"].apply(_clean_text).tolist()
y_train = train_data["label"].tolist()
x_val = val_data["text"].apply(_clean_text).tolist()
y_val = val_data["label"].tolist()

# Sanity check: texts and labels must have matching lengths within each split.
for split in (x_train, y_train, x_val, y_val):
    print(len(split))

41997
41997
3780
3780


In [8]:
# Encode the label lists with the project helper label_encoder.
# NOTE: this overwrites y_train / y_val with the helper's output, so re-running
# the cell feeds already-encoded labels back in; re-run the previous cell first.
y_train, y_val = label_encoder(y_train, y_val)

# Computing the BoW and TF-IDF representation

In [9]:
# Build the bag-of-words representation of both splits; the helper returns the
# vectorizer together with the transformed train and validation matrices.
bow_vectorizer, bow_x_train, bow_x_val = bow_vectorize(x_train, x_val, min_df=1)
# Vocabulary size of the fitted vectorizer.
len(bow_vectorizer.vocabulary_)

51799

In [10]:
# Build the word-level TF-IDF representation of both splits (same call shape as the BoW cell).
tfidf_vectorizer, tfidf_x_train, tfidf_x_val = tfidf_vectorize(x_train, x_val, min_df=1)
# Vocabulary size of the fitted vectorizer.
len(tfidf_vectorizer.vocabulary_)

51799

In [11]:
# Build the n-gram TF-IDF representation of both splits
# (the n-gram range is set inside n_gram_tfidf_vectorize and is not visible here).
n_gram_tfidf_vectorizer, n_gram_tfidf_x_train, n_gram_tfidf_x_val = n_gram_tfidf_vectorize(x_train, x_val, min_df=1)
# Vocabulary size of the fitted vectorizer.
len(n_gram_tfidf_vectorizer.vocabulary_)

354606

In [12]:
# Build the character-level TF-IDF representation of both splits.
# Unlike the other vectorizer calls, no min_df is passed, so the helper's default applies.
char_tfidf_vectorizer, char_tfidf_x_train, char_tfidf_x_val = char_tfidf_vectorize(x_train, x_val)
# Vocabulary size of the fitted vectorizer.
len(char_tfidf_vectorizer.vocabulary_)

26786

In [None]:
# Persist the four fitted vectorizers with joblib so they can be reloaded without refitting.
# NOTE(review): the ".pk1" suffix (digit one) looks like a typo for ".pkl"; left unchanged
# because other code may load these exact paths — confirm before renaming.
dump(bow_vectorizer, "../tokenizers/bow_vectorizer_min_df_1.pk1")
dump(tfidf_vectorizer, "../tokenizers/tfidf_vectorizer_min_df_1.pk1")
dump(n_gram_tfidf_vectorizer, "../tokenizers/n_gram_tfidf_vectorizer_min_df_1.pk1")
dump(char_tfidf_vectorizer, "../tokenizers/char_tfidf_vectorizer_min_df_1.pk1")

In [13]:
# Vocabulary of the fitted BoW vectorizer; passed to get_embedding_matrix in the cells below.
vocab = bow_vectorizer.vocabulary_

# Ai4Bharat Indic-Fasttext Marathi Word Embeddings

In [14]:
# Build an embedding matrix for the BoW vocabulary from the Indic fastText Marathi vectors.
# process_time() measures CPU time of this process, not wall-clock time.
start = process_time()
# NOTE(review): absolute path into one user's home directory; the notebook will not
# run elsewhere without editing it.
embedding_path1 = "/home/eastwind/word-embeddings/fasttext/indicnlp.ft.mr.300.vec"
# The third argument (300) is presumably the embedding dimension — confirm against the helper.
embedding_matrix1 = get_embedding_matrix(embedding_path1, vocab, 300)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix1.shape

29155
Total time taken:  8.653920267


(51800, 300)

In [15]:
# Sentence embeddings from the Indic fastText matrix and the BoW matrices
# ('bow' presumably selects count-based weighting inside the helper — confirm).
ft_bow_x_train, ft_bow_x_val = (
    get_sentence_embedding(embedding_matrix1, doc_term_matrix, 'bow')
    for doc_term_matrix in (bow_x_train, bow_x_val)
)
print(ft_bow_x_train.shape, ft_bow_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


In [16]:
# Sentence embeddings from the Indic fastText matrix and the TF-IDF matrices
# ('tfidf' presumably selects TF-IDF weighting inside the helper — confirm).
ft_tfidf_x_train, ft_tfidf_x_val = (
    get_sentence_embedding(embedding_matrix1, doc_term_matrix, 'tfidf')
    for doc_term_matrix in (tfidf_x_train, tfidf_x_val)
)
print(ft_tfidf_x_train.shape, ft_tfidf_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


# Domain Specific fasttext Word Embeddings

In [17]:
# Build an embedding matrix for the BoW vocabulary from the domain-specific fastText vectors.
# process_time() measures CPU time of this process, not wall-clock time.
start = process_time()
# NOTE(review): absolute path into one user's home directory; the notebook will not
# run elsewhere without editing it.
embedding_path2 = "/home/eastwind/word-embeddings/fasttext/TechDofication.mr.cleaned.ft.skipgram.d300.vec"
# The third argument (300) is presumably the embedding dimension — confirm against the helper.
embedding_matrix2 = get_embedding_matrix(embedding_path2, vocab, 300)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix2.shape

51795
Total time taken:  4.836589748999998


(51800, 300)

In [18]:
# Sentence embeddings from the domain-specific matrix and the BoW matrices
# ('bow' presumably selects count-based weighting inside the helper — confirm).
ds_bow_x_train, ds_bow_x_val = (
    get_sentence_embedding(embedding_matrix2, doc_term_matrix, 'bow')
    for doc_term_matrix in (bow_x_train, bow_x_val)
)
print(ds_bow_x_train.shape, ds_bow_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


In [19]:
# Sentence embeddings from the domain-specific matrix and the TF-IDF matrices
# ('tfidf' presumably selects TF-IDF weighting inside the helper — confirm).
ds_tfidf_x_train, ds_tfidf_x_val = (
    get_sentence_embedding(embedding_matrix2, doc_term_matrix, 'tfidf')
    for doc_term_matrix in (tfidf_x_train, tfidf_x_val)
)
print(ds_tfidf_x_train.shape, ds_tfidf_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


# Multinomial Naive Bayes

In [21]:
# Naive Bayes on Count Vectors
# ml_classifier_model (project helper) returns the model and its validation predictions.
NB_bow, NB__bow_predictions = ml_classifier_model(
    MultinomialNB(),
    bow_x_train, bow_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB__bow_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.8621693121693121

Precision:  [0.92163009 0.82678571 0.91799266 0.85684647]
Average Precision:  0.8808137343543497

Recall:  [0.7        0.92292359 0.84745763 0.85154639]
Average nRecall:  0.8304819017277721

F1-Score:  [0.79566982 0.8722135  0.8813161  0.85418821]
Average F1-Score:  0.8508469086351682


In [22]:
# Naive Bayes on TF-IDF
# ml_classifier_model (project helper) returns the model and its validation predictions.
NB_tfidf, NB__tfidf_predictions = ml_classifier_model(
    MultinomialNB(),
    tfidf_x_train, tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB__tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.7849206349206349

Precision:  [1.         0.68600842 0.9505988  0.86509901]
Average Precision:  0.8754265588297823

Recall:  [0.39761905 0.97408638 0.71751412 0.72061856]
Average nRecall:  0.7024595268378514

F1-Score:  [0.56899489 0.80505217 0.81777205 0.78627672]
Average F1-Score:  0.744523956976183


In [23]:
# Naive Bayes on n-gram TF-IDF
# ml_classifier_model (project helper) returns the model and its validation predictions.
NB_ngram_tfidf, NB_ngram_predictions = ml_classifier_model(
    MultinomialNB(),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_ngram_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.6502645502645502

Precision:  [0.98611111 0.54445664 0.96247241 0.88697318]
Average Precision:  0.8450033346056356

Recall:  [0.16904762 0.98870432 0.49265537 0.47731959]
Average nRecall:  0.5319317232112502

F1-Score:  [0.28861789 0.70221803 0.65171898 0.62064343]
Average F1-Score:  0.5657995821858002


In [24]:
# Naive Bayes on character-level TF-IDF
# ml_classifier_model (project helper) returns the model and its validation predictions.
NB_char_tfidf, NB_char_predictions = ml_classifier_model(
    MultinomialNB(),
    char_tfidf_x_train, char_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, NB_char_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.7748677248677248

Precision:  [0.98445596 0.69151545 0.89044944 0.83133971]
Average Precision:  0.8494401396048793

Recall:  [0.45238095 0.93687708 0.71638418 0.71649485]
Average nRecall:  0.7055342637361744

F1-Score:  [0.61990212 0.79571106 0.79398873 0.7696567 ]
Average F1-Score:  0.7448146526054364


In [35]:
# Persist the four Naive Bayes models with joblib; the cell output is the return
# value of the last dump call.
# NOTE(review): the ".pk1" suffix (digit one) looks like a typo for ".pkl"; left unchanged
# because other code may load these exact paths — confirm before renaming.
dump(NB_bow, "../models/naive-bayes/NB-bow.pk1")
dump(NB_tfidf, "../models/naive-bayes/NB-tfidf.pk1")
dump(NB_ngram_tfidf, "../models/naive-bayes/NB-ngram-tfidf.pk1")
dump(NB_char_tfidf, "../models/naive-bayes/NB-char-tfidf.pk1")

['../models/naive-bayes/NB-char-tfidf.pk1']

# Linear SVC

## Statistical word representation approach

In [25]:
# Linear SVC on Count Vectors
# max_iter is raised to 2000 here; the TF-IDF variants use the LinearSVC default.
LSVC_bow, LSVC_bow_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000),
    bow_x_train, bow_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_bow_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.83994708994709

Precision:  [0.84636872 0.82914573 0.8685446  0.83026585]
Average Precision:  0.8435812233341846

Recall:  [0.72142857 0.87707641 0.83615819 0.8371134 ]
Average nRecall:  0.8179441443852389

F1-Score:  [0.77892031 0.85243784 0.85204375 0.83367556]
Average F1-Score:  0.8292693674592789


In [26]:
# Linear SVC on TF-IDF
# ml_classifier_model (project helper) returns the model and its validation predictions.
LSVC_tfidf, LSVC_tfidf_predictions = ml_classifier_model(
    LinearSVC(),
    tfidf_x_train, tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.8669312169312169

Precision:  [0.89942529 0.83898305 0.90572792 0.86942675]
Average Precision:  0.8783907533559553

Recall:  [0.7452381  0.92093023 0.85762712 0.8443299 ]
Average nRecall:  0.8420313358368798

F1-Score:  [0.81510417 0.87804878 0.88102147 0.85669456]
Average F1-Score:  0.8577172454992206


In [27]:
# Linear SVC on n-gram TF-IDF
# ml_classifier_model (project helper) returns the model and its validation predictions.
LSVC_ngram_tfidf, LSVC_ngram_tfidf_predictions = ml_classifier_model(
    LinearSVC(),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.8603174603174604

Precision:  [0.88352273 0.83618582 0.88770686 0.86892178]
Average Precision:  0.8690842945085285

Recall:  [0.74047619 0.9089701  0.84858757 0.84742268]
Average nRecall:  0.8363641352944512

F1-Score:  [0.80569948 0.87106017 0.86770653 0.85803758]
Average F1-Score:  0.8506259400229106


In [28]:
# Linear SVC on character-level TF-IDF
# ml_classifier_model (project helper) returns the model and its validation predictions.
LSVC_char_tfidf, LSVC_char_tfidf_predictions = ml_classifier_model(
    LinearSVC(),
    char_tfidf_x_train, char_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.8693121693121693

Precision:  [0.89488636 0.85545171 0.88195233 0.87154989]
Average Precision:  0.8759600744440346

Recall:  [0.75       0.91229236 0.8779661  0.84639175]
Average nRecall:  0.8466625532690555

F1-Score:  [0.81606218 0.8829582  0.8799547  0.85878661]
Average F1-Score:  0.8594404215720317


## Indic-Word Embedding based approach

In [31]:
# Linear SVC on Count Vectors based indic fasttext word embeddings
# max_iter is raised to 2000 here; the other embedding variants use the LinearSVC default.
LSVC_ft_bow, LSVC_ft_bow_predictions = ml_classifier_model(
    LinearSVC(max_iter=2000),
    ft_bow_x_train, ft_bow_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ft_bow_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.7775132275132275

Precision:  [0.78594249 0.77227139 0.78103044 0.78104575]
Average Precision:  0.780072518760579

Recall:  [0.58571429 0.86976744 0.75367232 0.73917526]
Average nRecall:  0.7370823254227226

F1-Score:  [0.67121419 0.818125   0.76710753 0.7595339 ]
Average F1-Score:  0.7539951549093648




In [32]:
# Linear SVC on TF-IDF based indic fasttext word embeddings
# ml_classifier_model (project helper) returns the model and its validation predictions.
LSVC_ft_tfidf, LSVC_ft_tfidf_predictions = ml_classifier_model(
    LinearSVC(),
    ft_tfidf_x_train, ft_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ft_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.7735449735449735

Precision:  [0.76380368 0.77008798 0.77842907 0.77901786]
Average Precision:  0.7728346471302543

Recall:  [0.59285714 0.87242525 0.75028249 0.71958763]
Average nRecall:  0.733788126692066

F1-Score:  [0.66756032 0.81806854 0.76409666 0.74812433]
Average F1-Score:  0.7494624626225255




## Domain Specific Word Embedding based approach

In [33]:
# Linear SVC on Count Vectors based domain specific word embeddings
# ml_classifier_model (project helper) returns the model and its validation predictions.
LSVC_ds_bow, LSVC_ds_bow_predictions = ml_classifier_model(
    LinearSVC(),
    ds_bow_x_train, ds_bow_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ds_bow_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.8457671957671957

Precision:  [0.85422741 0.82735562 0.88915375 0.8363064 ]
Average Precision:  0.8517607959142947

Recall:  [0.69761905 0.90431894 0.84293785 0.82164948]
Average nRecall:  0.8166313305348878

F1-Score:  [0.76802097 0.86412698 0.86542923 0.82891316]
Average F1-Score:  0.8316225862119562




In [34]:
# Linear SVC on TF-IDF based domain specific word embeddings
# ml_classifier_model (project helper) returns the model and its validation predictions.
LSVC_ds_tfidf, LSVC_ds_tfidf_predictions = ml_classifier_model(
    LinearSVC(),
    ds_tfidf_x_train, ds_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, LSVC_ds_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.8494708994708995

Precision:  [0.88855422 0.81195079 0.89927184 0.86041439]
Average Precision:  0.8650478117885909

Recall:  [0.70238095 0.92093023 0.83728814 0.81340206]
Average nRecall:  0.8185003455969955

F1-Score:  [0.78457447 0.8630137  0.86717379 0.83624801]
Average F1-Score:  0.8377524913183791




In [36]:
# Persist the eight Linear SVC models with joblib; the cell output is the return
# value of the last dump call.
# NOTE(review): the ".pk1" suffix (digit one) looks like a typo for ".pkl"; left unchanged
# because other code may load these exact paths — confirm before renaming.
dump(LSVC_bow, "../models/linear-svc/LSVC-bow.pk1")
dump(LSVC_tfidf, "../models/linear-svc/LSVC-tfidf.pk1")
dump(LSVC_ngram_tfidf, "../models/linear-svc/LSVC-ngram-tfidf.pk1")
dump(LSVC_char_tfidf, "../models/linear-svc/LSVC-char-tfidf.pk1")
# The fastText-based models (LSVC_ft_*) are saved under "indic" file names.
dump(LSVC_ft_bow, "../models/linear-svc/LSVC-indic-bow.pk1")
dump(LSVC_ft_tfidf, "../models/linear-svc/LSVC-indic-tfidf.pk1")
dump(LSVC_ds_bow, "../models/linear-svc/LSVC-ds-bow.pk1")
# NOTE(review): "LSVC_-ds-tfidf" has a stray underscore and breaks the "LSVC-<name>"
# pattern of the other files; verify against whatever loads it before fixing.
dump(LSVC_ds_tfidf, "../models/linear-svc/LSVC_-ds-tfidf.pk1")

['../models/linear-svc/LSVC_-ds-tfidf.pk1']

# Random Forest

In [None]:
# Random Forest on Count Vectors
# random_state is fixed so the forest (and the metrics below) are reproducible across runs.
RF_bow, RF_bow_predictions = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    bow_x_train, bow_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, RF_bow_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

In [31]:
# Random Forest on TF-IDF
# random_state is fixed so the forest (and the metrics below) are reproducible across runs.
RF_tfidf, RF_tfidf_predictions = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    tfidf_x_train, tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, RF_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Random Forest, TF-IDF Vectors:  0.7505291005291005


In [42]:
# Random Forest on n-gram TF-IDF
# random_state is fixed so the forest (and the metrics below) are reproducible across runs.
RF_ngram_tfidf, RF_ngram_tfidf_predictions = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, RF_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Random Forest, n-gram TF-IDF Vectors:  0.741005291005291


In [58]:
# Random Forest on character-level TF-IDF
# random_state is fixed so the forest (and the metrics below) are reproducible across runs.
RF_char_tfidf, RF_char_tfidf_predictions = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    char_tfidf_x_train, char_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, RF_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Random Forest, character-level TF-IDF Vectors:  0.7425925925925926


# K-Nearest Neighbors

In [38]:
# K-Nearest Neighbors on Count Vectors
# All KNN cells in this section use n_neighbors=11.
knn_bow, knn_bow_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    bow_x_train, bow_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_bow_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.4746031746031746

Precision:  [0.33957219 0.55125952 0.59060403 0.39257673]
Average Precision:  0.46850311747264384

Recall:  [0.30238095 0.62524917 0.19887006 0.56701031]
Average nRecall:  0.42337762189792344

F1-Score:  [0.31989924 0.58592777 0.29754861 0.46393927]
Average F1-Score:  0.41682872164127943


In [39]:
# K-Nearest Neighbors on TF-IDF
# All KNN cells in this section use n_neighbors=11.
knn_tfidf, knn_tfidf_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    tfidf_x_train, tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.3523809523809524

Precision:  [0.16705882 0.43350785 0.69230769 0.28876245]
Average Precision:  0.3954092039743572

Recall:  [0.16904762 0.55016611 0.03050847 0.4185567 ]
Average nRecall:  0.29206972690290717

F1-Score:  [0.16804734 0.48491947 0.05844156 0.34175084]
Average F1-Score:  0.2632898025960308


In [40]:
# K-Nearest Neighbors on n-gram TF-IDF
# All KNN cells in this section use n_neighbors=11.
knn_ngram_tfidf, knn_ngram_tfidf_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_ngram_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.3547619047619048

Precision:  [0.09090909 0.41197183 0.8        0.27402135]
Average Precision:  0.3942255685520435

Recall:  [0.01904762 0.62192691 0.01355932 0.39690722]
Average nRecall:  0.2628602669688415

F1-Score:  [0.03149606 0.49563145 0.02666667 0.32421053]
Average F1-Score:  0.2195011773772833


In [41]:
# K-Nearest Neighbors on character-level TF-IDF
# All KNN cells in this section use n_neighbors=11.
knn_char_tfidf, knn_char_tfidf_predictions = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    char_tfidf_x_train, char_tfidf_x_val,
    y_train, y_val,
)

# classification_report is the project helper imported from `model`; the precision,
# recall and F1 values are printed as returned and then averaged with np.mean.
acc, precision, recall, f1 = classification_report(y_val, knn_char_tfidf_predictions)
print("Validation Accuracy: ", acc)
print("\nPrecision: ", precision)
print("Average Precision: ", np.mean(precision))
print("\nRecall: ", recall)
print("Average Recall: ", np.mean(recall))
print("\nF1-Score: ", f1)
print("Average F1-Score: ", np.mean(f1))

Validation Accuracy:  0.3962962962962963

Precision:  [0.24164524 0.46460177 0.88288288 0.31657609]
Average Precision:  0.47642649599171183

Recall:  [0.22380952 0.55813953 0.11073446 0.48041237]
Average nRecall:  0.3432739732760254

F1-Score:  [0.23238566 0.50709327 0.19678715 0.38165438]
Average F1-Score:  0.329480115124889
