In [1]:
from data_preprocessing import *
from model import ml_classifier_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from time import process_time

# Loading Training Data

In [2]:
training_path = "../dataset/original-dataset/marathi-training-data.csv"
train_data = read_data(training_path)
train_data

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy
...,...,...
41992,"जरी आपण डेटा कूटबद्ध केला , तरीही हा मुख्य व्य...",cse
41993,"ते म्हणतात - "" ज्याला पाहण्यासाठी डोळे , ऎकण्य...",com_tech
41994,"प्रथम क्रोनोलॉजिकल , क्रॉनोलॉजी म्हणजे आपल्याल...",com_tech
41995,"त्या थोड्या तपशीलावर येईल , जेणेकरून संपूर्ण ग...",bioche


In [3]:
train_data['text'] = train_data['text'].apply(lambda x: preprocess_data(x))
train_data

Unnamed: 0,text,label
0,प्रा प्रताप हरिदास मला वाटते महत्त्वाचा मुद्दा...,com_tech
1,विशिष्ट गोष्टींद्वारे ठराविक कायद्यांद्वारे वि...,bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,आपला अर्धा चिन्ह [DIGIT] वाजता असेल,phy
4,म्हणालो जर शेकडो हजारो किंवा लाखो विंडील्स बाह...,phy
...,...,...
41992,डेटा कूटबद्ध तरीही मुख्य व्यवस्थापन गोष्टींवर ...,cse
41993,म्हणतात - ज्याला पाहण्यासाठी डोळे ऎकण्यासाठी क...,com_tech
41994,प्रथम क्रोनोलॉजिकल क्रॉनोलॉजी आपल्याला अर्थ मा...,com_tech
41995,थोड्या तपशीलावर येईल जेणेकरून संपूर्ण गोष्टींच...,bioche


In [4]:
train_data.label.value_counts()

com_tech    17995
phy          9656
cse          9344
bioche       5002
Name: label, dtype: int64

In [5]:
x_train = train_data.text.values.tolist()
y_train = train_data.label.values.tolist()
print(len(x_train))
print(len(y_train))

41997
41997


# loading Validation Data

In [6]:
val_path = "../dataset/original-dataset/marathi-validation-data.csv"
val_data = read_data(val_path)
val_data

Unnamed: 0,text,label
0,1 ची ओळ .,cse
1,"तर , ही एक टॉवर आहे जी टॉवरवर निश्चित केली जात...",phy
2,"तर , थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी...",cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शक...,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,"नंतर वृद्धत्व , व्हॉट मोठ्या प्रमाणात फ्रॉन 12...",bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,"तर , आपण ELF शीर्षलेखासह प्रारंभ करू .",cse
3778,"तर , त्या क्रॉस्टची जाडी आहे .",phy


In [7]:
val_data['text'] = val_data['text'].apply(lambda x: preprocess_data(x))
val_data

Unnamed: 0,text,label
0,[DIGIT] ची ओळ,cse
1,टॉवर टॉवरवर निश्चित जाते नखेला जोडली जाते नंतर...,phy
2,थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी पर्थ...,cse
3,लोक शोधत आहात आपल्यासाठी काहीतरी करू शकतात,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये तंत्र असण्याचे ...,cse
...,...,...
3775,नंतर वृद्धत्व व्हॉट मोठ्या प्रमाणात फ्रॉन [DIG...,bioche
3776,सुद्धा आपल्याला सेकंदांनंतर माहित असेल जमिनीवर...,phy
3777,elf शीर्षलेखासह प्रारंभ करू,cse
3778,क्रॉस्टची जाडी,phy


In [8]:
val_data.label.value_counts()

com_tech    1505
phy          970
cse          885
bioche       420
Name: label, dtype: int64

In [9]:
x_val = val_data.text.values.tolist()
y_val = val_data.label.values.tolist()
print(len(x_val))
print(len(y_val))

3780
3780


# Computing the BoW and TF-IDF representation

In [48]:
bow_vectorizer, bow_x_train, bow_x_val = bow_vectorize(x_train, x_val, min_df=1)
len(bow_vectorizer.vocabulary_)

51775

In [49]:
tfidf_vectorizer, tfidf_x_train, tfidf_x_val = tfidf_vectorize(x_train, x_val, min_df=1)
len(tfidf_vectorizer.vocabulary_)

51775

In [50]:
n_gram_tfidf_vectorizer, n_gram_tfidf_x_train, n_gram_tfidf_x_val = n_gram_tfidf_vectorize(x_train, x_val, min_df=1)
len(n_gram_tfidf_vectorizer.vocabulary_)

356722

In [51]:
char_tfidf_vectorizer, char_tfidf_x_train, char_tfidf_x_val = char_tfidf_vectorize(x_train, x_val)
len(char_tfidf_vectorizer.vocabulary_)

27521

In [45]:
dump(bow_vectorizer, "../tokenizers/bow_vectorizer_min_df_1.pk1")
dump(tfidf_vectorizer, "../tokenizers/tfidf_vectorizer_min_df_1.pk1")
dump(n_gram_tfidf_vectorizer, "../tokenizers/n_gram_tfidf_vectorizer_min_df_1.pk1")
dump(char_tfidf_vectorizer, "../tokenizers/char_tfidf_vectorizer_min_df_1.pk1")

['../tokenizers/char_tfidf_vectorizer_min_df_2.pk1']

In [14]:
integer_y_train, integer_y_val = label_encoder(y_train, y_val)

# Computing Word Embedding Representation

In [52]:
vocab = bow_vectorizer.vocabulary_

In [53]:
def get_embedding_matrix(embedding_path, vocab):
    cnt = 0
    vocab_words = set(vocab.keys())
    embedding_matrix = np.zeros((len(vocab), 300))
    embedding_file = open(embedding_path, 'r')
    for row in embedding_file:
        row = row.split()
        word = row[0].strip()
        if word in vocab_words:
            wv = np.asarray(row[1:], dtype='float32')
            if len(wv) == 300:
                embedding_matrix[vocab[word]] = wv
                cnt = cnt + 1
    print(cnt)
    embedding_file.close()
    return embedding_matrix

In [54]:
def get_sentence_embedding(embedding_matrix, corpus, option='bow'):
    all_sentence_embeddings = []
    if option == 'bow':
        for row in corpus:
            sentence_embedding = np.zeros(300)
            for loc, value in list(zip(row.indices, row.data)):
                sentence_embedding = sentence_embedding + value*embedding_matrix[loc]
            if row.data.shape[0] != 0:
                sentence_embedding = sentence_embedding/row.data.shape[0]
            all_sentence_embeddings.append(sentence_embedding)
        all_sentence_embeddings = np.array([np.array(x) for x in all_sentence_embeddings])
        return all_sentence_embeddings
        
    elif option == 'tfidf':
        for row in corpus:
            sentence_embedding = np.zeros(300)
            for loc, value in list(zip(row.indices, row.data)):
                sentence_embedding = sentence_embedding + value*embedding_matrix[loc]
            all_sentence_embeddings.append(sentence_embedding)
        all_sentence_embeddings = np.array([np.array(x) for x in all_sentence_embeddings])
        return all_sentence_embeddings
    
    else:
        print("Invalid option")
        return text

# Official Fasttext Marathi Word Embeddings

In [55]:
start = process_time()
embedding_path1 = "/home/eastwind/word-embeddings/fasttext/cc.mr.300.vec"
embedding_matrix1 = get_embedding_matrix(embedding_path1, vocab)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix1.shape

32696
Total time taken:  17.543095697999888


(51775, 300)

In [56]:
ft_bow_x_train = get_sentence_embedding(embedding_matrix1, bow_x_train, 'bow')
ft_bow_x_val = get_sentence_embedding(embedding_matrix1, bow_x_val, 'bow')
print(ft_bow_x_train.shape)
print(ft_bow_x_val.shape)

(41997, 300)
(3780, 300)


In [57]:
ft_tfidf_x_train = get_sentence_embedding(embedding_matrix1, tfidf_x_train, 'tfidf')
ft_tfidf_x_val = get_sentence_embedding(embedding_matrix1, tfidf_x_val, 'tfidf')
print(ft_tfidf_x_train.shape)
print(ft_tfidf_x_val.shape)

(41997, 300)
(3780, 300)


# Ai4Bharat Indic-Fasttext Marathi Word Embeddings

In [58]:
start = process_time()
embedding_path2 = "/home/eastwind/word-embeddings/fasttext/indicnlp.ft.mr.300.vec"
embedding_matrix2 = get_embedding_matrix(embedding_path2, vocab)
end = process_time()
print("Total time taken: ", end-start)
embedding_matrix2.shape

29203
Total time taken:  7.498510450999902


(51775, 300)

In [59]:
indic_ft_bow_x_train = get_sentence_embedding(embedding_matrix2, bow_x_train, 'bow')
indic_ft_bow_x_val = get_sentence_embedding(embedding_matrix2, bow_x_val, 'bow')
print(indic_ft_bow_x_train.shape)
print(indic_ft_bow_x_val.shape)

(41997, 300)
(3780, 300)


In [60]:
indic_ft_tfidf_x_train = get_sentence_embedding(embedding_matrix2, tfidf_x_train, 'tfidf')
indic_ft_tfidf_x_val = get_sentence_embedding(embedding_matrix2, tfidf_x_val, 'tfidf')
print(indic_ft_tfidf_x_train.shape)
print(indic_ft_tfidf_x_val.shape)

(41997, 300)
(3780, 300)


# Multinomial Naive Bayes

In [61]:
# Naive Bayes on Count Vectors
NB_bow, NB_bow_accuracy, NB_bow_f1 = ml_classifier_model(MultinomialNB(), 
                                                         bow_x_train, bow_x_val, 
                                                         integer_y_train, integer_y_val)
print("Naive Bayes, Count Vectors Accuracy: ", NB_bow_accuracy)
print("Naive Bayes, Count Vectors F1-Score: ", NB_bow_f1)

Naive Bayes, Count Vectors Accuracy:  0.8629629629629629
Naive Bayes, Count Vectors F1-Score:  [0.79566982 0.87232704 0.88276671 0.85581395]


In [1]:
(0.79566982+0.87232704+0.88276671+0.85581395)/4

0.85164438

In [62]:
# Naive Bayes on TF-IDF
NB_tfidf, NB_tfidf_accuracy, NB_tfidf_f1 = ml_classifier_model(MultinomialNB(), 
                                                               tfidf_x_train, tfidf_x_val, 
                                                               integer_y_train, integer_y_val)
print("Naive Bayes, TF-IDF Vectors Accuracy: ", NB_tfidf_accuracy)
print("Naive Bayes, TF-IDF Vectors F1-Score: ", NB_tfidf_f1)

Naive Bayes, TF-IDF Vectors Accuracy:  0.7849206349206349
Naive Bayes, TF-IDF Vectors F1-Score:  [0.56164384 0.80538018 0.81853282 0.78695897]


In [4]:
(0.56164384+0.80538018+0.81853282+0.78695897)/4

0.7431289525000001

In [63]:
# Naive Bayes on n-gram TF-IDF
NB_ngram_tfidf, NB_ngram_tfidf_accuracy, NB_ngram_tfidf_f1 = ml_classifier_model(MultinomialNB(), 
                                                                                 n_gram_tfidf_x_train, n_gram_tfidf_x_val, 
                                                                                 integer_y_train, integer_y_val)
print("Naive Bayes, n-gram TF-IDF Vectors Accuracy: ", NB_ngram_tfidf_accuracy)
print("Naive Bayes, n-gram TF-IDF Vectors F1-Score: ", NB_ngram_tfidf_f1)

Naive Bayes, n-gram TF-IDF Vectors Accuracy:  0.6513227513227513
Naive Bayes, n-gram TF-IDF Vectors F1-Score:  [0.29208925 0.70304748 0.65473527 0.62022773]


In [3]:
(0.29208925+0.70304748+0.65473527+0.62022773)/4

0.5675249325

In [64]:
# Naive Bayes on character-level TF-IDF
NB_char_tfidf, NB_char_tfidf_accuracy, NB_char_tfidf_f1 = ml_classifier_model(MultinomialNB(), 
                                                                              char_tfidf_x_train, char_tfidf_x_val, 
                                                                              integer_y_train, integer_y_val)
print("Naive Bayes, character-level TF-IDF Vectors Accuracy: ", NB_char_tfidf_accuracy)
print("Naive Bayes, character-level TF-IDF Vectors F1-Score: ", NB_char_tfidf_f1)

Naive Bayes, character-level TF-IDF Vectors Accuracy:  0.7753968253968254
Naive Bayes, character-level TF-IDF Vectors F1-Score:  [0.61363636 0.79763912 0.79646018 0.76829268]


In [2]:
(0.61363636+0.79763912+0.79646018+0.76829268)/4

0.744007085

In [46]:
dump(NB_bow, "../models/naive-bayes/NB_bow_min_df_1.pk1")
dump(NB_tfidf, "../models/naive-bayes/NB_tfidf_min_df_1.pk1")
dump(NB_ngram_tfidf, "../models/naive-bayes/NB_ngram_tfidf_min_df_1.pk1")
dump(NB_char_tfidf, "../models/naive-bayes/NB_char_tfidf_min_df_1.pk1")

['../models/naive-bayes/NB_char_tfidf_min_df_2.pk1']

# Linear SVC

## Statsitical word representation approach

In [65]:
# Linear SVC on Count Vectors
LSVC_bow, LSVC_bow_accuracy, LSVC_bow_f1 = ml_classifier_model(LinearSVC(max_iter=2000), 
                                                               bow_x_train, bow_x_val, 
                                                               integer_y_train, integer_y_val)
print("Linear SVC, Count Vectors Accuracy: ", LSVC_bow_accuracy)
print("Linear SVC, Count Vectors F1-Score: ", LSVC_bow_f1)

Linear SVC, Count Vectors Accuracy:  0.8391534391534392
Linear SVC, Count Vectors F1-Score:  [0.77835052 0.85308363 0.84911073 0.83230453]




In [5]:
(0.77835052+0.85308363+0.84911073+0.83230453)/4

0.8282123525

In [66]:
# Linear SVC on TF-IDF
LSVC_tfidf, LSVC_tfidf_accuracy, LSVC_tfidf_f1 = ml_classifier_model(LinearSVC(), 
                                                                     tfidf_x_train, tfidf_x_val, 
                                                                     integer_y_train, integer_y_val)
print("Linear SVC, TF-IDF Vectors Accuracy: ", LSVC_tfidf_accuracy)
print("Linear SVC, TF-IDF Vectors F1-Score: ", LSVC_tfidf_f1)

Linear SVC, TF-IDF Vectors Accuracy:  0.8653439153439153
Linear SVC, TF-IDF Vectors F1-Score:  [0.81095176 0.87769328 0.87804878 0.85535248]


In [6]:
(0.81095176+0.87769328+0.87804878+0.85535248)/4

0.855511575

In [67]:
# Linear SVC on n-gram TF-IDF
LSVC_ngram_tfidf, LSVC_ngram_tfidf_accuracy, LSVC_ngram_tfidf_f1 = ml_classifier_model(LinearSVC(), 
                                                                                       n_gram_tfidf_x_train, n_gram_tfidf_x_val, 
                                                                                       integer_y_train, integer_y_val)
print("Linear SVC, n-gram TF-IDF Vectors Accuracy: ", LSVC_ngram_tfidf_accuracy)
print("Linear SVC, n-gram TF-IDF Vectors F1-Score: ", LSVC_ngram_tfidf_f1)

Linear SVC, n-gram TF-IDF Vectors Accuracy:  0.8597883597883598
Linear SVC, n-gram TF-IDF Vectors F1-Score:  [0.79896239 0.87225231 0.86886193 0.85565399]


In [7]:
(0.79896239+0.87225231+0.86886193+0.85565399)/4

0.848932655

In [68]:
# Linear SVC on character-level TF-IDF
LSVC_char_tfidf, LSVC_char_tfidf_accuracy, LSVC_char_tfidf_f1 = ml_classifier_model(LinearSVC(), 
                                                                                    char_tfidf_x_train, char_tfidf_x_val, 
                                                                                    integer_y_train, integer_y_val)
print("Linear SVC, character-level TF-IDF Vectors Accuracy: ", LSVC_char_tfidf_accuracy)
print("Linear SVC, character-level TF-IDF Vectors F1-Score: ", LSVC_char_tfidf_f1)

Linear SVC, character-level TF-IDF Vectors Accuracy:  0.8756613756613757
Linear SVC, character-level TF-IDF Vectors F1-Score:  [0.81298701 0.88675008 0.89623717 0.86399166]


In [8]:
(0.81298701+0.88675008+0.89623717+0.86399166)/4

0.86499148

## Word Embedding bassed approach

In [69]:
# Linear SVC on Count Vectors based fasttext word embeddings
LSVC_ft_bow, LSVC_ft_bow_accuracy, LSVC_ft_bow_f1 = ml_classifier_model(LinearSVC(), 
                                                                        ft_bow_x_train, ft_bow_x_val, 
                                                                        integer_y_train, integer_y_val)
print("Linear SVC, Count Vectors based word embeddings Accuracy: ", LSVC_ft_bow_accuracy)
print("Linear SVC, Count Vectors based word embeddings F1-Score: ", LSVC_ft_bow_f1)

Linear SVC, Count Vectors based word embeddings Accuracy:  0.7288359788359788
Linear SVC, Count Vectors based word embeddings F1-Score:  [0.61849711 0.78284343 0.6971831  0.70163934]


In [5]:
(0.61849711+0.78284343+0.6971831+0.70163934)/4

0.7000407449999999

In [70]:
# Linear SVC on TF-IDF based fasttext  word embeddings
LSVC_ft_tfidf, LSVC_ft_tfidf_accuracy, LSVC_ft_tfidf_f1 = ml_classifier_model(LinearSVC(), 
                                                                              ft_tfidf_x_train, ft_tfidf_x_val, 
                                                                              integer_y_train, integer_y_val)
print("Linear SVC, TF-IDF based word embeddings Accuracy: ", LSVC_ft_tfidf_accuracy)
print("Linear SVC, TF-IDF based word embeddings F1-Score: ", LSVC_ft_tfidf_f1)

Linear SVC, TF-IDF based word embeddings Accuracy:  0.7293650793650793
Linear SVC, TF-IDF based word embeddings F1-Score:  [0.63366337 0.7831141  0.70086455 0.69767442]


In [6]:
(0.63366337+0.7831141+0.70086455+0.69767442)/4

0.70382911

In [71]:
# Linear SVC on Count Vectors based indic fasttext word embeddings
LSVC_indic_ft_bow, LSVC_indic_ft_bow_accuracy, LSVC_indic_ft_bow_f1 = ml_classifier_model(LinearSVC(max_iter=5000), 
                                                                                          indic_ft_bow_x_train, indic_ft_bow_x_val, 
                                                                                          integer_y_train, integer_y_val)
print("Linear SVC, Count Vectors based word embeddings Accuracy: ", LSVC_indic_ft_bow_accuracy)
print("Linear SVC, Count Vectors based word embeddings F1-Score: ", LSVC_indic_ft_bow_f1)

Linear SVC, Count Vectors based word embeddings Accuracy:  0.780952380952381
Linear SVC, Count Vectors based word embeddings F1-Score:  [0.67671233 0.82084894 0.7720504  0.76170213]


In [7]:
(0.67671233+0.82084894+0.7720504+0.76170213)/4

0.7578284499999999

In [43]:
# Linear SVC on TF-IDF based indic fasttext word embeddings
LSVC_indic_ft_tfidf, LSVC_indic_ft_tfidf_accuracy, LSVC_indic_ft_tfidf_f1 = ml_classifier_model(LinearSVC(max_iter=5000), 
                                                                                                indic_ft_tfidf_x_train, indic_ft_tfidf_x_val, 
                                                                                                integer_y_train, integer_y_val)
print("Linear SVC, TF-IDF based word embeddings Accuracy: ", LSVC_indic_ft_tfidf_accuracy)
print("Linear SVC, TF-IDF based word embeddings F1-Score: ", LSVC_indic_ft_tfidf_f1)

Linear SVC, TF-IDF based word embeddings Accuracy:  0.771957671957672
Linear SVC, TF-IDF based word embeddings F1-Score:  [0.67025572 0.81628847 0.75728155 0.74959438]




In [8]:
(0.67025572+0.81628847+0.75728155+0.74959438)/4

0.74835503

In [47]:
dump(LSVC_bow, "../models/linear-svc/LSVC_bow_min_df_1.pk1")
dump(LSVC_tfidf, "../models/linear-svc/LSVC_tfidf_min_df_1.pk1")
dump(LSVC_ngram_tfidf, "../models/linear-svc/LSVC_ngram_tfidf_min_df_1.pk1")
dump(LSVC_char_tfidf, "../models/linear-svc/LSVC_char_tfidf_min_df_1.pk1")
dump(LSVC_ft_bow, "../models/linear-svc/LSVC_ft_bow_min_df_1.pk1")
dump(LSVC_ft_tfidf, "../models/linear-svc/LSVC_ft_tfidf_min_df_1.pk1")
dump(LSVC_indic_ft_bow, "../models/linear-svc/LSVC_indic_ft_bow_min_df_1.pk1")
dump(LSVC_indic_ft_tfidf, "../models/linear-svc/LSVC_indic_ft_tfidf_min_df_1.pk1")

['../models/linear-svc/LSVC_indic_ft_tfidf_min_df_2.pk1']

# Random Forest

In [30]:
# Random Forest on  Count Vectors
RF_bow, RF_bow_accuracy = ml_classifier_model(RandomForestClassifier(), 
                                              bow_train, bow_x_val, 
                                              integer_y_train, integer_y_val)
print("Random Forest, Count Vectors Accuracy: ", RF_bow_accuracy)

Random Forest, Count Vectors:  0.7584656084656085


In [51]:
dump(RF_bow, "models/RF_bow.pk1")

['models/RF_bow.pk1']

In [31]:
# Random Forest on TF-IDF
RF_tfidf, RF_tfidf_accuracy = ml_classifier_model(RandomForestClassifier(), 
                                                  tfidf_x_train, tfidf_x_val, 
                                                  integer_y_train, integer_y_val)
print("Random Forest, TF-IDF Vectors Accuracy: ", RF_tfidf_accuracy)

Random Forest, TF-IDF Vectors:  0.7505291005291005


In [52]:
dump(RF_tfidf, "models/RF_tfidf.pk1")

['models/RF_tfidf.pk1']

In [42]:
# Random Forest on n-gram TF-IDF
RF_ngram_tfidf, RF_ngram_tfidf_accuracy = ml_classifier_model(RandomForestClassifier(), 
                                                              n_gram_tfidf_x_train, n_gram_tfidf_x_val, 
                                                              integer_y_train, integer_y_val)
print("Random Forest, n-gram TF-IDF Vectors Accuracy: ", RF_ngram_tfidf_accuracy)

Random Forest, n-gram TF-IDF Vectors:  0.741005291005291


In [53]:
dump(RF_ngram_tfidf, "models/RF_ngram_tfidf.pk1")

['models/RF_ngram_tfidf.pk1']

In [58]:
# Random Forest on character-level TF-IDF
RF_char_tfidf, RF_char_tfidf_accuracy = ml_classifier_model(RandomForestClassifier(), 
                                                            char_tfidf_x_train, char_tfidf_x_val, 
                                                            integer_y_train, integer_y_val)
print("Random Forest, character-level TF-IDF Vectors Accuracy: ", RF_char_tfidf_accuracy)

Random Forest, character-level TF-IDF Vectors:  0.7425925925925926


In [59]:
dump(RF_char_tfidf, "models/RF_char_tfidf.pk1")

['models/RF_char_tfidf.pk1']

# K-Nearest Neighbors

In [26]:
# K-Nearest Neighbors on  Count Vectors
knn_bow, knn_bow_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11), 
                                                bow_train, bow_x_val, 
                                                integer_y_train, integer_y_val)
print("K-Nearest Neighbors, Count Vectors Accuracy: ", knn_bow_accuracy)

K-Nearest Neighbors, Count Vectors:  0.5005291005291005


In [54]:
dump(knn_bow, "models/knn_bow.pk1")

['models/knn_bow.pk1']

In [27]:
# Random Forest on TF-IDF
knn_tfidf, knn_tfidf_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11), 
                                                    tfidf_x_train, tfidf_x_val, 
                                                    integer_y_train, integer_y_val)
print("Random Forest, TF-IDF Vectors Accuracy: ", knn_tfidf_accuracy)

Random Forest, TF-IDF Vectors:  0.7825396825396825


In [55]:
dump(knn_tfidf, "models/knn_tfidf.pk1")

['models/knn_tfidf.pk1']

In [28]:
# Random Forest on n-gram TF-IDF
knn_ngram_tfidf, knn_ngram_tfidf_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11), 
                                                                n_gram_tfidf_x_train, n_gram_tfidf_x_val, 
                                                                integer_y_train, integer_y_val)
print("Random Forest, n-gram TF-IDF Vectors Accuracy: ", knn_ngram_tfidf_accuracy)

Random Forest, n-gram TF-IDF Vectors:  0.7412698412698413


In [56]:
dump(knn_ngram_tfidf, "models/knn_ngram_tfidf.pk1")

['models/knn_ngram_tfidf.pk1']

In [29]:
# Random Forest on character-level TF-IDF
knn_char_tfidf, knn_char_tfidf_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11), 
                                                              char_tfidf_x_train, char_tfidf_x_val, 
                                                              integer_y_train, integer_y_val)
print("Random Forest, character-level TF-IDF Vectors Accuracy: ", knn_char_tfidf_accuracy)

Random Forest, character-level TF-IDF Vectors:  0.8132275132275132


In [57]:
dump(knn_char_tfidf, "models/knn_char_tfidf.pk1")

['models/knn_char_tfidf.pk1']