In [1]:
from data_preprocessing import *
from model import ml_classifier_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from joblib import dump
from time import process_time

# Loading Training Data

In [2]:
# Load the training split with the project's read_data helper (not defined in
# this notebook). The bare name on the last line renders the resulting frame.
training_path = "dataset/marathi-training-data.tsv"
train_data = read_data(training_path)
train_data

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy
...,...,...
41992,"जरी आपण डेटा कूटबद्ध केला , तरीही हा मुख्य व्य...",cse
41993,"ते म्हणतात - "" ज्याला पाहण्यासाठी डोळे , ऎकण्य...",com_tech
41994,"प्रथम क्रोनोलॉजिकल , क्रॉनोलॉजी म्हणजे आपल्याल...",com_tech
41995,"त्या थोड्या तपशीलावर येईल , जेणेकरून संपूर्ण ग...",bioche


In [3]:
# Run every training sentence through the project's preprocess_data helper.
# The helper is handed to .apply directly instead of via a pass-through lambda.
train_data['text'] = train_data['text'].apply(preprocess_data)
train_data

Unnamed: 0,text,label
0,प्रा प्रताप हरिदास होय मला वाटते की हा एक महत्...,com_tech
1,तर विशिष्ट गोष्टींद्वारे ठराविक कायद्यांद्वारे...,bioche
2,म्हणून डेटा कुठे स्थित आहे जेथे विस्तार इत्याद...,cse
3,तर आपला अर्धा चिन्ह 9 वाजता असेल,phy
4,म्हणून मी असे म्हणालो की जर शेकडो हजारो किंवा ...,phy
...,...,...
41992,जरी आपण डेटा कूटबद्ध केला तरीही हा मुख्य व्यवस...,cse
41993,ते म्हणतात ज्याला पाहण्यासाठी डोळे ऎकण्यासाठी ...,com_tech
41994,प्रथम क्रोनोलॉजिकल क्रॉनोलॉजी म्हणजे आपल्याला ...,com_tech
41995,त्या थोड्या तपशीलावर येईल जेणेकरून संपूर्ण गोष...,bioche


In [4]:
# Class distribution of the training split (rows per label).
train_data["label"].value_counts()

com_tech    17995
phy          9656
cse          9344
bioche       5002
Name: label, dtype: int64

In [5]:
# Extract sentences and labels from the frame as plain Python lists.
x_train = train_data["text"].values.tolist()
y_train = train_data["label"].values.tolist()
# Print both lengths, one per line, as a quick consistency check.
print(len(x_train), len(y_train), sep="\n")

41997
41997


# Loading Validation Data

In [6]:
# Load the validation split with the project's read_data helper (not defined in
# this notebook). The bare name on the last line renders the resulting frame.
val_path = "dataset/marathi-validation-data.tsv"
val_data = read_data(val_path)
val_data

Unnamed: 0,text,label
0,1 ची ओळ .,cse
1,"तर , ही एक टॉवर आहे जी टॉवरवर निश्चित केली जात...",phy
2,"तर , थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी...",cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शक...,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,"नंतर वृद्धत्व , व्हॉट मोठ्या प्रमाणात फ्रॉन 12...",bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,"तर , आपण ELF शीर्षलेखासह प्रारंभ करू .",cse
3778,"तर , त्या क्रॉस्टची जाडी आहे .",phy


In [7]:
# Run every validation sentence through the project's preprocess_data helper.
# The helper is handed to .apply directly instead of via a pass-through lambda.
val_data['text'] = val_data['text'].apply(preprocess_data)
val_data

Unnamed: 0,text,label
0,1 ची ओळ,cse
1,तर ही एक टॉवर आहे जी टॉवरवर निश्चित केली जाते ...,phy
2,तर थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी प...,cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शकतात,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,नंतर वृद्धत्व व्हॉट मोठ्या प्रमाणात फ्रॉन 12 र...,bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,तर आपण ELF शीर्षलेखासह प्रारंभ करू,cse
3778,तर त्या क्रॉस्टची जाडी आहे,phy


In [8]:
# Class distribution of the validation split (rows per label).
val_data["label"].value_counts()

com_tech    1505
phy          970
cse          885
bioche       420
Name: label, dtype: int64

In [9]:
# Extract validation sentences and labels from the frame as plain Python lists.
x_val = val_data["text"].values.tolist()
y_val = val_data["label"].values.tolist()
# Print both lengths, one per line, as a quick consistency check.
print(len(x_val), len(y_val), sep="\n")

3780
3780


# Computing the BoW and TF-IDF representations

In [10]:
# Build count-vector (bag-of-words) features for both splits with the project's
# bow_vectorize helper, then show the size of the returned vectorizer's vocabulary.
# NOTE(review): presumably the vectorizer is fitted on x_train only — confirm in data_preprocessing.
bow_vectorizer, bow_x_train, bow_x_val = bow_vectorize(x_train, x_val)
len(bow_vectorizer.vocabulary_)

52502

In [34]:
# Serialize the BoW vectorizer with joblib; the call returns the list of written file names.
# NOTE(review): the extension is ".pk1" (digit one), possibly meant as ".pkl"; kept as-is
# because other code may load this exact path.
dump(value=bow_vectorizer, filename="tokenizers/bow_vectorizer.pk1")

['tokenizers/bow_vectorizer.pk1']

In [11]:
# Build word-level TF-IDF features for both splits with the project's
# tfidf_vectorize helper, then show the size of the returned vectorizer's vocabulary.
tfidf_vectorizer, tfidf_x_train, tfidf_x_val = tfidf_vectorize(x_train, x_val)
len(tfidf_vectorizer.vocabulary_)

52502

In [35]:
# Serialize the TF-IDF vectorizer with joblib; the call returns the list of written file names.
dump(value=tfidf_vectorizer, filename="tokenizers/tfidf_vectorizer.pk1")

['tokenizers/tfidf_vectorizer.pk1']

In [12]:
# Build n-gram TF-IDF features for both splits with the project's
# n_gram_tfidf_vectorize helper (n-gram range is set inside that helper, not here),
# then show the size of the returned vectorizer's vocabulary.
n_gram_tfidf_vectorizer, n_gram_tfidf_x_train, n_gram_tfidf_x_val = n_gram_tfidf_vectorize(x_train, x_val)
len(n_gram_tfidf_vectorizer.vocabulary_)

382401

In [36]:
# Serialize the n-gram TF-IDF vectorizer with joblib; the call returns the list of written file names.
dump(value=n_gram_tfidf_vectorizer, filename="tokenizers/n_gram_tfidf_vectorizer.pk1")

['tokenizers/n_gram_tfidf_vectorizer.pk1']

In [13]:
# Build character-level TF-IDF features for both splits with the project's
# char_tfidf_vectorize helper, then show the size of the returned vectorizer's vocabulary.
char_tfidf_vectorizer, char_tfidf_x_train, char_tfidf_x_val = char_tfidf_vectorize(x_train, x_val)
len(char_tfidf_vectorizer.vocabulary_)

29291

In [37]:
# Serialize the character-level TF-IDF vectorizer with joblib; the call returns the list of written file names.
dump(value=char_tfidf_vectorizer, filename="tokenizers/char_tfidf_vectorizer.pk1")

['tokenizers/char_tfidf_vectorizer.pk1']

In [12]:
# Convert the string class labels of both splits with the project's label_encoder
# helper; the results are the targets passed to every classifier below.
# NOTE(review): presumably integer class ids fitted on y_train — confirm in data_preprocessing.
integer_y_train, integer_y_val = label_encoder(y_train, y_val)

# Computing Word Embedding Representation

In [13]:
# Token -> column-index mapping of the BoW vectorizer; get_embedding_matrix uses it
# to decide which row each word vector is written to.
# NOTE(review): the resulting matrices are later indexed with TF-IDF column ids as well,
# which assumes the TF-IDF vectorizer shares this exact mapping (both report 52502 entries) — confirm.
vocab = bow_vectorizer.vocabulary_

In [14]:
def get_embedding_matrix(embedding_path, vocab, dim=300):
    """Build a dense embedding matrix aligned with a vectorizer vocabulary.

    Reads a whitespace-separated text embedding file (one ``word v1 ... v_dim``
    entry per line) and copies the vector of every word that occurs in
    ``vocab`` into the row given by that word's vocabulary index. Rows of
    words without a usable vector stay all-zero. The number of matched words
    is printed.

    Parameters
    ----------
    embedding_path : str
        Path to the embedding text file (read as UTF-8).
    vocab : dict
        Mapping ``word -> row index``; indices are used to address rows of a
        ``len(vocab)``-row matrix.
    dim : int, default 300
        Expected vector length. Lines with a different number of values
        (including a ``count dim`` header line) are ignored.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab), dim)``.

    Raises
    ------
    ValueError
        If a line with exactly ``dim`` values contains a non-numeric value.
    """
    matched = 0
    embedding_matrix = np.zeros((len(vocab), dim))
    # Explicit UTF-8: the words are Devanagari and the platform default
    # encoding is not guaranteed to decode them. `with` closes the file even
    # if parsing fails part-way through.
    with open(embedding_path, 'r', encoding='utf-8') as embedding_file:
        for line in embedding_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            word, values = fields[0], fields[1:]
            # Check the length before converting so malformed lines are
            # skipped instead of raising during float conversion.
            if word not in vocab or len(values) != dim:
                continue
            embedding_matrix[vocab[word]] = np.asarray(values, dtype='float32')
            matched += 1
    print(matched)
    return embedding_matrix

In [15]:
def get_sentence_embedding(embedding_matrix, corpus, option='bow'):
    """Turn a sparse document-term matrix into dense sentence vectors.

    Every sentence vector is the sum of the word vectors of its stored terms,
    each multiplied by the term's stored weight in ``corpus``.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        Array of shape ``(n_terms, dim)``; row ``j`` is the vector of the term
        in column ``j`` of ``corpus``.
    corpus : scipy sparse matrix
        Document-term matrix of shape ``(n_sentences, n_terms)``.
    option : {'bow', 'tfidf'}, default 'bow'
        ``'bow'``: the weighted sum is divided by the number of stored entries
        in the row (i.e. distinct terms, not total token count); rows with no
        stored entries stay all-zero.
        ``'tfidf'``: the weighted sum is returned unscaled.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sentences, dim)``.

    Raises
    ------
    ValueError
        If ``option`` is neither ``'bow'`` nor ``'tfidf'``. (The previous
        fallback returned an undefined name and therefore raised NameError.)
    """
    if option not in ('bow', 'tfidf'):
        raise ValueError("Invalid option: %r (expected 'bow' or 'tfidf')" % (option,))

    # One sparse-by-dense product replaces the per-row, per-term Python loop:
    # row i of the result is sum_j corpus[i, j] * embedding_matrix[j].
    csr = corpus.tocsr()
    weighted_sum = np.asarray(csr.dot(embedding_matrix))
    if option == 'tfidf':
        return weighted_sum

    # Stored entries per row, read from the CSR row pointer.
    stored_per_row = np.diff(csr.indptr).reshape(-1, 1)
    # Empty rows already sum to zero; dividing by 1 keeps them zero and
    # avoids a division by zero.
    return weighted_sum / np.maximum(stored_per_row, 1)

# Official Fasttext Marathi Word Embeddings

In [16]:
# Load the official fastText Marathi vectors into a matrix aligned with `vocab`
# and report how long the load took.
# process_time() counts CPU time of this process (not wall-clock time).
start = process_time()
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
embedding_path1 = "/home/eastwind/word-embeddings/fasttext/cc.mr.300.vec"
embedding_matrix1 = get_embedding_matrix(embedding_path1, vocab)
end = process_time()
print("Total time taken: ", end-start)
# Displayed as the cell result: (vocabulary size, vector length).
embedding_matrix1.shape

33286
Total time taken:  29.673056009999996


(52502, 300)

In [18]:
# Sentence vectors from the official fastText matrix, weighted by the count vectors.
ft_bow_x_train = get_sentence_embedding(embedding_matrix1, bow_x_train, option='bow')
ft_bow_x_val = get_sentence_embedding(embedding_matrix1, bow_x_val, option='bow')
# Show both result shapes, one per line.
print(ft_bow_x_train.shape, ft_bow_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


In [20]:
# Sentence vectors from the official fastText matrix, weighted by the TF-IDF vectors.
ft_tfidf_x_train = get_sentence_embedding(embedding_matrix1, tfidf_x_train, option='tfidf')
ft_tfidf_x_val = get_sentence_embedding(embedding_matrix1, tfidf_x_val, option='tfidf')
# Show both result shapes, one per line.
print(ft_tfidf_x_train.shape, ft_tfidf_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


# Ai4Bharat Indic-Fasttext Marathi Word Embeddings

In [21]:
# Load the AI4Bharat IndicNLP fastText Marathi vectors into a matrix aligned with
# `vocab` and report how long the load took.
# process_time() counts CPU time of this process (not wall-clock time).
start = process_time()
# NOTE(review): absolute, machine-specific path — the cell only runs where this file exists.
embedding_path2 = "/home/eastwind/word-embeddings/fasttext/indicnlp.ft.mr.300.vec"
embedding_matrix2 = get_embedding_matrix(embedding_path2, vocab)
end = process_time()
print("Total time taken: ", end-start)
# Displayed as the cell result: (vocabulary size, vector length).
embedding_matrix2.shape

29724
Total time taken:  11.626865700000003


(52502, 300)

In [22]:
# Sentence vectors from the IndicNLP fastText matrix, weighted by the count vectors.
indic_ft_bow_x_train = get_sentence_embedding(embedding_matrix2, bow_x_train, option='bow')
indic_ft_bow_x_val = get_sentence_embedding(embedding_matrix2, bow_x_val, option='bow')
# Show both result shapes, one per line.
print(indic_ft_bow_x_train.shape, indic_ft_bow_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


In [23]:
# Sentence vectors from the IndicNLP fastText matrix, weighted by the TF-IDF vectors.
indic_ft_tfidf_x_train = get_sentence_embedding(embedding_matrix2, tfidf_x_train, option='tfidf')
indic_ft_tfidf_x_val = get_sentence_embedding(embedding_matrix2, tfidf_x_val, option='tfidf')
# Show both result shapes, one per line.
print(indic_ft_tfidf_x_train.shape, indic_ft_tfidf_x_val.shape, sep="\n")

(41997, 300)
(3780, 300)


# Multinomial Naive Bayes

In [18]:
# Naive Bayes on Count Vectors
# Trains on bow_x_train, the training matrix returned by bow_vectorize. The name
# `bow_train` used previously is not assigned in any visible cell, so the cell
# raised NameError on a fresh kernel.
NB_bow, NB_bow_accuracy = ml_classifier_model(MultinomialNB(),
                                              bow_x_train, bow_x_val,
                                              integer_y_train, integer_y_val)
print("Naive Bayes, Count Vectors: ", NB_bow_accuracy)

Naive Bayes, Count Vectors:  0.8666666666666667


In [43]:
# Serialize the Naive Bayes / count-vector model with joblib; returns the written file names.
dump(value=NB_bow, filename="models/NB_bow.pk1")

['models/NB_bow.pk1']

In [19]:
# Multinomial Naive Bayes on the word-level TF-IDF features; the second value
# returned by ml_classifier_model is printed as the validation score.
NB_tfidf, NB_tfidf_accuracy = ml_classifier_model(
    MultinomialNB(),
    tfidf_x_train,
    tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Naive Bayes, TF-IDF Vectors: ", NB_tfidf_accuracy)

Naive Bayes, TF-IDF Vectors:  0.7703703703703704


In [44]:
# Serialize the Naive Bayes / TF-IDF model with joblib; returns the written file names.
dump(value=NB_tfidf, filename="models/NB_tfidf.pk1")

['models/NB_tfidf.pk1']

In [20]:
# Multinomial Naive Bayes on the n-gram TF-IDF features; the second value
# returned by ml_classifier_model is printed as the validation score.
NB_ngram_tfidf, NB_ngram_tfidf_accuracy = ml_classifier_model(
    MultinomialNB(),
    n_gram_tfidf_x_train,
    n_gram_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Naive Bayes, n-gram TF-IDF Vectors: ", NB_ngram_tfidf_accuracy)

Naive Bayes, n-gram TF-IDF Vectors:  0.620899470899471


In [45]:
# Serialize the Naive Bayes / n-gram TF-IDF model with joblib; returns the written file names.
dump(value=NB_ngram_tfidf, filename="models/NB_ngram_tfidf.pk1")

['models/NB_ngram_tfidf.pk1']

In [21]:
# Multinomial Naive Bayes on the character-level TF-IDF features; the second
# value returned by ml_classifier_model is printed as the validation score.
NB_char_tfidf, NB_char_tfidf_accuracy = ml_classifier_model(
    MultinomialNB(),
    char_tfidf_x_train,
    char_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Naive Bayes, character-level TF-IDF Vectors: ", NB_char_tfidf_accuracy)

Naive Bayes, character-level TF-IDF Vectors:  0.7687830687830688


In [46]:
# Serialize the Naive Bayes / character TF-IDF model with joblib; returns the written file names.
dump(value=NB_char_tfidf, filename="models/NB_char_tfidf.pk1")

['models/NB_char_tfidf.pk1']

# Linear SVC

In [22]:
# Linear SVC on Count Vectors
# Trains on bow_x_train, the training matrix returned by bow_vectorize. The name
# `bow_train` used previously is not assigned in any visible cell, so the cell
# raised NameError on a fresh kernel.
LSVC_bow, LSVC_bow_accuracy = ml_classifier_model(LinearSVC(max_iter=2000),
                                                  bow_x_train, bow_x_val,
                                                  integer_y_train, integer_y_val)
print("Linear SVC, Count Vectors: ", LSVC_bow_accuracy)

Linear SVC, Count Vectors:  0.8584656084656085


In [47]:
# Serialize the Linear SVC / count-vector model with joblib; returns the written file names.
dump(value=LSVC_bow, filename="models/LSVC_bow.pk1")

['models/LSVC_bow.pk1']

In [23]:
# Linear SVC (default settings) on the word-level TF-IDF features; the second
# value returned by ml_classifier_model is printed as the validation score.
LSVC_tfidf, LSVC_tfidf_accuracy = ml_classifier_model(
    LinearSVC(),
    tfidf_x_train,
    tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, TF-IDF Vectors: ", LSVC_tfidf_accuracy)

Linear SVC, TF-IDF Vectors:  0.8822751322751323


In [48]:
# Serialize the Linear SVC / TF-IDF model with joblib; returns the written file names.
dump(value=LSVC_tfidf, filename="models/LSVC_tfidf.pk1")

['models/LSVC_tfidf.pk1']

In [24]:
# Linear SVC (default settings) on the n-gram TF-IDF features; the second
# value returned by ml_classifier_model is printed as the validation score.
LSVC_ngram_tfidf, LSVC_ngram_tfidf_accuracy = ml_classifier_model(
    LinearSVC(),
    n_gram_tfidf_x_train,
    n_gram_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, n-gram TF-IDF Vectors: ", LSVC_ngram_tfidf_accuracy)

Linear SVC, n-gram TF-IDF Vectors:  0.873015873015873


In [49]:
# Serialize the Linear SVC / n-gram TF-IDF model with joblib; returns the written file names.
dump(value=LSVC_ngram_tfidf, filename="models/LSVC_ngram_tfidf.pk1")

['models/LSVC_ngram_tfidf.pk1']

In [25]:
# Linear SVC (default settings) on the character-level TF-IDF features; the
# second value returned by ml_classifier_model is printed as the validation score.
LSVC_char_tfidf, LSVC_char_tfidf_accuracy = ml_classifier_model(
    LinearSVC(),
    char_tfidf_x_train,
    char_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, character-level TF-IDF Vectors: ", LSVC_char_tfidf_accuracy)

Linear SVC, character-level TF-IDF Vectors:  0.8888888888888888


In [50]:
# Serialize the Linear SVC / character TF-IDF model with joblib; returns the written file names.
dump(value=LSVC_char_tfidf, filename="models/LSVC_char_tfidf.pk1")

['models/LSVC_char_tfidf.pk1']

In [18]:
# Linear SVC (default settings) on sentence vectors built from the official
# fastText embeddings with count-vector weights.
LSVC_ft_bow, LSVC_ft_bow_accuracy = ml_classifier_model(
    LinearSVC(),
    ft_bow_x_train,
    ft_bow_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, Count Vectors based word embeddings: ", LSVC_ft_bow_accuracy)

Linear SVC, Count Vectors based word embeddings:  0.7362433862433863


In [19]:
# Serialize the Linear SVC / fastText count-weighted model with joblib; returns the written file names.
dump(value=LSVC_ft_bow, filename="models/LSVC_ft_bow.pk1")

['models/LSVC_ft_bow.pk1']

In [21]:
# Linear SVC (max_iter raised to 2000) on sentence vectors built from the
# official fastText embeddings with TF-IDF weights.
LSVC_ft_tfidf, LSVC_ft_tfidf_accuracy = ml_classifier_model(
    LinearSVC(max_iter=2000),
    ft_tfidf_x_train,
    ft_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, TF-IDF based word embeddings: ", LSVC_ft_tfidf_accuracy)

Linear SVC, TF-IDF based word embeddings:  0.7333333333333333


In [22]:
# Serialize the Linear SVC / fastText TF-IDF-weighted model with joblib; returns the written file names.
dump(value=LSVC_ft_tfidf, filename="models/LSVC_ft_tfidf.pk1")

['models/LSVC_ft_tfidf.pk1']

In [24]:
# Linear SVC (default settings) on sentence vectors built from the IndicNLP
# fastText embeddings with count-vector weights.
# NOTE(review): the printed label is identical to the one used for the official
# fastText run, so the two outputs can only be told apart by cell position.
LSVC_indic_ft_bow, LSVC_indic_ft_bow_accuracy = ml_classifier_model(
    LinearSVC(),
    indic_ft_bow_x_train,
    indic_ft_bow_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, Count Vectors based word embeddings: ", LSVC_indic_ft_bow_accuracy)

Linear SVC, Count Vectors based word embeddings:  0.7933862433862434


In [25]:
# Serialize the Linear SVC / IndicNLP count-weighted model with joblib; returns the written file names.
dump(value=LSVC_indic_ft_bow, filename="models/LSVC_indic_ft_bow.pk1")

['models/LSVC_indic_ft_bow.pk1']

In [28]:
# Linear SVC (max_iter raised to 5000) on sentence vectors built from the
# IndicNLP fastText embeddings with TF-IDF weights.
# NOTE(review): the printed label is identical to the one used for the official
# fastText run, so the two outputs can only be told apart by cell position.
LSVC_indic_ft_tfidf, LSVC_indic_ft_tfidf_accuracy = ml_classifier_model(
    LinearSVC(max_iter=5000),
    indic_ft_tfidf_x_train,
    indic_ft_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Linear SVC, TF-IDF based word embeddings: ", LSVC_indic_ft_tfidf_accuracy)

Linear SVC, TF-IDF based word embeddings:  0.776984126984127




In [29]:
# Serialize the Linear SVC / IndicNLP TF-IDF-weighted model with joblib; returns the written file names.
dump(value=LSVC_indic_ft_tfidf, filename="models/LSVC_indic_ft_tfidf.pk1")

['models/LSVC_indic_ft_tfidf.pk1']

# Random Forest

In [30]:
# Random Forest on Count Vectors
# Trains on bow_x_train, the training matrix returned by bow_vectorize. The name
# `bow_train` used previously is not assigned in any visible cell, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): no random_state is set, so the score can differ between runs.
RF_bow, RF_bow_accuracy = ml_classifier_model(RandomForestClassifier(),
                                              bow_x_train, bow_x_val,
                                              integer_y_train, integer_y_val)
print("Random Forest, Count Vectors: ", RF_bow_accuracy)

Random Forest, Count Vectors:  0.7584656084656085


In [51]:
# Serialize the Random Forest / count-vector model with joblib; returns the written file names.
dump(value=RF_bow, filename="models/RF_bow.pk1")

['models/RF_bow.pk1']

In [31]:
# Random Forest (default settings) on the word-level TF-IDF features.
# NOTE(review): no random_state is set, so the score can differ between runs.
RF_tfidf, RF_tfidf_accuracy = ml_classifier_model(
    RandomForestClassifier(),
    tfidf_x_train,
    tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Random Forest, TF-IDF Vectors: ", RF_tfidf_accuracy)

Random Forest, TF-IDF Vectors:  0.7505291005291005


In [52]:
# Serialize the Random Forest / TF-IDF model with joblib; returns the written file names.
dump(value=RF_tfidf, filename="models/RF_tfidf.pk1")

['models/RF_tfidf.pk1']

In [42]:
# Random Forest (default settings) on the n-gram TF-IDF features.
# NOTE(review): no random_state is set, so the score can differ between runs.
RF_ngram_tfidf, RF_ngram_tfidf_accuracy = ml_classifier_model(
    RandomForestClassifier(),
    n_gram_tfidf_x_train,
    n_gram_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Random Forest, n-gram TF-IDF Vectors: ", RF_ngram_tfidf_accuracy)

Random Forest, n-gram TF-IDF Vectors:  0.741005291005291


In [53]:
# Serialize the Random Forest / n-gram TF-IDF model with joblib; returns the written file names.
dump(value=RF_ngram_tfidf, filename="models/RF_ngram_tfidf.pk1")

['models/RF_ngram_tfidf.pk1']

In [58]:
# Random Forest (default settings) on the character-level TF-IDF features.
# NOTE(review): no random_state is set, so the score can differ between runs.
RF_char_tfidf, RF_char_tfidf_accuracy = ml_classifier_model(
    RandomForestClassifier(),
    char_tfidf_x_train,
    char_tfidf_x_val,
    integer_y_train,
    integer_y_val,
)
print("Random Forest, character-level TF-IDF Vectors: ", RF_char_tfidf_accuracy)

Random Forest, character-level TF-IDF Vectors:  0.7425925925925926


In [59]:
# Serialize the Random Forest / character TF-IDF model with joblib; returns the written file names.
dump(value=RF_char_tfidf, filename="models/RF_char_tfidf.pk1")

['models/RF_char_tfidf.pk1']

# K-Nearest Neighbors

In [26]:
# K-Nearest Neighbors on Count Vectors
# Trains on bow_x_train, the training matrix returned by bow_vectorize. The name
# `bow_train` used previously is not assigned in any visible cell, so the cell
# raised NameError on a fresh kernel.
knn_bow, knn_bow_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11),
                                                bow_x_train, bow_x_val,
                                                integer_y_train, integer_y_val)
print("K-Nearest Neighbors, Count Vectors: ", knn_bow_accuracy)

K-Nearest Neighbors, Count Vectors:  0.5005291005291005


In [54]:
# Serialize the KNN / count-vector model with joblib; returns the written file names.
dump(value=knn_bow, filename="models/knn_bow.pk1")

['models/knn_bow.pk1']

In [27]:
# K-Nearest Neighbors on TF-IDF
# The model is KNeighborsClassifier(n_neighbors=11); the comment and the printed
# label previously said "Random Forest" (copy-paste leftover), which mislabelled
# this result in the notebook output.
knn_tfidf, knn_tfidf_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11),
                                                    tfidf_x_train, tfidf_x_val,
                                                    integer_y_train, integer_y_val)
print("K-Nearest Neighbors, TF-IDF Vectors: ", knn_tfidf_accuracy)

Random Forest, TF-IDF Vectors:  0.7825396825396825


In [55]:
# Serialize the KNN / TF-IDF model with joblib; returns the written file names.
dump(value=knn_tfidf, filename="models/knn_tfidf.pk1")

['models/knn_tfidf.pk1']

In [28]:
# K-Nearest Neighbors on n-gram TF-IDF
# The model is KNeighborsClassifier(n_neighbors=11); the comment and the printed
# label previously said "Random Forest" (copy-paste leftover), which mislabelled
# this result in the notebook output.
knn_ngram_tfidf, knn_ngram_tfidf_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11),
                                                                n_gram_tfidf_x_train, n_gram_tfidf_x_val,
                                                                integer_y_train, integer_y_val)
print("K-Nearest Neighbors, n-gram TF-IDF Vectors: ", knn_ngram_tfidf_accuracy)

Random Forest, n-gram TF-IDF Vectors:  0.7412698412698413


In [56]:
# Serialize the KNN / n-gram TF-IDF model with joblib; returns the written file names.
dump(value=knn_ngram_tfidf, filename="models/knn_ngram_tfidf.pk1")

['models/knn_ngram_tfidf.pk1']

In [29]:
# K-Nearest Neighbors on character-level TF-IDF
# The model is KNeighborsClassifier(n_neighbors=11); the comment and the printed
# label previously said "Random Forest" (copy-paste leftover), which mislabelled
# this result in the notebook output.
knn_char_tfidf, knn_char_tfidf_accuracy = ml_classifier_model(KNeighborsClassifier(n_neighbors=11),
                                                              char_tfidf_x_train, char_tfidf_x_val,
                                                              integer_y_train, integer_y_val)
print("K-Nearest Neighbors, character-level TF-IDF Vectors: ", knn_char_tfidf_accuracy)

Random Forest, character-level TF-IDF Vectors:  0.8132275132275132


In [57]:
# Serialize the KNN / character TF-IDF model with joblib; returns the written file names.
dump(value=knn_char_tfidf, filename="models/knn_char_tfidf.pk1")

['models/knn_char_tfidf.pk1']