In [1]:
from Data_Preprocess import Data_Preprocess
from model import ml_classifier_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Shared preprocessing helper: the loading, vectorising and label-encoding
# cells in this notebook all call its methods through `dp`.
dp = Data_Preprocess()

# Loading Training Data

In [3]:
# Read the training split through the project helper.
training_path = "dataset/marathi-training-data.tsv"
train_data = dp.read_data(training_path)
# Bare last expression so the notebook renders the frame (text and label columns).
train_data

Unnamed: 0,text,label
0,"प्रा . प्रताप हरिदास : होय , मला वाटते की हा ए...",com_tech
1,"तर , विशिष्ट गोष्टींद्वारे , ठराविक कायद्यांद्...",bioche
2,- - - - - - - - - - - - - - - - - - - - - - - ...,cse
3,"तर , आपला अर्धा चिन्ह 9 वाजता असेल .",phy
4,"म्हणून , मी असे म्हणालो की जर शेकडो , हजारो कि...",phy
...,...,...
41992,"जरी आपण डेटा कूटबद्ध केला , तरीही हा मुख्य व्य...",cse
41993,"ते म्हणतात - "" ज्याला पाहण्यासाठी डोळे , ऎकण्य...",com_tech
41994,"प्रथम क्रोनोलॉजिकल , क्रॉनोलॉजी म्हणजे आपल्याल...",com_tech
41995,"त्या थोड्या तपशीलावर येईल , जेणेकरून संपूर्ण ग...",bioche


In [4]:
# Class balance of the training split: number of rows per label.
train_data["label"].value_counts()

com_tech    17995
phy          9656
cse          9344
bioche       5002
Name: label, dtype: int64

In [4]:
# Pull the text and label columns out of the training frame as plain Python lists.
x_train = train_data["text"].values.tolist()
y_train = train_data["label"].values.tolist()

# Sanity check: both lists must be the same length (one count printed per line).
print(len(x_train), len(y_train), sep="\n")

41997
41997


# Loading Validation Data

In [5]:
# Read the validation split with the same helper used for the training data.
val_path = "dataset/marathi-validation-data.tsv"
val_data = dp.read_data(val_path)
# Bare last expression so the notebook renders the frame (text and label columns).
val_data

Unnamed: 0,text,label
0,1 ची ओळ .,cse
1,"तर , ही एक टॉवर आहे जी टॉवरवर निश्चित केली जात...",phy
2,"तर , थ्रेडच्या परतीच्या स्थितीस पास करण्यासाठी...",cse
3,आपण लोक शोधत आहात जे आपल्यासाठी काहीतरी करू शक...,com_tech
4,लिनक्स कर्नल अनुसूचीतकरणामध्ये अशी तंत्र असण्य...,cse
...,...,...
3775,"नंतर वृद्धत्व , व्हॉट मोठ्या प्रमाणात फ्रॉन 12...",bioche
3776,आणि मग सुद्धा आपल्याला काही सेकंदांनंतर माहित ...,phy
3777,"तर , आपण ELF शीर्षलेखासह प्रारंभ करू .",cse
3778,"तर , त्या क्रॉस्टची जाडी आहे .",phy


In [7]:
# Class balance of the validation split: number of rows per label.
val_data["label"].value_counts()

com_tech    1505
phy          970
cse          885
bioche       420
Name: label, dtype: int64

In [6]:
# Pull the text and label columns out of the validation frame as plain Python lists.
x_val = val_data["text"].values.tolist()
y_val = val_data["label"].values.tolist()

# Sanity check: both lists must be the same length (one count printed per line).
print(len(x_val), len(y_val), sep="\n")

3780
3780


# Preprocessing the x and y data

In [7]:
# Build four feature representations of the same texts. Each helper receives
# the training and validation texts and returns a (train, validation) pair.
# NOTE(review): presumably each vectorizer is fitted on x_train only and then
# applied to x_val -- confirm in Data_Preprocess.
# NOTE(review): `bow_train` does not follow the `<kind>_x_train` naming of the
# other three training matrices.
bow_train, bow_x_val = dp.bow_vectorize(x_train, x_val)
tfidf_x_train, tfidf_x_val = dp.tfidf_vectorize(x_train, x_val)
n_gram_tfidf_x_train, n_gram_tfidf_x_val = dp.n_gram_tfidf_vectorize(x_train, x_val)
char_tfidf_x_train, char_tfidf_x_val = dp.char_tfidf_vectorize(x_train, x_val)

In [8]:
# Dimensions of the count-vector training matrix; the first entry should
# equal len(x_train).
bow_train.shape

(41997, 5947)

In [26]:
# Vocabulary check: count the distinct unigram keys found in the training texts.
unigrams = dp.get_unigrams(x_train)
print()  # blank line, kept so the cell's printed output is unchanged
dicty = dp.counter_to_dict(unigrams)

# Iterating `dicty` yields the same keys the explicit loop used to add one by one.
uni = set(dicty)
print(len(uni))


24153


In [None]:
# Encode the string labels of both splits; the integer_* results are the
# targets passed to every classifier cell below.
# NOTE(review): presumably the encoder is fitted on y_train and reused for
# y_val -- confirm in Data_Preprocess.label_encoder.
integer_y_train, integer_y_val = dp.label_encoder(y_train, y_val)

# Multinomial Naive Bayes

In [None]:
# Multinomial Naive Bayes fed with the count-vector features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
NB_bow, NB_bow_accuracy = ml_classifier_model(
    MultinomialNB(),
    bow_train, bow_x_val,
    integer_y_train, integer_y_val,
)
print("Naive Bayes, Count Vectors: ", NB_bow_accuracy)

In [None]:
# Multinomial Naive Bayes fed with the word-level TF-IDF features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
NB_tfidf, NB_tfidf_accuracy = ml_classifier_model(
    MultinomialNB(),
    tfidf_x_train, tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Naive Bayes, TF-IDF Vectors: ", NB_tfidf_accuracy)

In [None]:
# Multinomial Naive Bayes fed with the n-gram TF-IDF features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
NB_ngram_tfidf, NB_ngram_tfidf_accuracy = ml_classifier_model(
    MultinomialNB(),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Naive Bayes, n-gram TF-IDF Vectors: ", NB_ngram_tfidf_accuracy)

In [None]:
# Multinomial Naive Bayes fed with the character-level TF-IDF features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
NB_char_tfidf, NB_char_tfidf_accuracy = ml_classifier_model(
    MultinomialNB(),
    char_tfidf_x_train, char_tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Naive Bayes, character-level TF-IDF Vectors: ", NB_char_tfidf_accuracy)

# Linear SVC

In [None]:
# Linear SVC fed with the count-vector features.
# NOTE(review): only this Linear SVC cell raises max_iter (to 6000); the
# other three keep the estimator default -- presumably to avoid a convergence
# warning on raw counts, confirm.
LSVC_bow, LSVC_bow_accuracy = ml_classifier_model(
    LinearSVC(max_iter=6000),
    bow_train, bow_x_val,
    integer_y_train, integer_y_val,
)
print("Linear SVC, Count Vectors: ", LSVC_bow_accuracy)

In [None]:
# Linear SVC (default settings) fed with the word-level TF-IDF features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
LSVC_tfidf, LSVC_tfidf_accuracy = ml_classifier_model(
    LinearSVC(),
    tfidf_x_train, tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Linear SVC, TF-IDF Vectors: ", LSVC_tfidf_accuracy)

In [None]:
# Linear SVC (default settings) fed with the n-gram TF-IDF features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
LSVC_ngram_tfidf, LSVC_ngram_tfidf_accuracy = ml_classifier_model(
    LinearSVC(),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Linear SVC, n-gram TF-IDF Vectors: ", LSVC_ngram_tfidf_accuracy)

In [None]:
# Linear SVC (default settings) fed with the character-level TF-IDF features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
LSVC_char_tfidf, LSVC_char_tfidf_accuracy = ml_classifier_model(
    LinearSVC(),
    char_tfidf_x_train, char_tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Linear SVC, character-level TF-IDF Vectors: ", LSVC_char_tfidf_accuracy)

# Random Forest

In [None]:
# Random Forest on Count Vectors
# random_state pins the forest's internal randomness so the reported accuracy
# is reproducible after a kernel restart and comparable between feature sets.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
RF_bow, RF_bow_accuracy = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    bow_train, bow_x_val,
    integer_y_train, integer_y_val,
)
print("Random Forest, Count Vectors: ", RF_bow_accuracy)

In [None]:
# Random Forest on TF-IDF
# random_state pins the forest's internal randomness so the reported accuracy
# is reproducible after a kernel restart and comparable between feature sets.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
RF_tfidf, RF_tfidf_accuracy = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    tfidf_x_train, tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Random Forest, TF-IDF Vectors: ", RF_tfidf_accuracy)

In [None]:
# Random Forest on n-gram TF-IDF
# random_state pins the forest's internal randomness so the reported accuracy
# is reproducible after a kernel restart and comparable between feature sets.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
RF_ngram_tfidf, RF_ngram_tfidf_accuracy = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Random Forest, n-gram TF-IDF Vectors: ", RF_ngram_tfidf_accuracy)

In [None]:
# Random Forest on character-level TF-IDF
# random_state pins the forest's internal randomness so the reported accuracy
# is reproducible after a kernel restart and comparable between feature sets.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
RF_char_tfidf, RF_char_tfidf_accuracy = ml_classifier_model(
    RandomForestClassifier(random_state=42),
    char_tfidf_x_train, char_tfidf_x_val,
    integer_y_train, integer_y_val,
)
print("Random Forest, character-level TF-IDF Vectors: ", RF_char_tfidf_accuracy)

# K-Nearest Neighbors

In [None]:
# K-Nearest Neighbors (n_neighbors=11) fed with the count-vector features.
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
knn_bow, knn_bow_accuracy = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    bow_train, bow_x_val,
    integer_y_train, integer_y_val,
)
print("K-Nearest Neighbors, Count Vectors: ", knn_bow_accuracy)

In [None]:
# K-Nearest Neighbors (n_neighbors=11) on TF-IDF
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
knn_tfidf, knn_tfidf_accuracy = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    tfidf_x_train, tfidf_x_val,
    integer_y_train, integer_y_val,
)
# The label names the estimator actually used in this cell (KNN); it had been
# copied from the Random Forest section and mislabelled the result.
print("K-Nearest Neighbors, TF-IDF Vectors: ", knn_tfidf_accuracy)

In [None]:
# K-Nearest Neighbors (n_neighbors=11) on n-gram TF-IDF
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
knn_ngram_tfidf, knn_ngram_tfidf_accuracy = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    n_gram_tfidf_x_train, n_gram_tfidf_x_val,
    integer_y_train, integer_y_val,
)
# The label names the estimator actually used in this cell (KNN); it had been
# copied from the Random Forest section and mislabelled the result.
print("K-Nearest Neighbors, n-gram TF-IDF Vectors: ", knn_ngram_tfidf_accuracy)

In [None]:
# K-Nearest Neighbors (n_neighbors=11) on character-level TF-IDF
# ml_classifier_model hands back a pair; the second item is reported as accuracy.
knn_char_tfidf, knn_char_tfidf_accuracy = ml_classifier_model(
    KNeighborsClassifier(n_neighbors=11),
    char_tfidf_x_train, char_tfidf_x_val,
    integer_y_train, integer_y_val,
)
# The label names the estimator actually used in this cell (KNN); it had been
# copied from the Random Forest section and mislabelled the result.
print("K-Nearest Neighbors, character-level TF-IDF Vectors: ", knn_char_tfidf_accuracy)