# Necessary libraries

In [None]:
# Libraries
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import pandas as pd 

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

# Handling Raw Corpus

In [None]:
# load the dataset
import pandas as pd
trainDF = pd.read_csv('data.csv', delimiter='\t')
#trainDF = pd.read_csv('/content/drive/MyDrive/Question Classifier Telugu/Finalized_samples.csv', delimiter=',')
# trainDF = pd.read_csv('/content/drive/MyDrive/Question Classifier Telugu/abs_eval.csv', delimiter=',')

trainDF.head()

Unnamed: 0,text,category
0,“బిట్వీన్ హోప్ అండ్ హిస్టరీ” రచయిత ఎవరు?,PER
1,మ్యాజిక్ మౌంటైన్ రచయిత ఎవరు?,PER
2,ఆఫ్రికాలో మాలికి అత్యంత ప్రసిద్ధ పాలకుడు ఎవరు?,PER
3,దాస్ కాపిటల్ రచయిత ఎవరు?,PER
4,“క్లియర్ లైట్ ఆఫ్ డే” రచయిత ఎవరు?,PER


## Data Splitting into Train and Test

In [None]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(trainDF['text'], trainDF['category'], test_size=0.2, random_state=42)
#print(train_y.value_counts())
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)
print(test_y)
classes = list(encoder.classes_)
print(len(classes))
print(train_y)

[5 5 5 ... 3 5 5]
9
[1 3 5 ... 5 4 4]


# Feature Engineering: Counter Vectorizer

In [None]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.fit_transform(train_x)
#print(xtrain_count)
xvalid_count =  count_vect.transform(test_x)
#print(xvalid_count)
xtrain_count.shape, xvalid_count.shape

((25713, 3164), (6429, 3164))

# Feature Engineering: TF-IDF Vectorizer

In [None]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern = r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(test_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=25000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=25000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x) 

# Model Traning

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    
    acc = metrics.accuracy_score(predictions, test_y)
    f1 = metrics.f1_score(predictions, test_y, average='weighted')
    #print(classification_report(predictions, test_y, target_names = list(encoder.classes_)))
    return acc, f1



## Logistic Regression

In [None]:
# Linear Classifier on Count Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(solver='sag',multi_class='multinomial', max_iter=25000), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy, f1_score)

# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(multi_class='multinomial', solver='sag', max_iter=25000), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy, f1_score)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(multi_class='multinomial', solver='sag', max_iter=25000), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy, f1_score)

# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1_score = train_model(linear_model.LogisticRegression(multi_class='multinomial', solver='sag', max_iter=25000), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy, f1_score)

LR, Count Vectors:  0.9044952558718308 0.9064536356046402
LR, WordLevel TF-IDF:  0.8962513610203764 0.9015063886574495
LR, N-Gram Vectors:  0.8172344065951159 0.8424313889512708
LR, CharLevel Vectors:  0.9380930160211541 0.9407068103873125


## Naive Bayes Classifier

In [None]:
# Naive Bayes on Count Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy, f1_score)


# Naive Bayes on Word Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy, f1_score)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy, f1_score)

# Naive Bayes on Character Level TF IDF Vectors
accuracy, f1_score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy, f1_score)

NB, Count Vectors:  0.862653600871053 0.8733350496568999
NB, WordLevel TF-IDF:  0.7553274226162701 0.8076178311156097
NB, N-Gram Vectors:  0.7987245294758127 0.8332640889824355
NB, CharLevel Vectors:  0.8183232228962514 0.859616293933658


## Support Vector Machine

In [None]:
# SVM on Count Vectors
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print("SVM, Count Vectors: ", accuracy)

# SVM on TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, TF-IDF Vectors: ", accuracy)

# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, Ngram Level Vectors: ", accuracy)

# SVM on Character Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("SVM, Character Level Vectors: ", accuracy)

SVM, Count Vectors:  (0.9007621714107948, 0.905440544206523)
SVM, TF-IDF Vectors:  (0.9188054129724685, 0.9219383932268962)
SVM, Ngram Level Vectors:  (0.8425882718929849, 0.8608087228714503)
SVM, Character Level Vectors:  (0.9499144501477679, 0.9514699362934937)


## Random Forest Classifier

In [None]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10, min_samples_split=2, n_jobs=-1), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)


# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10, min_samples_split=2, n_jobs=-1), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("RF, NgramLevel TF-IDF: ", accuracy)


# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=10, min_samples_split=2, n_jobs=-1), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("RF, Character Level TF-IDF: ", accuracy)

RF, Count Vectors:  (0.9096282470057552, 0.9170521977322891)
RF, WordLevel TF-IDF:  (0.8830300202208742, 0.8920590405157305)
RF, NgramLevel TF-IDF:  (0.8436770881941204, 0.8483803088259527)
RF, Character Level TF-IDF:  (0.9181832322289625, 0.9224476057741355)


## Boosting Model

In [None]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  (0.8881630113547986, 0.8919517013767069)
Xgb, WordLevel TF-IDF:  (0.8909628247005755, 0.8949147861086001)
Xgb, CharLevel Vectors:  (0.9321822989578472, 0.934191575086161)


## MLP Classifier

In [None]:
# MLP with Counter Vectors
accuracy = train_model(MLPClassifier(random_state=1, max_iter=300), xtrain_count, train_y, xvalid_count)
print("MLP, Counter Vectors: ", accuracy)

# MLP with TF-IDF Word Level Vectors
accuracy = train_model(MLPClassifier(random_state=1, max_iter=300), xtrain_tfidf, train_y, xvalid_tfidf)
print("MLP, TF-IDF Word Level: ", accuracy)

# MLP with TF-IDF N-gram Level Vectors
accuracy = train_model(MLPClassifier(random_state=1, max_iter=300), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("MLP, TF-IDF Ngram Level: ", accuracy)

# MLP with TF-IDF Character Level Vectors
accuracy = train_model(MLPClassifier(random_state=1, max_iter=300), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("MLP, TF-IDF Character Level: ", accuracy)

MLP, Counter Vectors:  (0.9083838855187432, 0.9100247845389835)
MLP, TF-IDF Word Level:  (0.8970290869497589, 0.8981659691329391)
MLP, TF-IDF Ngram Level:  (0.8547207963913517, 0.8569133238767289)
MLP, TF-IDF Character Level:  (0.9491367242183855, 0.949851192890966)
