In [4]:
#Import all necessary libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import pickle

In [5]:
import warnings
warnings.simplefilter("ignore")

In [6]:
#Read the training and testing data
faq_train = pd.read_excel("faq.xlsx")
faq_test = pd.read_excel("faq_test.xlsx")

In [7]:
#Find dimensions of data
print("Training data shape: ", faq_train.shape)
print("Testing data shape: ", faq_test.shape)

Training data shape:  (12500, 2)
Testing data shape:  (424, 2)


In [8]:
#Displaying head of datasets
print("Traning Data")
print(faq_train.head())
print("Testing data")
print(faq_test.head())

Traning Data
                                      Question Title  Remarks
0  I have one chanakya  since 1860-2010 income ta...      FAQ
1                                             325000  Not FAQ
2                                         Back cover  Not FAQ
3  à¤à¥à¤¯à¤¾ pulverizer à¤®à¤¶à¥à¤¨ à¤¸à¥ à¤...      FAQ
4                                         Ikkat silk  Not FAQ
Testing data
                                      Question Title  Remarks
0  I want to Armature for philips food processor ...  Not FAQ
1                     Minimum how many KGS purchased  Not FAQ
2                     Minimum how many KGS purchased  Not FAQ
3                                        baby rabbit  Not FAQ
4    I am 6 foot 2 and I want a metal bed. Possible?  Not FAQ


In [10]:
#Preprocess/clean the data

In [11]:
#1. Remove numbers
faq_train['cleanQT'] = faq_train['Question Title'].str.replace("[0-9]"," ")
faq_test['cleanQT'] = faq_test['Question Title'].str.replace("[0-9]"," ")

In [12]:
#2. Remove special symbols
special_symbols = '!@#$%^&*()_-+=[]\{}|;",.<>/?~:\"'
faq_train['cleanQT'] = faq_train['cleanQT'].apply(lambda rss: ''.join(ch for ch in rss if ch not in set(special_symbols)))
faq_test['cleanQT'] = faq_test['cleanQT'].apply(lambda rss: ''.join(ch for ch in rss if ch not in set(special_symbols)))

In [13]:
#3. Convert all characters to lowercase
faq_train['cleanQT'] = faq_train['cleanQT'].str.lower()
faq_test['cleanQT'] = faq_test['cleanQT'].str.lower()

In [14]:
#4. Remove white spaces
faq_test['cleanQT'] = faq_test['cleanQT'].apply(lambda rws: ' '.join(rws.split()))

In [15]:
#Analyzing differences before/after preprocessing data
print("Training data")
print(faq_train.sample(5))
print("Testing data")
print(faq_test.sample(5))

Training data
                                          Question Title  Remarks  \
12247          What is Electro Mechanical Contact Coder?      FAQ   
1402                                         jcb  bh16sp  Not FAQ   
9593   What is the price of electrical circuit breake...      FAQ   
506            Looking Fruit Juice Manufacturer or Plant  Not FAQ   
1608        What is the best price of Fragrance Perfume?      FAQ   

                                                 cleanQT  
12247           what is electro mechanical contact coder  
1402                                         jcb  bh  sp  
9593   what is the price of electrical circuit breake...  
506            looking fruit juice manufacturer or plant  
1608         what is the best price of fragrance perfume  
Testing data
                                     Question Title  Remarks  \
100  5 inch kitchan west coupling will be availabal  Not FAQ   
226                        goldikapasbijbhav janavo  Not FAQ   
215        D

In [16]:
#Preparing ELMo vectors

In [17]:
#Importing libraries for ELMo
import tensorflow_hub as hub
import tensorflow as tf

In [18]:
#Loading the ELMo module
tf.compat.v1.disable_eager_execution()
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable = True)

W0605 11:06:41.314031  4844 deprecation.py:323] From C:\Users\Imart\Anaconda3\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [19]:
#Defining function for creating ELMo vectors
def elmo_vectors(text):
    embeddings = elmo(text.tolist(),signature = "default", as_dict = True)["elmo"]
    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())
        session.run(tf.compat.v1.tables_initializer())
        return session.run(tf.reduce_mean(embeddings,1))

In [20]:
#Splitting dataset into batches for better computation
elmo_start_time = time.time()
faq_train_list = [faq_train[i:i+100] for i in range(0,faq_train.shape[0],100)]
faq_test_list = [faq_test[i:i+100] for i in range(0,faq_test.shape[0],100)]

In [None]:
#Extracting ELMo vectors
elmo_extraction_start_time = time.time()
faq_elmo_train = [elmo_vectors(x['cleanQT']) for x in faq_train_list]
faq_elmo_test = [elmo_vectors(x['cleanQT']) for x in faq_test_list]
elmo_extraction_end_time = time.time()
print("Total extraction time for ELMo vectors: {} seconds".format(elmo_extraction_end_time - elmo_extraction_start_time))

In [None]:
#Checking dimensions of ELMo vectors
print("Training: ",len(faq_elmo_train))
print("Testing: ",len(faq_elmo_test))

In [None]:
#Concatenatening all batches
elmo_concat_start_time = time.time()
elmo_faq_train = np.concatenate(faq_elmo_train, axis = 0)
elmo_faq_test = np.concatenate(faq_elmo_test, axis = 0)
elmo_end_time = elmo_concat_end_time = time.time()
print("Total concatenation time: {} seconds".format(elmo_concat_end_time - elmo_concat_start_time))
print("Total time for ELMo vector extraction: {} seconds".format(elmo_end_time - elmo_start_time))

In [None]:
#Saving output to pickle file
pickle_out_train = open("elmo_faq_train_04062019.pickle","wb")
pickle_out_test = open("elmo_faq_test_04062019.pickle","wb")
pickle.dump(elmo_faq_train, pickle_out_train)
pickle.dump(elmo_faq_test,pickle_out_test)
pickle_out_train.close()
pickle_out_test.close()

In [25]:
#Loading ELMo vectors pickle file
pickle_in_train = open("elmo_faq_train_04062019.pickle","rb")
pickle_in_test = open("elmo_faq_test_04062019.pickle","rb")
elmo_faq_train = pickle.load(pickle_in_train)
elmo_faq_test = pickle.load(pickle_in_test)

In [26]:
#Checking shape of concatenated files
print("Training: ",elmo_faq_train.shape)
print("Testing: ",elmo_faq_test.shape)

Training:  (12500, 1024)
Testing:  (424, 1024)


In [None]:
#Model Building

In [41]:
#Defining training and validation sets
xtrain = pd.DataFrame(elmo_faq_train)
ytrain = faq_train['Remarks']
xvalid = pd.DataFrame(elmo_faq_test)
yvalid = faq_test['Remarks']

In [70]:
#Buidling a logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
lr_start_time = time.time()
regressor = LogisticRegression()
regressor.fit(xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [71]:
#Predicting on the validation set
pred_val_lr = regressor.predict(xvalid)
lr_end_time = time.time()
print("Total time spent on LR: {} seconds".format(lr_end_time - lr_start_time))

Total time spent on LR: 3.3331825733184814 seconds


In [72]:
#Evaluation of Logistic Regression model
print("Logistic Regression")
print("Precision: ",precision_score(yvalid,pred_val_lr,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_lr,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_lr,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_lr))

Logistic Regression
Precision:  0.8173913043478261
Recall:  0.7899159663865546
F1 Score:  0.8034188034188035
MCC:  0.7287335379525473


In [73]:
#Building a Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
nb_start_time = time.time()
nbclassifier = GaussianNB()
nbclassifier.fit(xtrain,ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

In [74]:
#Predicting on the validation set
pred_val_nb = nbclassifier.predict(xvalid)
nb_end_time = time.time()
print("Total time spent on NB: {} seconds".format(nb_end_time - nb_start_time))

Total time spent on NB: 0.8977758884429932 seconds


In [75]:
#Evaluation of Naive Bayes Classifier
print("Naive Bayes")
print("Precision: ",precision_score(yvalid,pred_val_nb,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_nb,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_nb,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_nb))

Naive Bayes
Precision:  0.7333333333333333
Recall:  0.3697478991596639
F1 Score:  0.4916201117318435
MCC:  0.4090269641657418


In [76]:
#Creating a Linear SVM (SGD) Classifier
from sklearn.linear_model import SGDClassifier
sgd_start_time = time.time()
sgdclassifier = SGDClassifier(random_state = 1)
sgdclassifier.fit(xtrain,ytrain)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=1, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [77]:
#Predicting on the validation set
pred_val_sgd = sgdclassifier.predict(xvalid)
sgd_end_time = time.time()
print("Total time spent on SVM (SGD): {} seconds".format(sgd_end_time - sgd_start_time))

Total time spent on SVM (SGD): 0.7104437351226807 seconds


In [78]:
#Evaluation of Linear SVM (SGD) Classifier
print("Linear SVM (SGD)")
print("Precision: ",precision_score(yvalid,pred_val_sgd,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_sgd,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_sgd,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_sgd))

Linear SVM (SGD)
Precision:  0.7886178861788617
Recall:  0.8151260504201681
F1 Score:  0.8016528925619835
MCC:  0.722668539689978


In [79]:
#Buidling a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc_start_time = time.time()
rfc = RandomForestClassifier(random_state = 1)
rfc.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [80]:
#Predicting on the validation set
pred_val_rfc = rfc.predict(xvalid)
rfc_end_time = time.time()
print("Total time spent on RFC: {} seconds".format(rfc_end_time - rfc_start_time))

Total time spent on RFC: 3.885451555252075 seconds


In [81]:
#Evaluation of Random Forest Classifier
print("RFC")
print("Precision: ",precision_score(yvalid,pred_val_rfc,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_rfc,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_rfc,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_rfc))

RFC
Precision:  0.6496350364963503
Recall:  0.7478991596638656
F1 Score:  0.6953125
MCC:  0.5673595005986476


In [82]:
#Buidling a K Nearest Neighbours Classifer
from sklearn.neighbors import KNeighborsClassifier
knc_start_time = time.time()
knc = KNeighborsClassifier()
knc.fit(xtrain,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [83]:
#Predicting on the validation set
pred_val_knc = knc.predict(xvalid)
knc_end_time = time.time()
print("Total time spent on KNC: {} seconds".format(knc_end_time - knc_start_time))

Total time spent on KNC: 8.525726556777954 seconds


In [84]:
#Evaluation of K Nearest Neighbours Classifer
print("KNC")
print("Precision: ",precision_score(yvalid,pred_val_knc,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_knc,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_knc,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_knc))

KNC
Precision:  0.7
Recall:  0.7058823529411765
F1 Score:  0.7029288702928871
MCC:  0.5863570989726369


In [98]:
#Buidling a Gaussian Process Classifier
from sklearn.gaussian_process import GaussianProcessClassifier
gpc_start_time = time.time()
gpc = GaussianProcessClassifier(random_state = 1)
gpc.fit(xtrain,ytrain)

GaussianProcessClassifier(copy_X_train=True, kernel=None,
             max_iter_predict=100, multi_class='one_vs_rest', n_jobs=None,
             n_restarts_optimizer=0, optimizer='fmin_l_bfgs_b',
             random_state=1, warm_start=False)

In [99]:
#Predicting on the validation set
pred_val_gpc = gpc.predict(xvalid)
gpc_end_time = time.time()
print("Total time spent on GPC: {} seconds".format(gpc_end_time - gpc_start_time))

Total time spent on GPC: 431.55591440200806 seconds


In [100]:
#Evaluation of Gaussian Processs Classifier
print("GPC")
print("Precision: ",precision_score(yvalid,pred_val_gpc,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_gpc,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_gpc,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_gpc))

GPC
Precision:  0.7545454545454545
Recall:  0.6974789915966386
F1 Score:  0.7248908296943231
MCC:  0.6242334078185484


In [85]:
#Buidling a Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc_start_time = time.time()
dtc = DecisionTreeClassifier(random_state = 1)
dtc.fit(xtrain,ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [86]:
#Predicting on the validation set
pred_val_dtc = dtc.predict(xvalid)
dtc_end_time = time.time()
print("Total time spent on GPC: {} seconds".format(dtc_end_time - dtc_start_time))

Total time spent on GPC: 27.65597653388977 seconds


In [87]:
#Evaluation of Decision Tree Classifier
print("DTC")
print("Precision: ",precision_score(yvalid,pred_val_dtc,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_dtc,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_dtc,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_dtc))

DTC
Precision:  0.49230769230769234
Recall:  0.5378151260504201
F1 Score:  0.5140562248995985
MCC:  0.31322270106457617


In [92]:
#Buidling a MLP Classifier
from sklearn.neural_network import MLPClassifier
mlp_start_time = time.time()
mlp = MLPClassifier(random_state = 1)
mlp.fit(xtrain,ytrain)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [93]:
#Predicting on the validation set
pred_val_mlp = mlp.predict(xvalid)
mlp_end_time = time.time()
print("Total time spent on MLP: {} seconds".format(mlp_end_time - mlp_start_time))

Total time spent on MLP: 81.99546360969543 seconds


In [94]:
#Evaluation of MLP Classifier
print("MLP")
print("Precision: ",precision_score(yvalid,pred_val_mlp,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_mlp,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_mlp,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_mlp))

MLP
Precision:  0.7027027027027027
Recall:  0.8739495798319328
F1 Score:  0.7790262172284644
MCC:  0.6878199687942644


In [95]:
#Buidling a AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier
adab_start_time = time.time()
adab = AdaBoostClassifier(random_state = 1)
adab.fit(xtrain,ytrain)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=1)

In [96]:
#Predicting on the validation set
pred_val_adab = adab.predict(xvalid)
adab_end_time = time.time()
print("Total time spent on AdaBoost: {} seconds".format(adab_end_time - adab_start_time))

Total time spent on AdaBoost: 68.39412331581116 seconds


In [97]:
#Evaluation of AdaBoost Classifier
print("AdaBoost")
print("Precision: ",precision_score(yvalid,pred_val_adab,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_adab,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_adab,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_adab))

AdaBoost
Precision:  0.6611570247933884
Recall:  0.6722689075630253
F1 Score:  0.6666666666666667
MCC:  0.5351366082006004


In [88]:
#Buidling a QDA Classifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda_start_time = time.time()
qda = QuadraticDiscriminantAnalysis()
qda.fit(xtrain,ytrain)

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001)

In [89]:
#Predicting on the validation set
pred_val_qda = qda.predict(xvalid)
qda_end_time = time.time()
print("Total time spent on QDA: {} seconds".format(qda_end_time - qda_start_time))

Total time spent on QDA: 3.2382919788360596 seconds


In [90]:
#Evaluation of QDA Classifier
print("QDA")
print("Precision: ",precision_score(yvalid,pred_val_qda,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_qda,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_qda,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_qda))

QDA
Precision:  0.5163043478260869
Recall:  0.7983193277310925
F1 Score:  0.6270627062706271
MCC:  0.4592002106490289


In [None]:
end_time = time.time()
print("Total time of project execution: {} seconds".format(end_time - start_time))

In [None]:
import numpy as np
for i in ['hinge', 'log']:
    for j in ['l2', 'elasticnet']:
        for k in np.arange(0.0001,0.0010,0.0001):
            sgd_start_time = time.time()
            sgdclassifier = SGDClassifier(loss=i,penalty=j,alpha=k,random_state=1)
            sgdclassifier.fit(xtrain,ytrain)
            pred_val_sgd = sgdclassifier.predict(xvalid)
            sgd_end_time = time.time()
            print("Total time spent on SVM (SGD): {} seconds".format(sgd_end_time - sgd_start_time))
            print("Linear SVM (SGD) with ",i,j,k)
            print("Precision: ",precision_score(yvalid,pred_val_sgd,pos_label='FAQ'))
            print("Recall: ",recall_score(yvalid,pred_val_sgd,pos_label='FAQ'))
            print("F1 Score: ",f1_score(yvalid, pred_val_sgd,pos_label='FAQ'))
            print("MCC: ",matthews_corrcoef(yvalid, pred_val_sgd))
            print()

In [None]:
sgd_start_time = time.time()
sgdclassifier = SGDClassifier(loss='hinge',penalty='elasticnet',max_iter=2000,random_state=1)
sgdclassifier.fit(xtrain,ytrain)
pred_val_sgd = sgdclassifier.predict(xvalid)
sgd_end_time = time.time()
print("Total time spent on SVM (SGD): {} seconds".format(sgd_end_time - sgd_start_time))
print("Linear SVM (SGD) with ",i,j,k)
print("Precision: ",precision_score(yvalid,pred_val_sgd,pos_label='FAQ'))
print("Recall: ",recall_score(yvalid,pred_val_sgd,pos_label='FAQ'))
print("F1 Score: ",f1_score(yvalid, pred_val_sgd,pos_label='FAQ'))
print("MCC: ",matthews_corrcoef(yvalid, pred_val_sgd))
print()