# [Quora Answer Classifier](https://www.quora.com/challenges#answer_classifier) 

Created on Wed Aug 10 12:35:52 2016
@author: Aamir

In [1]:
import numpy as np
from sklearn import svm, preprocessing
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

### Read input data from local disk and separate the training & testing data. We also separate the training labels at this stage.

In [2]:
train, train_label, test = [], [], []

with open("input00.txt", "r") as input_file:
    data = input_file.readlines()
    
train_count = int(data[0].split()[0])
test_count = int(data[train_count+1])

for indx in xrange(1, train_count+1):
    temp = data[indx].split()
    train_label.append(int(temp[1]))
    train.append([float(val.split(":")[1]) for val in temp[2:]])

for indx in xrange(train_count+2, train_count+test_count+2):
    temp = data[indx].split()
    test.append([float(val.split(":")[1]) for val in temp[1:]])

In [3]:
#print "Training data - ", train[:5]
#print "Training labels - ", train_label[:5]
#print "Testing data - ", test[:5]

### Read the target labels

In [4]:
with open("output00.txt","r") as output_file:
    output_data = output_file.readlines()
test_labels = [int(row.split()[1]) for row in output_data]

### The data format looks fine, we use feature scaling for better performance while training our classifiers

In [5]:
train_scaled, test_scaled = preprocessing.scale(np.asarray(train)), preprocessing.scale(np.asarray(test))
#print "Training data - ", train_scaled[:5]
#print "Testing data - ", test_scaled[:5]




### Support Vector Classifier

In [6]:
svc = svm.SVC(C=0.8, kernel='linear', decision_function_shape="ovo")
scores = cross_validation.cross_val_score(svc, train_scaled, train_label, cv=10, scoring='f1_weighted')
print("Cross Validation F-1: {} (+/- {})".format(round(scores.mean(),2), round(scores.std() * 2,2)))
svc.fit(train_scaled, train_label)
predict_labels = svc.predict(test_scaled)
print "\nClassification report - \n", classification_report(test_labels, predict_labels)

Cross Validation F-1: 0.82 (+/- 0.03)

Classification report - 
             precision    recall  f1-score   support

         -1       0.82      0.79      0.81       250
          1       0.80      0.83      0.81       250

avg / total       0.81      0.81      0.81       500



### Hackerrank Score for SVM - 0.0 as it didn't pass both test cases. For test case 1 it results in time out error, probably due to a larger dataset.

### Let's tr Logistic Regression

In [10]:
logistic = LogisticRegression(C=1, penalty='l2', solver='liblinear')
logistic.fit(train_scaled, train_label)
scores = cross_validation.cross_val_score(logistic, train_scaled, train_label, cv=10, scoring='f1_weighted')
print("Cross Validation F-1: {} (+/- {})".format(round(scores.mean(),2), round(scores.std() * 2,2)))
predict_labels = logistic.predict(test_scaled)
print "\nClassification report - \n", classification_report(test_labels, predict_labels)

Cross Validation F-1: 0.82 (+/- 0.02)

Classification report - 
             precision    recall  f1-score   support

         -1       0.81      0.81      0.81       250
          1       0.81      0.82      0.81       250

avg / total       0.81      0.81      0.81       500



Hackerrank Score obtained for Logistic regression - 83.24

Random forest classifier

In [13]:
rfc = RFC(n_estimators=15, verbose=0, criterion='gini', min_samples_split=15, min_samples_leaf=7, max_features='log2')
scores = cross_validation.cross_val_score(rfc, train_scaled, train_label, cv=10, scoring='f1_weighted')
print("Cross Validation F-1: {} (+/- {})".format(round(scores.mean(),2), round(scores.std() * 2,2)))
rfc.fit(train_scaled, train_label)
predict_labels = rfc.predict(test_scaled)
print "\nClassification report - \n", classification_report(test_labels, predict_labels)

Cross Validation F-1: 0.85 (+/- 0.03)

Classification report - 
             precision    recall  f1-score   support

         -1       0.83      0.80      0.82       250
          1       0.81      0.84      0.82       250

avg / total       0.82      0.82      0.82       500



Hackerrank Score for Random Forest - 86.16

Adaboost Classifier

In [15]:
aboost = AdaBoostClassifier(base_estimator=rfc, n_estimators=5, learning_rate=0.5, algorithm='SAMME.R', random_state=True)
scores = cross_validation.cross_val_score(aboost, train_scaled, train_label, cv=10, scoring='f1_weighted')
print("Cross Validation F-1: {} (+/- {})".format(round(scores.mean(),2), round(scores.std() * 2,2)))
aboost.fit(train, train_label)
predict_labels = aboost.predict(test)
print "\nClassification report - \n", classification_report(test_labels, predict_labels)

Cross Validation F-1: 0.85 (+/- 0.03)

Classification report - 
             precision    recall  f1-score   support

         -1       0.82      0.84      0.83       250
          1       0.84      0.82      0.83       250

avg / total       0.83      0.83      0.83       500



Hackerrank score for Adaboost Classifier - 87.38