In [13]:
import pandas as pd
import numpy as np

import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    warnings.warn("deprecated", DeprecationWarning)

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Dataset contains around 2345796 data values

In [16]:
df = pd.read_csv('test.csv')
df.drop(['test_id'],1,inplace=True)
df = df.dropna()

stop_words = list(set(stopwords.words("english")))

question1 = [word_tokenize(i.lower().decode('utf8')) for i in df['question1'][0:10000]]
question2 = [word_tokenize(i.lower().decode('utf8')) for i in df['question2'][0:10000]]

q1 = [w for w in question1 if w not in stop_words]
q2 = [w for w in question2 if w not in stop_words]
 
q1 = [' '.join(w) for w in q1]
q2 = [' '.join(w) for w in q2]

df['question1'][0:10000] = q1
df['question2'][0:10000] = q2

quora = zip(q1,q2)

### Manual validation (need to take care)

In [5]:
try:
    label = []
    for i in quora:
        values = [j.split(" ") for j in i]
        compare = values
        same = [i for i in compare[0] if i in compare[1]]
        if len(same) > 5:
            label.append(1)
        else:
            label.append(0)
except AttributeError, UnicodeDecodeError:
    print('Error occured')

### Convert to numeric data

In [6]:
def handle_non_numerical_data(df):
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = list(set(column_contents))
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            df[column] = list(map(convert_to_int,df[column]))

    return df

### Split data

In [7]:
df = df[0:10000]
df = handle_non_numerical_data(df)

ft = np.array(df)
lt = np.array(label)

features_train,features_test,labels_train,labels_test = cross_validation.train_test_split(ft,lt,test_size=0.2)

### Gaussian Naive Bayes

In [8]:
clf = GaussianNB()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[0 0 0 ..., 0 0 0]
0.633


### AdaBoostClassifier

In [9]:
clf = AdaBoostClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[0 0 0 ..., 0 0 0]
0.633


### RandomForestClassifier

In [10]:
clf = RandomForestClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[0 0 0 ..., 0 0 0]
0.5865


### KNeighborsClassifier

In [12]:
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[0 0 0 ..., 0 0 0]
0.5645
