In [18]:
import numpy as np
import pandas as pd
import math

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

### Dataset contains around 2345796 data values

In [2]:
df = pd.read_csv('test.csv')
df.drop(['test_id'],1,inplace=True)
df = df.dropna()

In [3]:
stop_words = list(set(stopwords.words('english')))
sno = SnowballStemmer('english')

In [4]:
question1 = [word_tokenize(i.lower().decode('utf8')) for i in df['question1'][0:10000]]
question2 = [word_tokenize(i.lower().decode('utf8')) for i in df['question2'][0:10000]]

In [5]:
q1 = [' '.join(w) for w in question1 if w not in stop_words]
q2 = [' '.join(w) for w in question2 if w not in stop_words]

In [6]:
vectorizer = CountVectorizer()

In [7]:
X = vectorizer.fit_transform(q1)
X = X.toarray()

In [8]:
Y = vectorizer.fit_transform(q2)
Y = Y.toarray()

In [9]:
x = [sum(i) for i in X]
y = [sum(i) for i in Y]

In [10]:
Z = zip(x,y)

### Manual validation (need to take care)

In [11]:
label = []
for i,j in Z:
    if math.fabs(i - j) <= 5.0:
        label.append(1)
    else:
        label.append(0)

### Convert to numeric data

In [12]:
def handle_non_numerical_data(df):
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = list(set(column_contents))
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            df[column] = list(map(convert_to_int,df[column]))

    return df

### Split data

In [13]:
df = df[0:10000]
df = handle_non_numerical_data(df)

ft = np.array(df)
lt = np.array(label)

features_train,features_test,labels_train,labels_test = cross_validation.train_test_split(ft,lt,test_size=0.2)

### Gaussian Naive Bayes

In [19]:
clf = GaussianNB()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.757


### AdaBoostClassifier

In [20]:
clf = AdaBoostClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.753


### RandomForestClassifier

In [16]:
clf = RandomForestClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[0 1 1 ..., 1 1 1]
0.6735


### KNeighborsClassifier

In [17]:
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.7105
