In [2]:
import pandas as pd
import numpy as np
from string import punctuation
import math

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

### Dataset contains around 2345796 data values

In [3]:
df = pd.read_csv('test.csv')
df.drop(['test_id'],1,inplace=True)
df = df.dropna()

In [4]:
stop_words = list(set(stopwords.words('english')))
sno = SnowballStemmer('english')

In [5]:
question1 = [word_tokenize(i.lower().decode('utf8')) for i in df['question1'][0:15000]]
question2 = [word_tokenize(i.lower().decode('utf8')) for i in df['question2'][0:15000]]

In [6]:
q1 = [' '.join(w) for w in question1 if w not in stop_words]
q2 = [' '.join(w) for w in question2 if w not in stop_words]

### Get rid off punctuation marks

In [7]:
quora1 = []
for i in q1:
    for c in punctuation:
        i = i.replace(c,"")
    quora1.append(i)

In [8]:
quora2 = []
for i in q2:
    for c in punctuation:
        i = i.replace(c,"")
    quora2.append(i)

### Count Vectorization

In [9]:
vectorizer = CountVectorizer()

In [10]:
X = vectorizer.fit_transform(quora1)
X = X.toarray()

In [11]:
Y = vectorizer.fit_transform(quora2)
Y = Y.toarray()

In [12]:
x = [sum(i) for i in X]
y = [sum(i) for i in Y]

In [13]:
Z = zip(x,y)

### Manual validation (need to take care)

In [14]:
label = []
for i,j in Z:
    if math.fabs(i - j) <= 5.0:
        label.append(1)
    else:
        label.append(0)

### Convert to numeric data

In [15]:
def handle_non_numerical_data(df):
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = list(set(column_contents))
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            df[column] = list(map(convert_to_int,df[column]))

    return df

### Split data

In [34]:
df = df[0:15000]

In [26]:
df = handle_non_numerical_data(df)

In [27]:
ft = np.array(df)
lt = np.array(label)

In [28]:
features_train,features_test,labels_train,labels_test = cross_validation.train_test_split(ft,lt,test_size=0.2)

## Slight increase in accuracy

### Gaussian Naive Bayes

In [29]:
clf = GaussianNB()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.759


### AdaBoostClassifier

In [30]:
clf = AdaBoostClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.758666666667


### RandomForestClassifier

In [31]:
clf = RandomForestClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.680333333333


### KNeighborsClassifier

In [32]:
clf = KNeighborsClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.711666666667
