## Simple Random Sample technique

In [2]:
import pandas as pd
import numpy as np
from string import punctuation
import math

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

### Dataset contains around 2345796 data values

In [3]:
df = pd.read_csv('test.csv')
df.drop(['test_id'],1,inplace=True)
df = df.dropna()

### Shuffle the data (random)

In [4]:
df = shuffle(df)

In [5]:
stop_words = list(set(stopwords.words('english')))
sno = SnowballStemmer('english')

In [6]:
question1 = [word_tokenize(i.lower().decode('utf8')) for i in df['question1'][0:10000]]
question2 = [word_tokenize(i.lower().decode('utf8')) for i in df['question2'][0:10000]]

In [7]:
q1 = [' '.join(w) for w in question1 if w not in stop_words]
q2 = [' '.join(w) for w in question2 if w not in stop_words]

### Get rid off punctuation marks

In [8]:
quora1 = []
for i in q1:
    for c in punctuation:
        i = i.replace(c,"")
    quora1.append(i)

In [9]:
quora2 = []
for i in q2:
    for c in punctuation:
        i = i.replace(c,"")
    quora2.append(i)

### Count Vectorization

In [10]:
vectorizer = CountVectorizer()

In [11]:
X = vectorizer.fit_transform(quora1)
X = X.toarray()

In [12]:
Y = vectorizer.fit_transform(quora2)
Y = Y.toarray()

In [13]:
x = [sum(i) for i in X]
y = [sum(i) for i in Y]

In [14]:
Z = zip(x,y)

### Manual Validation (need to take care)

In [15]:
label = []
for i,j in Z:
    if math.fabs(i - j) <= 5.0:
        label.append(1)
    else:
        label.append(0)

### Convert to numeric data

In [16]:
def handle_non_numerical_data(df):
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = list(set(column_contents))
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            df[column] = list(map(convert_to_int,df[column]))

    return df

### Split data

In [17]:
df = df[0:10000]

In [18]:
df = handle_non_numerical_data(df)

In [19]:
ft = np.array(df)
lt = np.array(label)

In [20]:
features_train,features_test,labels_train,labels_test = cross_validation.train_test_split(ft,lt,test_size=0.2)

### Gaussian Naive Bayes

In [21]:
clf = GaussianNB()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.765


### AdaBoostClassifier

In [22]:
clf = AdaBoostClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.763


### RandomForestClassifier

In [23]:
clf = RandomForestClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.68


### KNeighborsClassifier

In [24]:
clf = KNeighborsClassifier()
clf.fit(features_train,labels_train)
pred = clf.predict(features_test)
print(pred)
accuracy = accuracy_score(labels_test,pred)
print(accuracy)

[1 1 1 ..., 1 1 1]
0.719
