# Importing Libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Importing Dataset

In [2]:
dataset = pd.read_csv('ANN.csv',  encoding='ISO-8859-1')

In [3]:
dataset.head(5)

Unnamed: 0,id,question1,question2,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,...,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,is_duplicate
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,66,57,9,20,20,14,12,...,1.0,94.023324,0.371408,0.168999,0.186557,0.031817,-0.091902,0.050416,0.337301,0
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,51,88,-37,21,29,8,13,...,1.0,177.58809,1.012091,0.45591,0.592655,0.008735,0.094704,0.28401,-0.034444,0
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,73,59,14,25,24,14,10,...,1.0,135.988707,0.666346,0.307828,0.342306,0.239752,0.144554,0.026759,-0.474131,0
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,50,65,-15,19,26,11,9,...,1.0,192.237828,1.140536,0.506028,0.692421,-0.002527,0.069649,-0.24456,-0.265568,0
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,76,39,37,25,18,13,7,...,1.0,161.408435,0.860225,0.38277,0.480633,-0.133849,0.114777,0.2179,-0.338876,0


# Dropping columns

In [4]:
dataset = dataset.drop(['question1', 'question2', 'id'], axis=1)
#Replacing infinite values with nan and droping nan rows
dataset = dataset.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [5]:
dataset.head(2)

Unnamed: 0,len_q1,len_q2,diff_len,len_char_q1,len_char_q2,len_word_q1,len_word_q2,common_words,fuzz_qratio,fuzz_WRatio,...,jaccard_distance,canberra_distance,euclidean_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,is_duplicate
0,66,57,9,20,20,14,12,10,93,95,...,1.0,94.023324,0.371408,0.168999,0.186557,0.031817,-0.091902,0.050416,0.337301,0
1,51,88,-37,21,29,8,13,4,66,86,...,1.0,177.58809,1.012091,0.45591,0.592655,0.008735,0.094704,0.28401,-0.034444,0


In [6]:
#Checking infinite values
np.any(np.isinf(dataset))

False

In [7]:
#Counting duplicate and non-duplicate pairs
dataset["is_duplicate"].value_counts()

0    246559
1    146768
Name: is_duplicate, dtype: int64

# Spliting into dependent & Independent variables


In [9]:
X = dataset.loc[:, dataset.columns != 'is_duplicate']
y = dataset.loc[:, dataset.columns == 'is_duplicate']

# Splitting in train and test data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# XGboost

In [11]:
model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) 
prediction = model.predict(X_test)

# Confusion Matrix

In [14]:
cm = confusion_matrix(y_test, prediction)  
print(cm)  
print('Accuracy', accuracy_score(y_test, prediction))


[[58617 15476]
 [11698 32208]]
Accuracy 0.7697099127958712


# Classification Report

In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.83      0.79      0.81     74093
           1       0.68      0.73      0.70     43906

   micro avg       0.77      0.77      0.77    117999
   macro avg       0.75      0.76      0.76    117999
weighted avg       0.77      0.77      0.77    117999

