In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# List what is available in the input directory before reading from it.
print(os.listdir("../input"))

# Read the training and testing CSV files into DataFrames.
training_data= pd.read_csv("../input/train.csv")
testing_data= pd.read_csv("../input/test.csv")

# Print (rows, columns) of each frame.
print(training_data.shape)
print(testing_data.shape)
# Any results you write to the current directory are saved as output.

In [None]:
# Preview the first 15 rows of the training frame (rendered as the cell output).
print("First few records from the training data set")
training_data.head(15)

In [None]:
# Preview the first 15 rows of the testing frame (rendered as the cell output).
print("First few records from the testing data set")
testing_data.head(15)

Exploratory Data Analysis: counting duplicate and non-duplicate question pairs in the training data

In [None]:
## Exploratory data analysis: balance of the is_duplicate label
# Absolute number of rows for each is_duplicate value.
Dup_rec = training_data['is_duplicate'].value_counts()
# Share of each value: its count divided by the sum of all counts.
Dup_rec_per = Dup_rec.div(Dup_rec.sum())
# Report both the raw counts and the shares.
print ("Number of records for duplicate and not duplicate :\n %s" %Dup_rec)
print ("Percentage of duplicated records in training data :\n %s" %Dup_rec_per)

Identifying number of unique questions in the training data set 

In [None]:
# Pool the question ids of both columns so every appearance of a question is counted.
total_questions = pd.Series(training_data['qid1'].tolist() + training_data['qid2'].tolist())
# Appearances per question id; computed once and reused for the summary and the plot.
question_counts = total_questions.value_counts()
print('Total unique questions in column qid1 and qid2 : {}'.format(len(np.unique(total_questions))))
print("Number of questions with more than one occurence in the training data: {}".format(np.sum(question_counts > 1)))

# Histogram of how often each question occurs.
plt.figure(figsize=(10,5))
# log=True asks hist itself for a logarithmic count axis. This replaces
# plt.yscale('log', nonposy='clip'), whose `nonposy` keyword newer Matplotlib
# releases no longer accept.
plt.hist(question_counts, bins=30, log=True)
plt.title('Question occurence counts')
plt.xlabel('Number of Occurences for Question')
plt.ylabel('total number of questions')
# Trailing print() keeps the repr of the last matplotlib call out of the cell output.
print()

* 111,780 of the 537,933 unique questions appear more than once in the training data.
* As the histogram above shows, the majority of questions appear only once.
* The vast majority of questions appear fewer than 60 times.
* A very small portion of the questions appear more than 100 times.


In [None]:
# Count null cells across every column of each data set.
print("Number of null values in training data set: %d" %training_data.isnull().sum().sum())
print("Number of null values in testing data set: %d" %testing_data.isnull().sum().sum())

# Show the training rows that contain at least one null value.
print(training_data[pd.isnull(training_data).any(axis=1)])

# Show the testing rows that contain at least one null value.
print(testing_data[pd.isnull(testing_data).any(axis=1)])

# Replace null question text with the string 'NA' so the string-based
# preprocessing functions can be applied to every row.
# .loc is used instead of the removed DataFrame.ix indexer.
for frame in (training_data, testing_data):
    for column in ('question1', 'question2'):
        frame.loc[frame[column].isnull(), column] = 'NA'


* training data has 3 null values and testing data has 6 null values.

**Data Pre-Processing**

In [None]:
##Fucntions for different preprocessing steps 
## https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Single Porter stemmer instance; stem_port below calls stemmer.stem on each token.
stemmer=PorterStemmer()
# English stop-word list held as a set; remove_stop_words below tests tokens against it.
stop_words = set(stopwords.words('english'))
# Make the questions lower case
def lower_case(x):
    """Return the string ``x`` converted to lower case."""
    lowered = x.lower()
    return lowered

#removing stop words
def remove_stop_words(x):
    """Drop every whitespace-separated token of ``x`` that is in ``stop_words``.

    The surviving tokens are re-joined with single spaces. Matching is exact,
    so a token only counts as a stop word if it equals an entry of the set.
    """
    return " ".join(token for token in x.split() if token not in stop_words)

##removing punctuations 
def remove_punctuation(x):
    """Return ``x`` with every character listed in ``punc_string`` removed.

    Characters that are not in the list (for example ``+`` or ``=``) are kept.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` without the listed punctuation characters.
    """
    # Raw string: the backslash is meant literally. Written as a normal string,
    # ``\,`` is an invalid escape sequence that newer Python versions warn about.
    punc_string = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # One translate pass deletes all listed characters instead of calling
    # str.replace once per punctuation character found.
    return x.translate(str.maketrans("", "", punc_string))
##Using Stemmer for converting the words to their roots
def stem_port(x):
    """Lower-case ``x``, stem each space-separated token, and re-join with spaces.

    The text is split on single spaces, so consecutive spaces yield empty
    tokens that are passed to the stemmer like any other token.
    """
    stemmed_tokens = []
    for token in x.lower().split(" "):
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)
    
print("Process Finished")

Running a simple test on a sentence to see if the functions are working properly

In [None]:
## Smoke test: run one sentence through the four preprocessing steps, in the
## same order the data cells use, printing the result after each step.
sample_sentence = "Hi, i HOPE you are doing well and feeling amazing!!!!!!!!!"

# Step 1: lower-case.
lower_cs = lower_case(sample_sentence)
print(lower_cs)

# Step 2: remove stop words.
clean_st = remove_stop_words(lower_cs)
print(clean_st)

# Step 3: remove punctuation.
clean_pc = remove_punctuation(clean_st)
print(clean_pc)

# Step 4: stem the remaining words.
stem_words = stem_port(clean_pc)
print(stem_words)

print("Process finished")

Treating null values and unnecessary columns

In [None]:
# Drop id, qid1 and qid2, which are not used as model inputs.
# drop() without inplace=True returns a new frame, so training_data keeps its
# original columns. That keeps this cell re-runnable (a second in-place drop
# would raise KeyError) and makes raw_training_data an independent frame rather
# than a second name for training_data.
raw_training_data = training_data.drop(['id', 'qid1', 'qid2'], axis = 1)

# Target labels, kept separately for inspection.
result_data = raw_training_data['is_duplicate']

# Check that the labels look as expected.
display(result_data.head(10))

Applying preprocessing functions to Training and Testing data set

In [None]:
#### Preprocessing training data
# Each question column goes through the same four steps in this order:
# lower-case -> remove stop words -> remove punctuation -> stem.
# The two columns are independent, so each is processed start to finish in turn.
for question_column in ("question1", "question2"):
    for step in (lower_case, remove_stop_words, remove_punctuation, stem_port):
        raw_training_data[question_column] = raw_training_data[question_column].apply(step)
print("process finished")

In [None]:
#### Preprocessing testing data
# raw_testing_data is a second name for testing_data (no copy is made), so the
# steps below also change testing_data.
raw_testing_data = testing_data
# Same four steps, same order as for the training data:
# lower-case -> remove stop words -> remove punctuation -> stem.
for question_column in ("question1", "question2"):
    for step in (lower_case, remove_stop_words, remove_punctuation, stem_port):
        raw_testing_data[question_column] = raw_testing_data[question_column].apply(step)
print("process finished")

In [None]:
# Keep extra names for the preprocessed frames.
# NOTE: plain assignment does not copy; each *_save name refers to the same
# DataFrame object as the name on the right-hand side.
raw_train_save = raw_training_data
raw_test_save = raw_testing_data

# Show the first 20 training rows to check the preprocessing result by eye.
display(raw_training_data.head(20))

In [None]:
# Show the first 20 preprocessed testing rows.
display(raw_testing_data.head(20))

Feature Engineering (creating new features from the existing data)

In [None]:
##creating new features in the data set
#Finding number of common words in question pairs
def common_words(w):
    """Count the words of question1 that also occur in question2.

    Both questions are lower-cased and split on whitespace. Every occurrence
    of a word in question1 is counted, so a repeated word counts repeatedly.

    Parameters
    ----------
    w : mapping
        Row-like object with string values under "question1" and "question2".

    Returns
    -------
    int
        Number of question1 tokens found among the question2 tokens.
    """
    que1 = w["question1"].lower().split()
    # A set gives constant-time membership checks; the count is the same as
    # when scanning the question2 token list for every question1 token.
    que2 = set(w["question2"].lower().split())
    return sum(1 for word in que1 if word in que2)

#function to find the difference in number of words in questions 
def diff_words(w):
    """Return the absolute difference in word count between the two questions.

    Parameters
    ----------
    w : mapping
        Row-like object with values under "question1" and "question2".

    Returns
    -------
    int
        ``abs(len(question1 words) - len(question2 words))``, or the fallback
        value 10 when either value has no ``split`` method (for example NaN).
    """
    try:
        que1len = len(w["question1"].split())
        que2len = len(w["question2"].split())
    except AttributeError:
        # Only the non-string case is treated as best-effort; other errors
        # (such as a missing column) now surface instead of silently becoming 10.
        return 10
    return np.absolute(que1len - que2len)
#finding ratio of common words between question pairs 
def ratio_common_words(w):
    """Return common_words(w) divided by the total word count of both questions.

    Parameters
    ----------
    w : mapping
        Row-like object with values under "question1" and "question2".

    Returns
    -------
    float or int
        The ratio, or 0 when both questions have no words or when either
        value is not a string.
    """
    try:
        total_words = len(w["question1"].split()) + len(w["question2"].split())
        return 1.0 * common_words(w) / total_words
    except (ZeroDivisionError, AttributeError):
        # ZeroDivisionError: both questions are empty.
        # AttributeError: a value without string methods (for example NaN).
        # Any other exception is left to propagate rather than being hidden.
        return 0
    
print("Process Finished")    

In [None]:
## Distribution of common words (duplicate and not duplicate)
cmmn_words = raw_training_data.apply(common_words, axis=1)
plt.figure(figsize=(15, 5))
# log=True asks hist for a logarithmic count axis. This replaces
# plt.yscale('log', nonposy='clip'), whose `nonposy` keyword newer Matplotlib
# releases no longer accept.
plt.hist(cmmn_words[raw_training_data['is_duplicate'] == 0], bins=20, histtype="stepfilled", alpha = 0.75, label="Not duplicate", log=True)
plt.hist(cmmn_words[raw_training_data['is_duplicate'] == 1], bins=20, histtype="stepfilled", alpha = 0.75, label="Duplicate", log=True)
plt.legend()
plt.title('distribution over common words', fontsize=15)
plt.xlabel('Common words', fontsize=15)

In [None]:
## Distribution of the word-count difference (duplicate and not duplicate)
diff_words_n = raw_training_data.apply(diff_words, axis=1)
plt.figure(figsize=(15, 5))
# Both classes use bins=20; the duplicate class previously fell back to the
# default bin count, so the two histograms were binned differently.
# log=True replaces plt.yscale('log', nonposy='clip'), whose `nonposy`
# keyword newer Matplotlib releases no longer accept.
plt.hist(diff_words_n[raw_training_data['is_duplicate'] == 0], bins=20, histtype="stepfilled", alpha = 0.75, label="Not duplicate", log=True)
plt.hist(diff_words_n[raw_training_data['is_duplicate'] == 1], bins=20, histtype="stepfilled", alpha = 0.75, label="Duplicate", log=True)
plt.legend()
plt.title('distribution over diff of words', fontsize=15)
plt.xlabel('diff of words', fontsize=15)

In [None]:
## Distribution of the common-word ratio (duplicate and not duplicate)
ra_cmmn_words = raw_training_data.apply(ratio_common_words, axis=1)
plt.figure(figsize=(15, 5))
# Both classes use bins=20; the duplicate class previously fell back to the
# default bin count, so the two histograms were binned differently.
# log=True replaces plt.yscale('log', nonposy='clip'), whose `nonposy`
# keyword newer Matplotlib releases no longer accept.
plt.hist(ra_cmmn_words[raw_training_data['is_duplicate'] == 0], bins=20, histtype="stepfilled", alpha = 0.75, label="Not duplicate", log=True)
plt.hist(ra_cmmn_words[raw_training_data['is_duplicate'] == 1], bins=20, histtype="stepfilled", alpha = 0.75, label="Duplicate", log=True)
plt.legend()
plt.title('distribution over ratio of common words', fontsize=15)
plt.xlabel('ratio of Common words', fontsize=15)

Implementing tf-idf / cosine similarity 

In [None]:
# TF-IDF vectors and cosine similarity for the training question pairs.

# All questions (question1 followed by question2) as one string Series per data set.
# NOTE(review): neither Series is referenced again in the cells visible here.
Training_co_set = pd.Series(raw_training_data['question1'].tolist() + raw_training_data['question2'].tolist()).astype(str)
Testing_co_set = pd.Series(raw_testing_data['question1'].tolist() + raw_testing_data['question2'].tolist()).astype(str)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df = 2)

# Fit the vocabulary on the distinct training questions from both columns.
# .loc is used instead of the removed DataFrame.ix indexer.
tfidf_new= tfidf_vectorizer.fit(pd.concat((raw_training_data.loc[:,'question1'],raw_training_data.loc[:,'question2'])).unique())

# One sparse TF-IDF row per question.
train_Que1_tfidf = tfidf_vectorizer.transform(raw_training_data.loc[:,'question1'])
train_Que2_tfidf = tfidf_vectorizer.transform(raw_training_data.loc[:,'question2'])

print("Question 1 Shape: ", train_Que1_tfidf.shape)
print("Question 2 Shape: ", train_Que2_tfidf.shape)

# Import kept so later cells that call cosine_similarity still find it.
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of each (question1, question2) pair, for all rows at once.
# TfidfVectorizer is created above with its default norm ('l2'), so every
# non-empty row has unit length and the cosine is the row-wise dot product;
# all-zero rows give 0. If the vectorizer's norm is ever changed, normalise the
# rows first. This replaces one cosine_similarity call per row over a
# hard-coded row count.
cosine_train_new = np.asarray(train_Que1_tfidf.multiply(train_Que2_tfidf).sum(axis=1)).ravel().tolist()


In [None]:
# Store the per-pair cosine similarity as a column of the training frame.
raw_training_data['cosine_trfs'] = cosine_train_new

In [None]:
# Label distribution over cosine similarity.
plt.figure(figsize=(15, 5))
# density=True normalises each histogram to unit area; it replaces the
# `normed` keyword, which newer Matplotlib releases no longer accept.
plt.hist(raw_training_data[raw_training_data['is_duplicate']== 0]['cosine_trfs'], bins=50, range=[0, 1], density=True, alpha=0.5, label='not duplicate')
plt.hist(raw_training_data[raw_training_data['is_duplicate']== 1]['cosine_trfs'], bins=50, range=[0, 1], density=True, alpha=0.5, label='duplicate')
plt.title('Distribution', fontsize=20)
plt.legend()
plt.xlabel('Cosine Similarity', fontsize=20)
plt.ylabel('Distribution', fontsize=20)

In [None]:
## Applying cosine similarity to the testing data
# Transform with the vectorizer fitted on the training questions.
# .loc is used instead of the removed DataFrame.ix indexer.
test_Que1_tfidf = tfidf_vectorizer.transform(raw_testing_data.loc[:,'question1'])
test_Que2_tfidf = tfidf_vectorizer.transform(raw_testing_data.loc[:,'question2'])
print("Question 1 Shape: ", test_Que1_tfidf.shape)
print("Question 2 Shape: ", test_Que2_tfidf.shape)

# Cosine similarity of each pair, for all rows at once. This relies on the
# vectorizer producing L2-normalised rows (the TfidfVectorizer default), in
# which case the cosine is the row-wise dot product and all-zero rows give 0.
# This replaces one cosine_similarity call per row over a hard-coded row count.
cosine_test_new = np.asarray(test_Que1_tfidf.multiply(test_Que2_tfidf).sum(axis=1)).ravel().tolist()

Feature selection for data

In [None]:
# Build the model feature frames for the training and testing data.
def _count_space_tokens(text):
    """Number of pieces obtained by splitting str(text) on single spaces."""
    return len(str(text).split(" "))


def _build_feature_frame(questions, cosine_scores):
    """Assemble the feature columns for one data set.

    questions     -- frame with preprocessed 'question1' / 'question2' columns
    cosine_scores -- per-row cosine similarity values for the same rows
    """
    features = pd.DataFrame(dtype='float64')
    # Character length of each question.
    features['que1len'] = questions['question1'].str.len()
    features['que2len'] = questions['question2'].str.len()
    # Space-separated token count of each question.
    features['que1word'] = questions['question1'].apply(_count_space_tokens)
    features['que2word'] = questions['question2'].apply(_count_space_tokens)
    # Pairwise features from the helper functions defined earlier.
    features['diff_words'] = questions.apply(diff_words, axis=1)
    features['ratio_common_words'] = questions.apply(ratio_common_words, axis=1)
    features['cosine_trfs'] = cosine_scores
    return features


Feature_train_Data = _build_feature_frame(raw_training_data, cosine_train_new)
Feature_test_Data = _build_feature_frame(raw_testing_data, cosine_test_new)

# Training labels as a NumPy array.
Feature_ytrain_Data = raw_training_data['is_duplicate'].values

In [None]:
Feature_train_Data.head()

In [None]:
Feature_test_Data.head()

**Scaling the data** 

In [None]:
# Min-max scale every training feature.
from sklearn import preprocessing

# Columns to scale, in the order they appear in the scaled frame.
feature_list = ["que1len","que2len","que1word","que2word","diff_words","ratio_common_words","cosine_trfs"]

min_max_scaler = preprocessing.MinMaxScaler()
# Fit the scaler on the training features, then transform those same features.
training_feature_values = Feature_train_Data[feature_list]
scaled = min_max_scaler.fit(training_feature_values).transform(training_feature_values)

# Wrap the scaled array in a frame that carries the feature names.
scaled_features = pd.DataFrame(scaled, columns=feature_list)
display(scaled_features.head())

Do not use below section

In [None]:
##Rebalancing the data : Section taken from https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb 
##According to the analysis in above blog, The training data has 37% of the positive class while testing data has only 17%
#of positive class. rebalacing the data will make sure the training data also will have 17% of positive class in the data. 
#Adding rebalancing data section helped improve the log loss score on XGboost significantly 

#pos_train = Feature_train_Data[Feature_ytrain_Data == 1]
#neg_train = Feature_train_Data[Feature_ytrain_Data == 0]

# Now we oversample the negative class
# There is likely a much more elegant way to do this...
#p = 0.165
#scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
#while scale > 1:
 #   neg_train = pd.concat([neg_train, neg_train])
 #   scale -=1
#neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
#print(len(pos_train) / (len(pos_train) + len(neg_train)))

#x_train = pd.concat([pos_train, neg_train])
#y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
#del pos_train, neg_train

In [None]:
# Imports for the benchmark models, and the train/validation split.
# train_test_split is imported from sklearn.model_selection (the module
# GridSearchCV already comes from); the old sklearn.cross_validation module
# no longer exists in current scikit-learn.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

# Hold out 20% of the scaled training features for validation; fixed seed for repeatability.
x_train, x_valid, y_train, y_valid = train_test_split(scaled_features, Feature_ytrain_Data, test_size=0.2, random_state=0)


**Random Forest Implementation**

In [None]:
## Random forest fitted through GridSearchCV.
# The grid currently holds a single candidate: 300 trees with a fixed seed.
param = {'n_estimators':[300], 'random_state':[0]}
random_clf = RandomForestClassifier()

# fit() returns the fitted search object, so construction and fitting can be chained.
rn_grid_clf = GridSearchCV(random_clf, param).fit(x_train, y_train)

print(rn_grid_clf.best_estimator_)

# Keep the second probability column (the one for label 1) for each validation row.
random_pred = rn_grid_clf.predict_proba(x_valid)[:, 1]

# Log loss on the validation split (shown as the cell output).
print("The log loss error is : ")
log_loss(y_valid, random_pred)

In [None]:
# Random forest predictions for the test data set.
# The model was fitted on min-max scaled features, so the test features are put
# through the same fitted scaler first; previously the raw, unscaled test
# features were passed to the model.
rand_test_scaled = pd.DataFrame(min_max_scaler.transform(Feature_test_Data[feature_list]), columns=feature_list)
# Keep the second probability column (the one for label 1).
rand_x_test = rn_grid_clf.predict_proba(rand_test_scaled)[:, 1]

# Write test_id / is_duplicate pairs to a CSV file.
RFC_data_csv = pd.DataFrame()
RFC_data_csv['test_id'] = testing_data['test_id']
RFC_data_csv['is_duplicate'] = rand_x_test
RFC_data_csv.to_csv('RFC_export.csv', index = False)
print("Export Successful")

**Logistic Regression Implementation**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted through GridSearchCV with a single candidate.
# Author's note: the sag solver was chosen as good for large data sets and
# faster than liblinear.
param_reg = {'solver':['sag'], 'C':[.0011], 'random_state':[0]}
log_clf = LogisticRegression()

# fit() returns the fitted search object, so construction and fitting can be chained.
lr_grid_clf = GridSearchCV(log_clf, param_reg).fit(x_train, y_train)
print("The best estimator is : ", lr_grid_clf.best_estimator_)

In [None]:
# Keep the second probability column (the one for label 1) for each validation row.
log_random_pred = lr_grid_clf.predict_proba(x_valid)[:, 1]

# Log loss on the validation split (shown as the cell output).
print("The log loss error is : ")
log_loss(y_valid, log_random_pred)

In [None]:
# Logistic regression predictions for the test data set.
# The model was fitted on min-max scaled features, so the test features are put
# through the same fitted scaler first; previously the raw, unscaled test
# features were passed to the model.
log_test_scaled = pd.DataFrame(min_max_scaler.transform(Feature_test_Data[feature_list]), columns=feature_list)
# Keep the second probability column (the one for label 1).
log_x_test = lr_grid_clf.predict_proba(log_test_scaled)[:, 1]

# Write test_id / is_duplicate pairs to a CSV file.
log_data_csv = pd.DataFrame()
log_data_csv['test_id'] = testing_data['test_id']
log_data_csv['is_duplicate'] = log_x_test
log_data_csv.to_csv('log_export.csv', index = False)
print("Export Successful")

**Implementing XGBoost Classifier**

In [None]:
import xgboost as xgb

# Booster parameters. Other settings that were tried and left disabled:
# gamma=2, subsample=0.5, colsample_bytree=0.8, scale_pos_weight=10.
params = {
    'max_depth': 8,
    'eval_metric': 'logloss',
    'eta': 0.4,
    'objective': 'binary:logistic',
}
print(params)

# Wrap the train / validation splits in DMatrix objects with their labels.
xgb_train = xgb.DMatrix(x_train, label=y_train)
xgb_valid = xgb.DMatrix(x_valid, label=y_valid)

# Both sets are evaluated during training.
watchlist = [(xgb_train, 'training'), (xgb_valid, 'validation')]

# Up to 500 boosting rounds, early stopping after 50 rounds, progress printed every 20 rounds.
bst = xgb.train(params, xgb_train, 500, evals=watchlist, early_stopping_rounds=50, verbose_eval=20)

In [None]:
# XGBoost predictions for the test data set.
# The booster was trained on min-max scaled features, so the test features are
# put through the same fitted scaler first; previously the raw, unscaled test
# features were passed to the model. The frame keeps the feature names used
# during training.
xgb_test_scaled = pd.DataFrame(min_max_scaler.transform(Feature_test_Data[feature_list]), columns=feature_list)
xgb_test = xgb.DMatrix(xgb_test_scaled)
predicted_test_op = bst.predict(xgb_test)

# Write test_id / is_duplicate pairs to a CSV file.
xgb_data_csv = pd.DataFrame()
xgb_data_csv['test_id'] = testing_data['test_id']
xgb_data_csv['is_duplicate'] = predicted_test_op
xgb_data_csv.to_csv('xgb_export.csv', index = False)
print("Export Successful")

xgb_data_csv.head()