In [1]:
import pandas as pd
import numpy as np
import pickle
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
# English stop-word list from NLTK, kept at module level; sent2vec reads this name when filtering tokens.
stop_words = stopwords.words('english')



#### WMD Distance

In [2]:
def wmd(s1, s2):
    """Return the Word Mover's Distance between two questions.

    Both inputs are coerced with ``str``, lower-cased, split on whitespace
    and stripped of English stop words; the remaining token lists are passed
    to ``model.wmdistance``. ``model`` is a module-level name that must be
    bound to a loaded embedding model before this function is called.

    The stop-word collection is loaded from NLTK only on the first call and
    cached on the function object, because this function is applied once per
    row and reloading the list for every row is pure overhead.
    """
    stops = getattr(wmd, '_stop_word_cache', None)
    if stops is None:
        # frozenset gives O(1) membership tests instead of scanning a list.
        stops = frozenset(stopwords.words('english'))
        wmd._stop_word_cache = stops
    s1 = [w for w in str(s1).lower().split() if w not in stops]
    s2 = [w for w in str(s2).lower().split() if w not in stops]
    return model.wmdistance(s1, s2)

#### Sentence embedding

In [3]:
def sent2vec(s):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The text is coerced with ``str``, lower-cased and tokenised with
    ``word_tokenize``; stop words (module-level ``stop_words``) and
    non-alphabetic tokens are dropped. Each remaining token is looked up in
    the module-level ``model``; tokens missing from the vocabulary are
    skipped.

    Returns the summed vector divided by its Euclidean norm. When no token
    yields a vector the result is a scalar NaN, and when the summed vector is
    all zeros the result is a NaN vector; the distance cells rely on
    ``np.nan_to_num`` to turn these into zeros afterwards.
    """
    vectors = []
    for token in word_tokenize(str(s).lower()):
        if token in stop_words or not token.isalpha():
            continue
        try:
            vectors.append(model[token])
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is ignored so
            # that genuine failures are no longer silently swallowed.
            continue
    if not vectors:
        # Same value the 0/0 division produced before, without the warning.
        return np.float64(np.nan)
    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

### Import embedding model

#### Glove

In [4]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
# NOTE(review): absolute, machine-specific path; this cell only runs where exactly this file exists.
# The ".word2vec" suffix suggests the GloVe text file was converted to word2vec text format beforehand (TODO confirm).
filename = 'C:/Users/Amine/Desktop/MVA2017/Object_recognition/Project/project/Project/KCCA/glove.42B.300d.txt.word2vec'
# binary=False: the file is parsed as text-format word2vec vectors.
word_embedding_model_glove = KeyedVectors.load_word2vec_format(filename, binary=False)

#### W2V

In [5]:
# Load the GoogleNews word2vec vectors (binary format) into the module-level `model` read by wmd and sent2vec.
# NOTE(review): absolute, machine-specific path. The "Embedding of the questions" cell rebinds `model`
# to the GloVe vectors before any feature is computed, so this load is not used by the cells in this notebook.
model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/Amine/Desktop/MVA2017/ALTEGRAD/TP3/for moodle/code/GoogleNews-vectors-negative300.bin.gz', binary=True)

### Generate features for train data

We can generate some useful features:

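    - String-similarity ratios between question 1 and question 2, computed with the `fuzz` module of fuzzywuzzy.
    - The lengths of the questions (characters, distinct characters and words).
    - Distances between the word-embedding representations of the two questions: WMD, cosine, cityblock, Jaccard, Canberra, Euclidean, Minkowski and Bray-Curtis.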

In [6]:
# Read the training pairs and keep only the two question texts and the label.
data_train = pd.read_csv(
    'data/train.csv',
    sep=',',
    names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
).drop(['id', 'qid1', 'qid2'], axis=1)

# Character length (spaces included) of each question and the signed difference.
for _target, _source in (('len_q1', 'question1'), ('len_q2', 'question2')):
    data_train[_target] = data_train[_source].apply(lambda x: len(str(x)))
data_train['diff_len'] = data_train['len_q1'] - data_train['len_q2']

# Number of distinct non-space characters in each question.
for _target, _source in (('len_char_q1', 'question1'), ('len_char_q2', 'question2')):
    data_train[_target] = data_train[_source].apply(lambda x: len(set(str(x).replace(' ', ''))))

# Number of whitespace-separated tokens in each question.
for _target, _source in (('len_word_q1', 'question1'), ('len_word_q2', 'question2')):
    data_train[_target] = data_train[_source].apply(lambda x: len(str(x).split()))

# Number of distinct lower-cased tokens shared by the two questions.
data_train['common_words'] = data_train.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())
    ),
    axis=1,
)

# fuzzywuzzy similarity scores, one column per scorer, in the original column order.
for _target, _scorer in (
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
):
    data_train[_target] = data_train.apply(
        lambda row, _scorer=_scorer: _scorer(str(row['question1']), str(row['question2'])),
        axis=1,
    )

#### Embedding of the questions

In [8]:
# Rebind the module-level `model` (read by wmd and sent2vec) to the GloVe vectors.
model=word_embedding_model_glove
# NOTE(review): norm_model is a second name for the same object as `model`, not a copy, so
# init_sims(replace=True) below also affects what wmd/sent2vec see. Presumably it L2-normalises
# the vectors in place (gensim) - TODO confirm against the installed gensim version.
norm_model = word_embedding_model_glove
norm_model.init_sims(replace=True)

# Word Mover's Distance for every training pair, computed row by row.
data_train['wmd'] = data_train.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# One row per training pair; 300 columns - assumes the embedding is 300-dimensional (TODO confirm).
question1_vectors = np.zeros((data_train.shape[0], 300))
# NOTE(review): error_count is assigned here but never read or updated in this cell.
error_count = 0

# Fill row i with the sentence vector of the i-th question1.
for i, q in tqdm(enumerate(data_train.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

# Same for question2.
question2_vectors  = np.zeros((data_train.shape[0], 300))
for i, q in tqdm(enumerate(data_train.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

#### Compute distances

In [9]:
# NaN entries (questions for which sent2vec produced no usable vector) become zeros.
# The cleaned copies are computed once and shared by every feature below.
_q1_clean = np.nan_to_num(question1_vectors)
_q2_clean = np.nan_to_num(question2_vectors)

# Pairwise distances between the two sentence vectors of each row.
for _column, _metric in (
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),
    ('braycurtis_distance', braycurtis),
):
    data_train[_column] = [_metric(u, v) for u, v in zip(_q1_clean, _q2_clean)]

# Shape statistics of each individual sentence vector.
for _column, _stat, _vectors in (
    ('skew_q1vec', skew, _q1_clean),
    ('skew_q2vec', skew, _q2_clean),
    ('kur_q1vec', kurtosis, _q1_clean),
    ('kur_q2vec', kurtosis, _q2_clean),
):
    data_train[_column] = [_stat(vec) for vec in _vectors]

### Save features

In [10]:
# Persist the sentence vectors with the highest pickle protocol (same value as the -1 used before).
# The files are opened in `with` blocks so the handles are flushed and closed deterministically.
with open('data/q1_glove.pkl', 'wb') as q1_file:
    pickle.dump(question1_vectors, q1_file, pickle.HIGHEST_PROTOCOL)
with open('data/q2_glove.pkl', 'wb') as q2_file:
    pickle.dump(question2_vectors, q2_file, pickle.HIGHEST_PROTOCOL)

# Write every training feature column (and the label) without the index.
data_train.to_csv('data/train_features_glove.csv', index=False)

### Generate features for test data

In [11]:
# Read the test pairs with explicit column names. Unlike the training frame, the id columns are kept.
# NOTE(review): `names=` is given without `header=`, so the first line of the file is read as data - confirm the CSV has no header row.
data_test = pd.read_csv('data/test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

In [12]:
# Character length (spaces included) of each question and the signed difference.
for _target, _source in (('len_q1', 'question1'), ('len_q2', 'question2')):
    data_test[_target] = data_test[_source].apply(lambda x: len(str(x)))
data_test['diff_len'] = data_test['len_q1'] - data_test['len_q2']

# Number of distinct non-space characters in each question.
for _target, _source in (('len_char_q1', 'question1'), ('len_char_q2', 'question2')):
    data_test[_target] = data_test[_source].apply(lambda x: len(set(str(x).replace(' ', ''))))

# Number of whitespace-separated tokens in each question.
for _target, _source in (('len_word_q1', 'question1'), ('len_word_q2', 'question2')):
    data_test[_target] = data_test[_source].apply(lambda x: len(str(x).split()))

# Number of distinct lower-cased tokens shared by the two questions.
data_test['common_words'] = data_test.apply(
    lambda row: len(
        set(str(row['question1']).lower().split())
        & set(str(row['question2']).lower().split())
    ),
    axis=1,
)

# fuzzywuzzy similarity scores, one column per scorer, in the original column order.
for _target, _scorer in (
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
):
    data_test[_target] = data_test.apply(
        lambda row, _scorer=_scorer: _scorer(str(row['question1']), str(row['question2'])),
        axis=1,
    )

In [13]:
# NOTE(review): norm_model is another name for the object bound to `model`, so init_sims(replace=True)
# acts on `model` itself. If the training embedding cell already ran, this is the second such call on
# the same object - TODO confirm it is harmless for the installed gensim version.
norm_model = model
norm_model.init_sims(replace=True)
# Word Mover's Distance for every test pair, computed row by row.
data_test['wmd'] = data_test.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# Rebinds question1_vectors / question2_vectors: the training vectors held under these names are replaced.
# 300 columns - assumes the embedding is 300-dimensional (TODO confirm).
question1_vectors = np.zeros((data_test.shape[0], 300))
# NOTE(review): error_count is assigned here but never read or updated in this cell.
error_count = 0

# Fill row i with the sentence vector of the i-th question1.
for i, q in tqdm(enumerate(data_test.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

# Same for question2.
question2_vectors  = np.zeros((data_test.shape[0], 300))
for i, q in tqdm(enumerate(data_test.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

In [14]:
# NaN entries (questions for which sent2vec produced no usable vector) become zeros.
# The cleaned copies are computed once and shared by every feature below.
_q1_clean = np.nan_to_num(question1_vectors)
_q2_clean = np.nan_to_num(question2_vectors)

# Pairwise distances between the two sentence vectors of each row.
for _column, _metric in (
    ('cosine_distance', cosine),
    ('cityblock_distance', cityblock),
    ('jaccard_distance', jaccard),
    ('canberra_distance', canberra),
    ('euclidean_distance', euclidean),
    ('minkowski_distance', lambda u, v: minkowski(u, v, 3)),
    ('braycurtis_distance', braycurtis),
):
    data_test[_column] = [_metric(u, v) for u, v in zip(_q1_clean, _q2_clean)]

# Shape statistics of each individual sentence vector.
for _column, _stat, _vectors in (
    ('skew_q1vec', skew, _q1_clean),
    ('skew_q2vec', skew, _q2_clean),
    ('kur_q1vec', kurtosis, _q1_clean),
    ('kur_q2vec', kurtosis, _q2_clean),
):
    data_test[_column] = [_stat(vec) for vec in _vectors]

In [21]:
# Persist the test sentence vectors with the highest pickle protocol (same value as the -1 used before).
# The files are opened in `with` blocks so the handles are flushed and closed deterministically.
with open('data/q1_glove_test.pkl', 'wb') as q1_file:
    pickle.dump(question1_vectors, q1_file, pickle.HIGHEST_PROTOCOL)
with open('data/q2_glove_test.pkl', 'wb') as q2_file:
    pickle.dump(question2_vectors, q2_file, pickle.HIGHEST_PROTOCOL)
# Write every test column (ids, questions and features) without the index.
data_test.to_csv('data/test_features_glove.csv', index=False)

### Load features

In [None]:
# Reload the feature tables written by the feature-generation cells.
features_train = pd.read_csv('data/train_features_glove.csv', sep=',', encoding='latin-1')
features_test = pd.read_csv('data/test_features_glove.csv', sep=',', encoding='latin-1')
# NOTE(review): this reads 'train.csv' from the working directory, while the feature-generation
# cell read 'data/train.csv' - confirm both refer to the same file.
data_train = pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"])
# Target labels as a NumPy array.
Y_train=data_train["is_duplicate"].values

### Page Rank

In [22]:
#https://www.kaggle.com/zfturbo/pagerank-on-quora-feature-file-generator/code
import pandas as pd
import hashlib
import gc 

# Read both splits for the question graph. Missing questions are replaced by empty strings:
# the hashing functions in this cell call `.encode` on every question, which fails on the
# float NaN that pandas uses for a missing value.
_missing_question_fill = {"question1": "", "question2": ""}
df_train = pd.read_csv('data/train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"]).fillna(_missing_question_fill)
df_test = pd.read_csv('data/test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"]).fillna(_missing_question_fill)

# Generating a graph of Questions and their neighbors
def generate_qid_graph_table(row):
    """Add the question pair of ``row`` to the module-level ``qid_graph``.

    Each question is identified by the MD5 hex digest of its UTF-8 bytes.
    The pair is recorded in both directions, so every node's list holds one
    entry per row in which it was paired with that neighbour. Returns None.
    """
    first_key, second_key = [
        hashlib.md5(row[column].encode('utf-8')).hexdigest()
        for column in ("question1", "question2")
    ]

    for source, target in ((first_key, second_key), (second_key, first_key)):
        qid_graph.setdefault(source, []).append(target)


# Adjacency lists keyed by question hash; filled as a side effect of generate_qid_graph_table.
qid_graph = {}
print('Apply to train...')
# apply() is used only for its side effect on qid_graph; its return value is discarded.
df_train.apply(generate_qid_graph_table, axis=1)
print('Apply to test...')
df_test.apply(generate_qid_graph_table, axis=1)


def pagerank():
    """Run 20 PageRank sweeps over the module-level ``qid_graph``.

    Every node starts at ``1 / len(qid_graph)``. During a sweep each node's
    score is replaced by ``(1 - d) / N + d * sum(score[n] / degree[n])`` over
    its neighbour list, with damping ``d = 0.85``. Scores are updated in
    place, so nodes visited later in a sweep already see the new values of
    nodes visited earlier.

    Returns a dict mapping each node key to its score.
    """
    sweeps = 20
    damping = 0.85

    # Initializing -- every node gets a uniform value!
    scores = {node: 1 / len(qid_graph) for node in qid_graph}
    node_count = len(scores)

    for _ in range(sweeps):
        for node, neighbours in qid_graph.items():
            incoming = sum(scores[nb] / len(qid_graph[nb]) for nb in neighbours)
            scores[node] = (1 - damping) / node_count + damping * incoming

    return scores

print('Main PR generator...')
# Maps question hash -> PageRank score; read by get_pagerank_value.
pagerank_dict = pagerank()

def get_pagerank_value(row):
    """Look up the PageRank scores of the two questions in ``row``.

    The questions are hashed with MD5 over their UTF-8 bytes, the same key
    used when the graph was built, and looked up in the module-level
    ``pagerank_dict``. Returns a Series with entries ``q1_pr`` and ``q2_pr``.
    """
    key_q1, key_q2 = [
        hashlib.md5(row[column].encode('utf-8')).hexdigest()
        for column in ("question1", "question2")
    ]
    scores = {
        "q1_pr": pagerank_dict[key_q1],
        "q2_pr": pagerank_dict[key_q2],
    }
    return pd.Series(scores)

print('Apply to train...')
# One row per training pair with columns q1_pr and q2_pr.
pagerank_feats_train = df_train.apply(get_pagerank_value, axis=1)
print('Writing train...')
pagerank_feats_train.to_csv("data/pagerank_train.csv", index=False)
# Drop the training frame and collect garbage before processing the test frame.
del df_train
gc.collect()
print('Apply to test...')
pagerank_feats_test = df_test.apply(get_pagerank_value, axis=1)
print('Writing test...')
pagerank_feats_test.to_csv("data/pagerank_test.csv", index=False)

### Add features of page Rank

In [None]:
# Copy the PageRank columns into the feature tables.
# NOTE(review): column assignment aligns on the index - assumes features_* and pagerank_feats_*
# have the same row order and index (TODO confirm). Both pairs of frames come from other cells.
features_test["q1_pr"]=pagerank_feats_test["q1_pr"]
features_test["q2_pr"]=pagerank_feats_test["q2_pr"]
features_train["q1_pr"]=pagerank_feats_train["q1_pr"]
features_train["q2_pr"]=pagerank_feats_train["q2_pr"]

### Question frequency

This code computes how frequently each question occurs, on the idea that more frequent questions are more likely to be duplicates.

In [None]:
# code from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain/comments

import numpy as np
import pandas as pd
import timeit
import sys

# NOTE(review): these read 'train.csv' / 'test.csv' from the working directory, while the earlier
# cells read from 'data/' - confirm both refer to the same files.
train_orig =pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"])
test_orig =  pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

tic0=timeit.default_timer()
# Four single-column frames holding every question text of both splits.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].copy()
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].copy()

# Give all four frames the same column name so they stack into one column.
df2.rename(columns = {'question2':'question1'},inplace=True)
df2_test.rename(columns = {'question2':'question1'},inplace=True)

# Stack in the same order as before (train q1, train q2, test q1, test q2).
# pd.concat replaces the chained DataFrame.append calls, which no longer exist in pandas 2.x.
train_questions = pd.concat([df1, df2, df1_test, df2_test])
train_questions.drop_duplicates(subset = ['question1'],inplace=True)

# After de-duplication the fresh 0..n-1 index serves as an integer id per distinct question.
train_questions.reset_index(inplace=True,drop=True)
questions_dict = pd.Series(train_questions.index.values,index=train_questions.question1.values).to_dict()

train_cp = train_orig.copy()
test_cp = test_orig.copy()
# NOTE(review): qid1/qid2 are dropped from the train copy only; the test copy keeps them,
# so the concatenated frame has these columns filled with NaN for train rows.
train_cp.drop(['qid1','qid2'],axis=1,inplace=True)

# Test rows are tagged with -1 so they can be separated from train rows after concatenation.
test_cp['is_duplicate'] = -1
# No effect with the column names used above (there is no 'test_id' column); kept for files that have one.
test_cp.rename(columns={'test_id':'id'},inplace=True)
comb = pd.concat([train_cp,test_cp])

comb['q1_hash'] = comb['question1'].map(questions_dict)
# print(comb['q1_hash'])
comb['q2_hash'] = comb['question2'].map(questions_dict)

# How often each question id appears in the question1 / question2 position.
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x,dict_to_apply):
    """Return ``dict_to_apply[x]``, or 0 when the lookup raises KeyError."""
    try:
        found = dict_to_apply[x]
    except KeyError:
        found = 0
    return found
#map to frequency space

# Frequency of a question = rows where it appears as question1 plus rows where it appears
# as question2, counted over the combined train + test frame.
comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))

# Split back into train and test using the is_duplicate tag (test rows carry -1).
train_comb = comb[comb['is_duplicate'] >= 0][['id','q1_hash','q2_hash','q1_freq','q2_freq','is_duplicate']]
test_comb = comb[comb['is_duplicate'] < 0][['id','q1_hash','q2_hash','q1_freq','q2_freq']]
corr_mat = train_comb.corr()
# Only the two frequency columns are written; the index is written as well (to_csv default).
train_comb.to_csv('./data/train_magic.csv', columns=['q1_freq', 'q2_freq'])
test_comb.to_csv('./data/test_magic.csv', columns=['q1_freq', 'q2_freq'])
print(corr_mat)
# NOTE(review): prints the repr of the whole test frame; test_comb.head() would be enough for a sanity check.
print(test_comb)

### Intersection of questions

In [None]:
import numpy as np 
import pandas as pd 
from collections import defaultdict

def q1_q2_intersect(row):
    """Count the entries that the two questions of ``row`` have in common in ``q_dict``.

    ``q_dict`` is a module-level mapping from a question to the collection of
    questions it has been paired with.
    """
    partners_q1 = set(q_dict[row['question1']])
    partners_q2 = set(q_dict[row['question2']])
    return len(partners_q1.intersection(partners_q2))

train_orig =pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"])
test_orig =  pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

# Every question pair of both splits, re-indexed 0..n-1.
ques = pd.concat([train_orig[['question1', 'question2']], test_orig[['question1', 'question2']]], axis=0).reset_index(drop='index')


# q_dict[q] is the set of questions that q has been paired with, in either position.
q_dict = defaultdict(set)
# zip walks both columns in row order, avoiding a per-row Series lookup.
for _left, _right in zip(ques.question1, ques.question2):
    q_dict[_left].add(_right)
    q_dict[_right].add(_left)

# raw=True is intentionally not used: q1_q2_intersect indexes the row by column label,
# which needs a Series, and current pandas passes a bare ndarray when raw=True.
train_orig['q1_q2_intersect'] = train_orig.apply(q1_q2_intersect, axis=1)
test_orig['q1_q2_intersect'] = test_orig.apply(q1_q2_intersect, axis=1)

train_feat = train_orig[['q1_q2_intersect']]
test_feat = test_orig[['q1_q2_intersect']]

# The index is written as well (to_csv default).
train_feat.to_csv('./data/train_magic_v2.csv')
test_feat.to_csv('./data/test_magic_v2.csv')

### K cores

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import dist_utils,split_data
import networkx as nx
import scipy.stats as sps

# Seed NumPy's global RNG. Nothing in this cell visibly draws random numbers.
seed = 1024
np.random.seed(seed)

# Note the label column is named "label" here, not "is_duplicate" as in earlier cells.
train = pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","label"])
test = pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

# Train rows first, then test rows; the split at the end of the cell relies on this order.
data_all = pd.concat([train, test])[['question1','question2']]

#dup index
# All question texts of both splits and both positions in one column.
q_all = pd.DataFrame(np.hstack([train['question1'], test['question1'],
                   train['question2'], test['question2']]), columns=['question'])
# One row per distinct question with its number of occurrences.
q_all = pd.DataFrame(q_all.question.value_counts()).reset_index()

# Presumably maps question text -> occurrence count; depends on the column order produced
# by reset_index() in the installed pandas version (TODO confirm).
q_num = dict(q_all.values)
# Assign a consecutive integer id to every key of q_num.
q_index = {}
for i,key in enumerate(q_num.keys()):
    q_index[key] = i
data_all['q1_index'] = data_all['question1'].map(q_index)
data_all['q2_index'] = data_all['question2'].map(q_index)


#link edges
# q_list[id] is the list of ids paired with `id`.
q_list = {}
dd = data_all[['q1_index','q2_index']].values
for i in tqdm(np.arange(data_all.shape[0])):
#for i in np.arange(dd.shape[0]):
    q1,q2=dd[i]
    # setdefault creates the list [q2] for a new key; the append is skipped only when the
    # stored list is exactly [q2]. Other repeats are appended again, which is harmless
    # because the lists are turned into sets below.
    if q_list.setdefault(q1,[q2])!=[q2]:
        q_list[q1].append(q2)
    if q_list.setdefault(q2,[q1])!=[q1]:
        q_list[q2].append(q1)


# Per pair: column 0 = number of shared neighbours, column 1 = neighbours of q1,
# column 2 = neighbours of q2. Rows skipped by `continue` stay at zero.
common_fea = np.zeros((data_all.shape[0],3))
for i in tqdm(np.arange(data_all.shape[0])):
    q1,q2 = dd[i]
    if (q1 not in q_list)|(q2 not in q_list):
        continue
    nei_q1 = set(q_list[q1])
    nei_q2 = set(q_list[q2])

    f_1 = len(nei_q1.intersection(nei_q2))
    common_fea[i][0] = f_1
    common_fea[i][1] = len(nei_q1)
    common_fea[i][2] = len(nei_q2)

# Split by position: the first train.shape[0] rows belong to train, the rest to test.
train_common = common_fea[:train.shape[0]]
test_common = common_fea[train.shape[0]:]

# pd.to_pickle(train_common,'data/train_neigh.pkl')
# pd.to_pickle(test_common,'data/test_neigh.pkl')
# NOTE(review): despite the names, core1/core2/core3 hold the neighbour counts computed above,
# not k-core numbers.
train_cores=pd.DataFrame(train_common, columns=['core1','core2','core3'])
test_cores=pd.DataFrame(test_common, columns=['core1','core2','core3'])

In [None]:
import networkx as nx
import pandas as pd
from tqdm import tqdm

df_train = pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","label"])
df_test = pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

dfs = (df_train, df_test)

# Lower-case both question columns in place and collect every question text as a graph node.
questions = []
for df in dfs:
    df['question1'] = df['question1'].str.lower()
    df['question2'] = df['question2'].str.lower()
    questions += df['question1'].tolist()
    questions += df['question2'].tolist()

graph = nx.Graph()
graph.add_nodes_from(questions)

# One undirected edge per question pair.
for df in [df_train, df_test]:
    edges = list(df[['question1', 'question2']].to_records(index=False))
    graph.add_edges_from(edges)

# Self loops are removed before the k-core computation. The module-level
# nx.selfloop_edges is used when the installed networkx provides it (the Graph method
# was removed in newer releases). The edges are materialised into a list first so the
# graph is not modified while it is being iterated.
if hasattr(nx, 'selfloop_edges'):
    self_loops = list(nx.selfloop_edges(graph))
else:
    self_loops = list(graph.selfloop_edges())
graph.remove_edges_from(self_loops)

# Every node starts at 1; the loop below raises the value for nodes in a k-core with k >= 2.
df = pd.DataFrame(data=list(graph.nodes()), columns=["question"])
df['kcores'] = 1

n_cores = 30
for k in tqdm(range(2, n_cores + 1)):
    ck = nx.k_core(graph, k=k).nodes()
    # .loc writes into df itself; the chained form df['kcores'][mask] = k may write to a
    # temporary copy and leave df unchanged.
    df.loc[df.question.isin(list(ck)), 'kcores'] = k

print(df['kcores'].value_counts())

df.to_csv("data/question_kcores.csv", index=None)

In [None]:
df_train = pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","label"])
df_test = pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])

dfs = (df_train, df_test)

for df in dfs:
    # Lower-case the questions so they match the lower-cased node names stored in the k-core file.
    df['question1'] = df['question1'].str.lower()
    df['question2'] = df['question2'].str.lower()
    
    # Re-read on every iteration because the column renames below modify q_kcores.
    # NOTE(review): the file is read as ISO-8859-1; confirm this matches the encoding it was
    # written with, otherwise non-ASCII questions will not match in the merges below.
    q_kcores = pd.read_csv('data/question_kcores.csv', encoding="ISO-8859-1")
    
    # Rename 'question' -> 'question1' so the left merge joins on question1
    # (no `on=` is given; presumably the shared column name is used as the key).
    q_kcores['question1'] = q_kcores['question']
    del q_kcores['question']
    df['q1_kcores'] = df.merge(q_kcores, 'left')['kcores']
    
    # Same again for question2.
    q_kcores['question2'] = q_kcores['question1']
    del q_kcores['question1']
    df['q2_kcores'] = df.merge(q_kcores, 'left')['kcores']
    
    # Ratio folded so that values above 1 are replaced by their reciprocal.
    df['q1_q2_kcores_ratio'] = (df['q1_kcores'] / df['q2_kcores']).apply(lambda x: x if x < 1. else 1./x)
    df['q1_q2_kcores_diff'] = (df['q1_kcores'] - df['q2_kcores']).apply(abs)
    # Absolute difference divided by the sum of the two k-core values.
    df['q1_q2_kcores_diff_normed'] = (df['q1_kcores'] - df['q2_kcores']).apply(abs) / (df['q1_kcores'] + df['q2_kcores'])

# dfs holds the same two frames that were modified in the loop; keep only the feature columns.
df_train, df_test = dfs
df_train = df_train[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']]
df_test = df_test[['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']]

### TF-IDF

In [None]:
import argparse
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier


def word_match_share(row, stops=None):
    """Share of non-stop words that the two questions have in common.

    Parameters
    ----------
    row : mapping with 'question1' and 'question2', each an iterable of tokens.
    stops : optional container of tokens to ignore. ``None`` (the default)
        means no token is ignored; previously the default raised TypeError.

    Returns
    -------
    ``2 * |shared| / (|q1| + |q2|)`` over the distinct non-stop tokens, or 0
    when either question has no non-stop token.
    """
    if stops is None:
        stops = ()
    q1words = {word for word in row['question1'] if word not in stops}
    q2words = {word for word in row['question2'] if word not in stops}
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared = q1words & q2words
    # Each shared word is counted once for each question, hence the factor 2.
    return (2 * len(shared)) / (len(q1words) + len(q2words))

def jaccard(row):
    """Jaccard similarity of the distinct tokens of the two questions in ``row``.

    Returns ``|q1 & q2| / |q1 | q2|``; when both questions are empty the
    denominator is taken as 1, so the result is 0.

    NOTE(review): this definition rebinds the name ``jaccard`` that the first
    cell imports from scipy.spatial.distance; the embedding-distance cells
    will pick up this function if they are re-run after this cell.
    """
    tokens_q1 = set(row['question1'])
    tokens_q2 = set(row['question2'])
    denominator = len(tokens_q1 | tokens_q2) or 1
    return len(tokens_q1 & tokens_q2) / denominator

def common_words(row):
    """Number of distinct tokens present in both questions of ``row``."""
    tokens_q1 = set(row['question1'])
    tokens_q2 = set(row['question2'])
    return len(tokens_q1 & tokens_q2)

def total_unique_words(row):
    """Number of distinct tokens appearing in either question of ``row``."""
    every_token = set(row['question1']) | set(row['question2'])
    return len(every_token)

def total_unq_words_stop(row, stops):
    """Number of distinct tokens in either question of ``row`` that are not in ``stops``."""
    every_token = set(row['question1']) | set(row['question2'])
    return sum(1 for token in every_token if token not in stops)

def wc_diff(row):
    """Absolute difference between the token counts of the two questions."""
    count_q1 = len(row['question1'])
    count_q2 = len(row['question2'])
    return abs(count_q1 - count_q2)

def wc_ratio(row):
    """Ratio of the smaller to the larger token count of the two questions.

    Returns NaN when question2 is empty, otherwise a value in [0, 1].

    The previous condition ``if l1 / l2:`` tested truthiness rather than
    comparing with 1, so ``l2 / l1`` was returned whenever question1 was
    non-empty and the result depended on the order of the two questions.
    """
    l1 = len(row['question1']) * 1.0
    l2 = len(row['question2'])
    if l2 == 0:
        return np.nan
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def wc_diff_unique(row):
    """Absolute difference between the numbers of distinct tokens of the two questions."""
    distinct_q1 = len(set(row['question1']))
    distinct_q2 = len(set(row['question2']))
    return abs(distinct_q1 - distinct_q2)

def wc_ratio_unique(row):
    """Ratio of the smaller to the larger distinct-token count of the two questions.

    Returns NaN when question2 has no token, otherwise a value in [0, 1].

    The previous condition ``if l1 / l2:`` tested truthiness rather than
    comparing with 1, so the result depended on the order of the questions.
    """
    l1 = len(set(row['question1'])) * 1.0
    l2 = len(set(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def wc_diff_unique_stop(row, stops=None):
    """Absolute difference between the distinct non-stop token counts of the two questions.

    ``stops`` is a container of tokens to ignore; ``None`` (the default) means
    no token is ignored. Previously the default raised TypeError.
    """
    if stops is None:
        stops = ()
    kept_q1 = sum(1 for token in set(row['question1']) if token not in stops)
    kept_q2 = sum(1 for token in set(row['question2']) if token not in stops)
    return abs(kept_q1 - kept_q2)

def wc_ratio_unique_stop(row, stops=None):
    """Ratio of the smaller to the larger distinct non-stop token count.

    ``stops`` is a container of tokens to ignore; ``None`` (the default) means
    no token is ignored. Returns NaN when question2 has no remaining token,
    otherwise a value in [0, 1].

    The previous condition ``if l1 / l2:`` tested truthiness rather than
    comparing with 1, so the result depended on the order of the questions.
    """
    if stops is None:
        stops = ()
    l1 = len([x for x in set(row['question1']) if x not in stops]) * 1.0
    l2 = len([x for x in set(row['question2']) if x not in stops])
    if l2 == 0:
        return np.nan
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def same_start_word(row):
    """Return 1 if both questions start with the same token, 0 if not, NaN if either is empty."""
    first, second = row['question1'], row['question2']
    if first and second:
        return int(first[0] == second[0])
    return np.nan

def char_diff(row):
    """Absolute difference between the total character counts of the tokens of the two questions."""
    chars_q1, chars_q2 = (
        len(''.join(row[column])) for column in ('question1', 'question2')
    )
    return abs(chars_q1 - chars_q2)

def char_ratio(row):
    """Ratio of the smaller to the larger total character count of the two questions.

    The character count of a question is the length of its tokens joined
    without separators. Returns NaN when question2 has no characters,
    otherwise a value in [0, 1].

    The previous condition ``if l1 / l2:`` tested truthiness rather than
    comparing with 1, so the result depended on the order of the questions.
    """
    l1 = len(''.join(row['question1'])) * 1.0
    l2 = len(''.join(row['question2']))
    if l2 == 0:
        return np.nan
    if l1 > l2:
        return l2 / l1
    return l1 / l2

def char_diff_unique_stop(row, stops=None):
    """Absolute difference in total characters of the distinct non-stop tokens.

    ``stops`` is a container of tokens to ignore; ``None`` (the default) means
    no token is ignored. Previously the default raised TypeError.
    """
    if stops is None:
        stops = ()
    chars_q1 = len(''.join([x for x in set(row['question1']) if x not in stops]))
    chars_q2 = len(''.join([x for x in set(row['question2']) if x not in stops]))
    return abs(chars_q1 - chars_q2)


def get_weight(count, eps=10000, min_count=2):
    """Inverse-frequency weight of a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0; every other word
    gets ``1 / (count + eps)``, where ``eps`` damps the weight of rare words.
    """
    return 0 if count < min_count else 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    """Weighted share of non-stop words that the two questions have in common.

    Parameters
    ----------
    row : mapping with 'question1' and 'question2', each an iterable of tokens.
    stops : optional container of tokens to ignore; ``None`` ignores nothing.
    weights : optional mapping token -> weight; tokens missing from it weigh 0.
        ``None`` is treated as an empty mapping.

    Returns
    -------
    Sum of the weights of the shared words (counted once per question)
    divided by the sum of the weights of all words of both questions;
    0 when either question has no non-stop word; NaN when the total weight
    is zero.
    """
    if stops is None:
        stops = ()
    if weights is None:
        weights = {}
    # dicts keep the distinct words in first-seen order, as the original code did.
    q1words = dict.fromkeys(word for word in row['question1'] if word not in stops)
    q2words = dict.fromkeys(word for word in row['question2'] if word not in stops)
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = [weights.get(w, 0) for w in q1words if w in q2words] + [weights.get(w, 0) for w in q2words if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    # A zero total weight yields NaN; errstate keeps that result but silences the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        R = np.sum(shared_weights) / np.sum(total_weights)
    return R

def tfidf_word_match_share(row, weights=None):
    """Weighted share of words that the two questions have in common.

    Parameters
    ----------
    row : mapping with 'question1' and 'question2', each an iterable of tokens.
    weights : optional mapping token -> weight; tokens missing from it weigh 0.
        ``None`` is treated as an empty mapping.

    Returns
    -------
    Sum of the weights of the shared words (counted once per question)
    divided by the sum of the weights of all words of both questions;
    0 when either question is empty; NaN when the total weight is zero.
    """
    if weights is None:
        weights = {}
    # dicts keep the distinct words in first-seen order, as the original code did.
    q1words = dict.fromkeys(row['question1'])
    q2words = dict.fromkeys(row['question2'])
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = [weights.get(w, 0) for w in q1words if w in q2words] + [weights.get(w, 0) for w in q2words if w in q1words]
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    # A zero total weight yields NaN; errstate keeps that result but silences the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        R = np.sum(shared_weights) / np.sum(total_weights)
    return R


def build_features(data, stops, weights):
    """Compute the word-overlap and length features for every question pair.

    Parameters
    ----------
    data : DataFrame whose 'question1' and 'question2' cells are token lists.
    stops : container of stop words passed to the *_stop feature functions.
    weights : mapping token -> weight passed to the tf-idf feature functions.

    Returns
    -------
    DataFrame with one column per feature, in the order listed below.

    Every feature function indexes its row by column label, so rows must be
    passed as Series. ``raw=True`` is therefore not used: current pandas
    hands a bare ndarray to the function when it is set.
    """
    feature_functions = [
        ('word_match', functools.partial(word_match_share, stops=stops)),  #1
        ('tfidf_wm', functools.partial(tfidf_word_match_share, weights=weights)),  #2
        ('tfidf_wm_stops', functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)),  #3
        ('jaccard', jaccard),  #4
        ('wc_diff', wc_diff),  #5
        ('wc_ratio', wc_ratio),  #6
        ('wc_diff_unique', wc_diff_unique),  #7
        ('wc_ratio_unique', wc_ratio_unique),  #8
        ('wc_diff_unq_stop', functools.partial(wc_diff_unique_stop, stops=stops)),  #9
        ('wc_ratio_unique_stop', functools.partial(wc_ratio_unique_stop, stops=stops)),  #10
        ('same_start', same_start_word),  #11
        ('char_diff', char_diff),  #12
        ('char_diff_unq_stop', functools.partial(char_diff_unique_stop, stops=stops)),  #13
        # 'common_words' (#14) is deliberately left out, as in the original feature list.
        ('total_unique_words', total_unique_words),  #15
        ('total_unq_words_stop', functools.partial(total_unq_words_stop, stops=stops)),  #16
        ('char_ratio', char_ratio),  #17
    ]

    X = pd.DataFrame()
    for column, feature in feature_functions:
        X[column] = data.apply(feature, axis=1)

    return X

In [None]:

# Training pairs, with missing values replaced by a single space.
df_train = pd.read_csv(
    'train.csv', sep=',',
    names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
).fillna(' ')

# Test pairs are read as they are (no fillna at this point).
df_test = pd.read_csv(
    'test.csv', sep=',',
    names=["id", "qid1", "qid2", "question1", "question2"],
)

# Every question pair of both splits, re-indexed 0..n-1.
ques = pd.concat(
    [df_train[['question1', 'question2']], df_test[['question1', 'question2']]],
    axis=0,
).reset_index(drop='index')

# q_dict[q] is the set of questions that q has been paired with, in either position.
q_dict = defaultdict(set)
for _left, _right in zip(ques.question1, ques.question2):
    q_dict[_left].add(_right)
    q_dict[_right].add(_left)

def q1_freq(row):
    """Number of entries recorded in ``q_dict`` for the first question of ``row``."""
    partners = q_dict[row['question1']]
    return len(partners)

def q2_freq(row):
    """Number of entries recorded in ``q_dict`` for the second question of ``row``."""
    partners = q_dict[row['question2']]
    return len(partners)

def q1_q2_intersect(row):
    """Count the entries that the two questions of ``row`` have in common in ``q_dict``."""
    partners_q1 = set(q_dict[row['question1']])
    partners_q2 = set(q_dict[row['question2']])
    return len(partners_q1.intersection(partners_q2))

# Graph-based ("leaky") features for both splits. raw=True is intentionally not used:
# the three functions index the row by column label, which needs a Series, and current
# pandas passes a bare ndarray when raw=True.
df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1)
df_train['q1_freq'] = df_train.apply(q1_freq, axis=1)
df_train['q2_freq'] = df_train.apply(q2_freq, axis=1)

df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1)
df_test['q1_freq'] = df_test.apply(q1_freq, axis=1)
df_test['q2_freq'] = df_test.apply(q2_freq, axis=1)

# Keep only the three new columns; the test frame itself is released here.
test_leaky = df_test.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]
del df_test

train_leaky = df_train.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]

# explore
stops = set(stopwords.words("english"))

# Replace each question string by its list of lower-cased whitespace tokens (in place).
df_train['question1'] = df_train['question1'].map(lambda x: str(x).lower().split())
df_train['question2'] = df_train['question2'].map(lambda x: str(x).lower().split())

# All training token lists, question1 first, then question2.
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

# Word weights are derived from the training questions only and reused for the test set below.
words = [x for y in train_qs for x in y]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

print('Building Features')
x_train = build_features(df_train, stops, weights)
# Append the graph-based columns computed earlier in this cell.
x_train = pd.concat((x_train, train_leaky), axis=1)

# The test frame is read again because the earlier copy was deleted after the leaky features were extracted.
df_test =pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])
df_test = df_test.fillna(' ')

df_test['question1'] = df_test['question1'].map(lambda x: str(x).lower().split())
df_test['question2'] = df_test['question2'].map(lambda x: str(x).lower().split())

x_test = build_features(df_test, stops, weights)
x_test = pd.concat((x_test, test_leaky), axis=1)

### naive features

In [None]:
import pandas as pd

def generate_naive_features(data):
    """Return a copy of ``data`` extended with simple length features.

    For each of question1 / question2 (prefix q1_ / q2_) the copy gains:
    character length with spaces, character length without spaces, number of
    pieces obtained by splitting on a single space, and number of '?'
    characters. Two absolute differences (word length and character length
    without spaces) are added last. Values are taken from ``str(cell)``, and
    the input frame is left untouched.
    """
    df = data.copy()

    measures = (
        ('char_length_with_space', lambda text: len(str(text))),  # with space
        ('char_length_without_space', lambda text: len(str(text).replace(' ', ''))),  # without space
        ('word_length', lambda text: len(str(text).split(' '))),
        ('question_mark', lambda text: str(text).count('?')),  # TODO: prob not good?
    )
    for prefix, column in (('q1', 'question1'), ('q2', 'question2')):
        for suffix, measure in measures:
            df[prefix + '_' + suffix] = df[column].apply(measure)

    df['word_length_diff'] = (df['q2_word_length'] - df['q1_word_length']).abs()
    df['char_length_diff'] = (
        df['q2_char_length_without_space'] - df['q1_char_length_without_space']
    ).abs()

    return df

train_data =pd.read_csv('train.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2","is_duplicate"])
train_data = train_data.fillna(' ')

test_data=  pd.read_csv('test.csv', sep=',',names = ["id", "qid1", "qid2", "question1","question2"])
# Fill missing values the same way as for the training frame. Without this a missing test
# question is measured as the string 'nan' while a missing training question is measured as ' '.
test_data = test_data.fillna(' ')

train_data = generate_naive_features(train_data)
test_data = generate_naive_features(test_data)

## More features

In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd
import xgboost as xgb
from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss

def add_word_count(x, df, word):
    """Add three 0/1 indicator columns for ``word`` to ``x`` (modified in place).

    ``q1_<word>`` and ``q2_<word>`` flag whether ``word`` occurs as a substring
    of the lower-cased question1 / question2 of ``df``; ``<word>_both`` is
    their product. Returns None.
    """
    def contains_word(text):
        return int(word in str(text).lower())

    q1_column = 'q1_' + word
    q2_column = 'q2_' + word
    x[q1_column] = df['question1'].apply(contains_word)
    x[q2_column] = df['question2'].apply(contains_word)
    x[word + '_both'] = x[q1_column] * x[q2_column]
    
def get_weight(count, eps=10000, min_count=2):
    """Return the inverse-frequency weight of a word seen ``count`` times.

    Words seen fewer than ``min_count`` times get weight 0; otherwise the
    weight is ``1 / (count + eps)``, where ``eps`` smooths the weight of rare
    words.
    """
    if count < min_count:
        return 0
    return 1 / (count + eps)

def word_shares(row):
    """Compute word-overlap statistics for one question pair.

    Reads the module-level ``stops`` (set of stop words) and ``weights``
    (word -> weight mapping); both must be defined before this is called.

    Args:
        row: mapping-like object with ``question1`` and ``question2`` entries
            (e.g. a DataFrame row); values are converted with ``str``.

    Returns:
        A ':'-joined string of eight numbers, in this order: weighted share
        of common words, count share of common words (Jaccard), number of
        shared non-stop words, stop/non-stop ratio of question 1, same ratio
        for question 2, shared-bigram ratio, weight-based cosine, and the
        fraction of positions holding the same token. If either question has
        no non-stop word the string is ``'0:0:0:0:0:0:0:0'``.
    """
    q1_list = str(row['question1']).lower().split()
    q1 = set(q1_list)
    q1words = q1.difference(stops)
    if len(q1words) == 0:
        return '0:0:0:0:0:0:0:0'

    q2_list = str(row['question2']).lower().split()
    q2 = set(q2_list)
    q2words = q2.difference(stops)
    if len(q2words) == 0:
        return '0:0:0:0:0:0:0:0'

    # zip stops at the shorter question; normalise by the longer one.
    words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0] == i[1]) / max(len(q1_list), len(q2_list))

    q1stops = q1.intersection(stops)
    q2stops = q2.intersection(stops)
    q1_2gram = set(zip(q1_list, q1_list[1:]))
    q2_2gram = set(zip(q2_list, q2_list[1:]))
    shared_2gram = q1_2gram.intersection(q2_2gram)
    shared_words = q1words.intersection(q2words)
    shared_weights = [weights.get(w, 0) for w in shared_words]
    q1_weights = [weights.get(w, 0) for w in q1words]
    q2_weights = [weights.get(w, 0) for w in q2words]
    # Total weight of BOTH questions (the previous code added q1 twice and
    # ignored q2 altogether).
    total_weights = q1_weights + q2_weights

    # NOTE: when every weight is 0 the two weighted ratios below evaluate to
    # nan through NumPy division; callers replace nan/inf afterwards.
    R1 = np.sum(shared_weights) / np.sum(total_weights)  # tfidf share
    R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words))  # count share
    R31 = len(q1stops) / len(q1words)  # stops in q1
    R32 = len(q2stops) / len(q2words)  # stops in q2
    Rcosine_denominator = (np.sqrt(np.dot(q1_weights, q1_weights)) * np.sqrt(np.dot(q2_weights, q2_weights)))
    Rcosine = np.dot(shared_weights, shared_weights) / Rcosine_denominator
    if len(q1_2gram) + len(q2_2gram) == 0:
        R2gram = 0
    else:
        R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
    return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)

df_train = pd.read_csv('train.csv', sep=',', names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"])
df_train = df_train.fillna(' ')

df_test = pd.read_csv('test.csv', sep=',', names=["id", "qid1", "qid2", "question1", "question2"])
df_test = df_test.fillna(' ')

# explore
stops = set(stopwords.words("english"))

# Tokenise into separate objects: the question columns of df_train must stay
# raw strings, otherwise every later feature that calls str() on them (word
# shares, lengths, capital counts, exact match) would be computed on the repr
# of a lower-cased token list for train rows only.
q1_tokens = df_train['question1'].map(lambda q: str(q).lower().split())
q2_tokens = df_train['question2'].map(lambda q: str(q).lower().split())

train_qs = pd.Series(q1_tokens.tolist() + q2_tokens.tolist())

# Corpus-wide word frequencies -> inverse-frequency weights read by word_shares.
words = [token for question in train_qs for token in question]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

# The original (overlapping) row labels are kept on purpose: the test slice
# taken from the resulting features keeps the labels of df_test.
df = pd.concat([df_train, df_test])
# word_shares indexes the row by column name, so rows must be passed as
# Series (raw=True would hand over plain ndarrays and the lookup would fail).
df['word_shares'] = df.apply(word_shares, axis=1)
x = pd.DataFrame()


def _share_component(position):
    """Return the ``position``-th ':'-separated value of ``word_shares`` as floats."""
    return df['word_shares'].apply(lambda packed: float(packed.split(':')[position]))


def _paired_feature(q1_name, q2_name, diff_name, measure):
    """Store ``measure`` of both questions in ``x`` plus their q1 - q2 difference."""
    x[q1_name] = df['question1'].apply(measure)
    x[q2_name] = df['question2'].apply(measure)
    x[diff_name] = x[q1_name] - x[q2_name]


# Unpack the eight values produced by word_shares, in their packed order.
x['word_match'] = _share_component(0)
x['word_match_2root'] = np.sqrt(x['word_match'])
_remaining_share_columns = [
    'tfidf_word_match', 'shared_count', 'stops1_ratio', 'stops2_ratio',
    'shared_2gram', 'cosine', 'words_hamming',
]
for _position, _column in enumerate(_remaining_share_columns, start=1):
    x[_column] = _share_component(_position)
x['diff_stops_r'] = x['stops1_ratio'] - x['stops2_ratio']

# Per-question measurements and their differences.
_paired_feature('len_q1', 'len_q2', 'diff_len',
                lambda q: len(str(q)))
_paired_feature('caps_count_q1', 'caps_count_q2', 'diff_caps',
                lambda q: sum(1 for ch in str(q) if ch.isupper()))
_paired_feature('len_char_q1', 'len_char_q2', 'diff_len_char',
                lambda q: len(str(q).replace(' ', '')))
_paired_feature('len_word_q1', 'len_word_q2', 'diff_len_word',
                lambda q: len(str(q).split()))

# Average word length = non-space characters per whitespace-separated word.
x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']

x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
# x['duplicated'] = df.duplicated(['question1','question2']).astype(int)
for _question_word in ('how', 'what', 'which', 'who', 'where', 'when', 'why'):
    add_word_count(x, df, _question_word)

print(x.columns)
print(x.describe())

# df is df_train stacked on df_test, so the first rows belong to train.
_n_train_rows = df_train.shape[0]
x_train_question = x[:_n_train_rows]
x_test_question = x[_n_train_rows:]

## Graph features (Seb)

In [None]:
import networkx as nx
import pandas as pd
import os
import numpy as np
from tqdm import tqdm


# Build the undirected question graph used by the graph features: every
# question id is a node, and every (qid1, qid2) pair of train.csv and test.csv
# is an edge.
train = pd.read_csv('train.csv', sep=',', names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"])
train = train.fillna(' ')

test = pd.read_csv('test.csv', sep=',', names=["id", "qid1", "qid2", "question1", "question2"])
test = test.fillna(' ')
# Only the two id columns are needed from here on.
train = train.drop(['id', 'question1', 'question2', 'is_duplicate'], axis=1)
test = test.drop(['id', 'question1', 'question2'], axis=1)
train_test = pd.concat([train, test], ignore_index=True)

# Create graph
G = nx.Graph()

# One edge per question pair, taken column-wise instead of looking every cell
# up by label inside an iterrows() loop.
edge_list = list(zip(train_test['qid1'].values, train_test['qid2'].values))

G.add_edges_from(edge_list)

print('Number of nodes:', G.number_of_nodes())
print('Number of edges:', G.number_of_edges())


def _graph_pair_features(graph, pairs):
    """Compute neighbourhood features for every (qid1, qid2) row of ``pairs``.

    Args:
        graph: networkx graph containing every question id of ``pairs``.
        pairs: DataFrame with ``qid1`` and ``qid2`` columns.

    Returns:
        A float DataFrame indexed like ``pairs`` with the columns
        ``q1_neigh``, ``q2_neigh`` (number of neighbours of each question),
        ``common_neigh``, ``distinct_neigh`` (sum of both neighbour counts
        minus the common ones) and ``clique_size`` (largest clique that
        contains qid1).
    """
    clique_size_by_node = {}  # the same question occurs in many pairs
    records = []
    for q1, q2 in tqdm(zip(pairs['qid1'].values, pairs['qid2'].values), total=len(pairs)):
        # len(graph[node]) is the neighbour count on every NetworkX version;
        # Graph.neighbors() returns an iterator since 2.0, which has no len().
        n_neigh_1 = len(graph[q1])
        n_neigh_2 = len(graph[q2])
        n_common = len(list(nx.common_neighbors(graph, q1, q2)))

        if q1 not in clique_size_by_node:
            clique_size_by_node[q1] = nx.node_clique_number(graph, q1)

        records.append((
            n_neigh_1,
            n_neigh_2,
            n_common,
            n_neigh_1 + n_neigh_2 - n_common,
            clique_size_by_node[q1],
        ))
        # all_simple_paths / number_of_cliques were tried too: TOO LONG !

    return pd.DataFrame(
        records,
        columns=['q1_neigh', 'q2_neigh', 'common_neigh', 'distinct_neigh', 'clique_size'],
        index=pairs.index,
        dtype=float,
    )


# Computing train features
print('Computing train features')
# The id columns are replaced by the five graph features.
train = _graph_pair_features(G, train)

# print('Writing train features...')
# 	train.to_csv(os.path.join(path,'train_graph_feat.csv'))

print('Computing test features')
test = _graph_pair_features(G, test)

#     print('Writing test features...')	    
# 	test.to_csv(os.path.join(path,'test_graph_feat.csv'))

# 	print('CSV written ! see: ', path, " | suffix: ", "_graph_feat.csv")

### N-gram features (Seb)

In [None]:
# Bigram / trigram features are read from CSV files prepared beforehand.
_NGRAM_CSV_OPTIONS = {'sep': ',', 'encoding': 'latin-1'}

train_2gram_feat = pd.read_csv('data/train_2gram_feat.csv', **_NGRAM_CSV_OPTIONS)
test_2gram_feat = pd.read_csv('data/test_2gram_feat.csv', **_NGRAM_CSV_OPTIONS)

train_3gram_feat = pd.read_csv('data/train_3gram_feat.csv', **_NGRAM_CSV_OPTIONS)
test_3gram_feat = pd.read_csv('data/test_3gram_feat.csv', **_NGRAM_CSV_OPTIONS)

## ML Algorithm

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
import matplotlib.pyplot as plt

### Load data

In [None]:
# Base feature tables; the raw text (and, for test, the id columns) is dropped
# right after loading.
features_train = (
    pd.read_csv('data/train_features_glove.csv', sep=',', encoding='latin-1')
    .drop(['question1', 'question2'], axis=1)
)
features_test = (
    pd.read_csv('data/test_features_glove.csv', sep=',', encoding='latin-1')
    .drop(['id', 'qid1', 'qid2', 'question1', 'question2'], axis=1)
)

# Target vector, read from the raw training file.
data_train = pd.read_csv(
    'train.csv',
    sep=',',
    names=["id", "qid1", "qid2", "question1", "question2", "is_duplicate"],
)
Y_train = data_train["is_duplicate"].values
# Copy the externally computed features into the train / test feature tables.
# Each group goes through one loop that handles train and test with the same
# column name, so a column can no longer be filled from the wrong source
# (q2_hash of the training set used to be copied from q1_hash).

# PageRank of both questions.
for _col in ("q1_pr", "q2_pr"):
    features_test[_col] = pagerank_feats_test[_col]
    features_train[_col] = pagerank_feats_train[_col]

# Question hashes and frequencies.
for _col in ("q1_hash", "q2_hash", "q1_freq", "q2_freq"):
    features_test[_col] = test_comb[_col]
    features_train[_col] = train_comb[_col]

features_train['q1_q2_intersect'] = train_feat['q1_q2_intersect']
features_test['q1_q2_intersect'] = test_feat['q1_q2_intersect']

for _col in ("core1", "core2", "core3"):
    features_train[_col] = train_cores[_col]
    features_test[_col] = test_cores[_col]

# k-core features: the whole frame is assigned to these five columns at once.
_kcore_columns = ['q1_kcores', 'q2_kcores', 'q1_q2_kcores_ratio', 'q1_q2_kcores_diff', 'q1_q2_kcores_diff_normed']
features_train[_kcore_columns] = df_train_core
features_test[_kcore_columns] = df_test_core
# Word-match / word-count features taken from x_train and x_test.
_word_feature_columns = [
    'word_match', 'tfidf_wm', 'tfidf_wm_stops', 'jaccard',
    'wc_diff', 'wc_ratio', 'wc_diff_unique', 'wc_ratio_unique',
    'wc_diff_unq_stop', 'wc_ratio_unique_stop', 'same_start',
    'char_diff', 'char_diff_unq_stop', 'total_unique_words',
    'total_unq_words_stop', 'char_ratio',
]
features_train[_word_feature_columns] = x_train[_word_feature_columns]
features_test[_word_feature_columns] = x_test[_word_feature_columns]

# Graph neighbourhood features: the `train` / `test` frames are assigned whole.
_neighbour_columns = ['q1_neigh', 'q2_neigh', 'common_neigh', 'distinct_neigh', 'clique_size']
features_train[_neighbour_columns] = train
features_test[_neighbour_columns] = test

In [None]:
# Question-word indicators (q1_<w>, q2_<w>, <w>_both for each word) followed by
# the capital-letter counts and the exact-match flag.
_question_words = ['how', 'what', 'which', 'who', 'where', 'when', 'why']
_question_columns = [
    name
    for w in _question_words
    for name in ('q1_' + w, 'q2_' + w, w + '_both')
]
_question_columns += ['caps_count_q1', 'caps_count_q2', 'diff_caps', 'exactly_same']

features_train[_question_columns] = x_train_question[_question_columns]
features_test[_question_columns] = x_test_question[_question_columns]

In [None]:
# Bigram features.
_bigram_columns = ['bigram_coocurence', 'bigram_distinct', 'bigram_nostpwrd_coocurence', 'bigram_nostpwrd_distinct']
features_train[_bigram_columns] = train_2gram_feat[_bigram_columns]
features_test[_bigram_columns] = test_2gram_feat[_bigram_columns]

# Trigram features.
_trigram_columns = ['3gram_cooccurence', '3gram_distinct', '3gram_nostpwrd_cooccurence', '3gram_nostpwrd_distinct']
features_train[_trigram_columns] = train_3gram_feat[_trigram_columns]
features_test[_trigram_columns] = test_3gram_feat[_trigram_columns]

In [None]:
# Replace infinities and missing values by 0 in both design matrices
# (infinities are first turned into NaN, then every NaN is filled).
X_train = features_train.replace([np.inf, -np.inf], np.nan).fillna(value=0)
X_test = features_test.replace([np.inf, -np.inf], np.nan).fillna(value=0)

### Feature selection

In [23]:
#Choose all predictors except target & IDcols
# Choose all predictors except target & IDcols
predictors = [x for x in X_train.columns if x not in ['is_duplicate']]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

cv_folds=5
early_stopping_rounds=50
# modelfit(xgb1, X_train, predictors)
alg=xgb1
dtrain=X_train.copy()

# Pick the number of boosting rounds with cross-validation and early stopping:
# xgb.cv returns one row per round that was kept.
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['is_duplicate'].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
    metrics='auc', early_stopping_rounds=early_stopping_rounds)
alg.set_params(n_estimators=cvresult.shape[0])

#Fit the algorithm on the data
alg.fit(dtrain[predictors], dtrain['is_duplicate'],eval_metric='auc')

#Predict training set:
dtrain_predictions = alg.predict(dtrain[predictors])
dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

#Print model report (scores are measured on the training data itself):
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain['is_duplicate'].values, dtrain_predictions))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['is_duplicate'], dtrain_predprob))

# Old xgboost exposes the fitted Booster through .booster(); newer releases
# renamed it to .get_booster() and turned `booster` into a plain parameter.
fitted_booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
feat_imp = pd.Series(fitted_booster.get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()

In [24]:
# Print the feature-importance scores (sorted in descending order when built).
print(feat_imp)

In [None]:
# Columns removed from the training design matrix.
_train_columns_to_drop = [
    'question1', 'question2', 'is_duplicate',
    'cosine_distance', 'jaccard_distance', 'euclidean_distance',
    'norm_wmd', 'fuzz_WRatio', 'len_word_q2', 'len_word_q1',
    'minkowski_distance', 'braycurtis_distance',
]
X_train = features_train.drop(_train_columns_to_drop, axis=1)

In [None]:
# Columns removed from the test design matrix.
_test_columns_to_drop = [
    'qid1', 'id', 'qid2', 'question1', 'question2',
    'cosine_distance', 'jaccard_distance', 'euclidean_distance',
    'norm_wmd', 'fuzz_WRatio', 'len_word_q2', 'len_word_q1',
    'minkowski_distance', 'braycurtis_distance',
]
X_test = features_test.drop(_test_columns_to_drop, axis=1)

In [None]:
# Fit the XGBoost classifier on the current X_train / Y_train.
xgb1.fit(X_train,Y_train)

In [None]:
# Class probabilities for the test pairs; column 1 is written as the score.
y_pred1 = xgb1.predict_proba(X_test1)
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    # One "<row number>,<probability>" line per test pair.
    f.writelines(
        str(row_id) + ',' + str(probabilities[1]) + '\n'
        for row_id, probabilities in enumerate(y_pred1)
    )

## Cross-validation

#### LightGBM

In [None]:
from sklearn.model_selection import StratifiedKFold
# Number of cross-validation folds.
NUM_FOLDS = 5
# Seed reused for NumPy, the fold splitter and the LightGBM sampling seeds.
RANDOM_SEED = 2017
np.random.seed(RANDOM_SEED)
import lightgbm as lgb

In [None]:
# Shuffled, class-stratified splitter; seeded so the folds are reproducible.
kfold = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# One column of test predictions per fold, and the per-fold validation losses.
n_test_rows = len(X_test1)
y_test_pred = np.zeros((n_test_rows, NUM_FOLDS))
cv_scores = list()

In [None]:
# Plain NumPy matrices for LightGBM.
# NOTE: from here on X_train / X_test are ndarrays (no column names), and the
# name `model` is rebound to the LightGBM booster of the latest fold.
X_train=X_train1.values
X_test=X_test1.values
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, Y_train)):
    # The message used to be a plain string with f-string placeholders, so the
    # braces were printed literally instead of the fold numbers.
    print('Fitting fold {} of {}'.format(fold_num + 1, kfold.n_splits))

    print(len(ix_train))
    print(X_train.shape)
    X_fold_train = X_train[ix_train,:]
    X_fold_val = X_train[ix_val,:]

    y_fold_train = Y_train[ix_train]
    y_fold_val = Y_train[ix_val]

    # Built inside the loop so that every fold starts from a fresh dict.
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'feature_fraction': 0.486,
        'num_leaves': 158,
        'lambda_l2': 50,
        'learning_rate': 0.01,
        'num_boost_round': 5000,
        'early_stopping_rounds': 10,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }

    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
    lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)
    evals_result = {}

    model = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_val],
        evals_result=evals_result,
        num_boost_round=lgb_params['num_boost_round'],
        early_stopping_rounds=lgb_params['early_stopping_rounds'],
        verbose_eval=False,
    )

    # Loss history of the two validation sets passed above, in that order.
    fold_train_scores = evals_result['training'][lgb_params['metric']]
    fold_val_scores = evals_result['valid_1'][lgb_params['metric']]

    print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
        fold_num + 1,
        len(fold_train_scores),
        fold_train_scores[-1],
        fold_val_scores[-1],
    ))
    print()

    # Keep the last recorded validation loss and this fold's test predictions.
    cv_scores.append(fold_val_scores[-1])
    y_test_pred[:, fold_num] = model.predict(X_test).reshape(-1)

In [None]:
# Feature importance of the booster trained on the last fold.
# Column names come from the X_train1 DataFrame: X_train was replaced by its
# bare .values array for the cross-validation loop and has no .columns.
pd.DataFrame({
    'column': list(X_train1.columns),
    'importance': model.feature_importance(),
}).sort_values(by='importance')

In [None]:
# final_cv_score was printed without ever being computed; use the mean of the
# per-fold validation losses collected in cv_scores.
final_cv_score = np.mean(cv_scores)
print('Final CV score:', final_cv_score)

In [None]:
# Average the per-fold predictions and write them as "<row number>,<score>".
y_test = y_test_pred.mean(axis=1)
with open("submission_file.csv", 'w') as f:
    f.write("Id,Score\n")
    f.writelines(
        str(row_id) + ',' + str(score) + '\n'
        for row_id, score in enumerate(y_test)
    )

#### XGBoost

In [None]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
import xgboost as xgb

def _rebalance(features, labels):
    """Resample one split so that positives become rarer.

    The negative rows are included twice and only the first 80% of the
    positive rows are kept.

    Args:
        features: DataFrame of feature rows.
        labels: array of 0/1 labels, positionally aligned with ``features``.

    Returns:
        ``(resampled_features, resampled_labels)`` ordered as negatives, kept
        positives, negatives again.
    """
    positives = features[labels == 1]
    negatives = features[labels == 0]
    kept_positives = positives.iloc[:int(0.8 * len(positives))]
    resampled_features = pd.concat((negatives, kept_positives, negatives))
    resampled_labels = np.array(
        [0] * negatives.shape[0] + [1] * kept_positives.shape[0] + [0] * negatives.shape[0])
    return resampled_features, resampled_labels


X_train, X_valid, y_train, y_valid = train_test_split(X_train1, Y_train, test_size=0.1, random_state=4242)

# UPDownSampling (the same resampling is applied to both splits)
X_train, y_train = _rebalance(X_train, y_train)
print(np.mean(y_train))

X_valid, y_valid = _rebalance(X_valid, y_valid)
print(np.mean(y_valid))

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 7
params['subsample'] = 0.6
params['base_score'] = 0.2
# params['scale_pos_weight'] = 0.2

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 2500 rounds; stop once the 'valid' loss has not improved for 50 rounds.
bst = xgb.train(params, d_train, 2500, watchlist, early_stopping_rounds=50, verbose_eval=50)
print(log_loss(y_valid, bst.predict(d_valid)))
# Build the test matrix from X_test1, the counterpart of the X_train1 frame
# used for training: X_test is rebound to a bare ndarray by the LightGBM cell,
# so it no longer carries the column names of the training matrix.
d_test = xgb.DMatrix(X_test1)
p_test = bst.predict(d_test)

In [1]:
import spacy

ImportError: No module named 'ujson'