## Imports

In [None]:
"""
Feature based on Tien-Dung Le's post:
https://www.kaggle.com/c/quora-question-pairs/discussion/33371

This notebook takes a slightly different approach:
instead of building an id-based graph/dataframe, we build it from the lowercased question text.

"""
import networkx as nx
import pandas as pd
from tqdm import tqdm

## Read data

In [None]:
# Identifier of this feature list; it is passed to save_feature_names and
# save_feature_list in the saving cells at the end of the notebook.
feature_list_id = 'magic_kcore_2'

In [None]:
# Load the train and test CSVs from data_folder (defined outside this cell).
# dfs groups both frames so the feature-building cell can loop over them.
df_train, df_test = (
    pd.read_csv(data_folder + file_name)
    for file_name in ('train.csv', 'test.csv')
)
dfs = (df_train, df_test)

## Build features

In [None]:
# Lowercase both question columns in place (the mapping cells further down
# look questions up by this lowercased text) and collect every question in
# train-then-test, question1-then-question2 order so node order is stable.
# NOTE(review): missing questions stay NaN after .str.lower() -- confirm that
# this is the intended treatment.
questions = []
for frame in dfs:
    for column in ('question1', 'question2'):
        frame[column] = frame[column].str.lower()
        questions += frame[column].tolist()

# One node per distinct question text; an edge links the two questions of
# every train/test pair.
graph = nx.Graph()
graph.add_nodes_from(questions)
for frame in dfs:
    graph.add_edges_from(zip(frame['question1'], frame['question2']))

# Core computations reject graphs with self-loops (pairs whose two questions
# are identical after lowercasing), so drop them first.  The module-level
# nx.selfloop_edges is used because newer networkx releases no longer provide
# the Graph.selfloop_edges() method; the edges are materialised into a list
# so the graph is not mutated while its adjacency is being iterated.
graph.remove_edges_from(list(nx.selfloop_edges(graph)))

# Highest k that is reported; deeper cores are capped at this value.
n_cores = 30

# A node belongs to the k-core exactly when its core number is >= k, so a
# single core_number() pass replaces one k_core() call per k.  Clamping to
# [1, n_cores] reproduces "default 1, then overwrite for k = 2..n_cores".
core_number = nx.core_number(graph)
nodes = list(graph.nodes())

df = pd.DataFrame(data=nodes, columns=["question"])
# Assign the whole column at once rather than through chained indexing
# (df['kcores'][mask] = k), which does not write through under pandas
# copy-on-write.
df['kcores'] = pd.Series(
    [min(max(core_number[node], 1), n_cores) for node in nodes],
    index=df.index,
    dtype='int64',
)

print(df['kcores'].value_counts())

In [None]:
# Preview the first rows of the per-question table (columns: question, kcores).
df.head()

In [None]:
# Lookup table: question text -> k-core value from the per-question table.
kcore_dict = {
    question: kcore
    for question, kcore in zip(df['question'], df['kcores'])
}

In [None]:
# Attach the k-core value of each training question as a new column.
for target, source in (('q1_kcore', 'question1'), ('q2_kcore', 'question2')):
    df_train[target] = df_train[source].map(kcore_dict)

In [None]:
# Attach the k-core value of each test question as a new column.
for target, source in (('q1_kcore', 'question1'), ('q2_kcore', 'question2')):
    df_test[target] = df_test[source].map(kcore_dict)

## Save feature names

In [None]:
# Data-frame columns that are exported as features, in output order.
columns_to_keep = ['q1_kcore', 'q2_kcore']

In [None]:
# Names under which the exported columns are published, in the same order
# as columns_to_keep.
feature_names = ['magic_kcore_q1', 'magic_kcore_q2']

In [None]:
# Pass the feature names and the feature-list id to save_feature_names.
# NOTE(review): the helper is defined outside this view -- confirm what it
# persists and where.
save_feature_names(feature_names, feature_list_id)

## Save features

In [None]:
# Hand the selected k-core columns of the training frame (as an array via
# .values) to save_feature_list, tagged 'train'.
# NOTE(review): the helper is defined outside this view -- confirm its output.
save_feature_list(df_train[columns_to_keep].values, 'train', feature_list_id)

In [None]:
# Hand the selected k-core columns of the test frame (as an array via
# .values) to save_feature_list, tagged 'test'.
# NOTE(review): the helper is defined outside this view -- confirm its output.
save_feature_list(df_test[columns_to_keep].values, 'test', feature_list_id)