## Imports

In [1]:
import os

import numpy as np
import pandas as pd

## Config

In [2]:
# Identifier embedded in the file names of the feature artifacts saved by this notebook.
feature_list_id = 'magic_jturkewitz'

In [3]:
def _with_trailing_sep(path):
    """Return `path` with the platform path separator appended."""
    return path + os.path.sep


# Root data directory: the `data` folder next to the current working directory's parent.
data_folder = _with_trailing_sep(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data'))
)

# Sub-folders of the data directory; every path ends with a separator so that
# file names can be appended by plain string concatenation.
aux_data_folder = _with_trailing_sep(os.path.join(data_folder, 'aux'))
preproc_data_folder = _with_trailing_sep(os.path.join(data_folder, 'preproc'))
features_data_folder = _with_trailing_sep(os.path.join(data_folder, 'features'))

## Read Data

In [4]:
# Load the raw question pairs; missing values become empty strings.
train_csv_path = data_folder + 'train.csv'
test_csv_path = data_folder + 'test.csv'

df_questions_train = pd.read_csv(train_csv_path).fillna('')
df_questions_test = pd.read_csv(test_csv_path).fillna('')

## Build Features

In [5]:
# Aliases (not copies) of the loaded frames, under the names used by the feature code below.
train_orig, test_orig = df_questions_train, df_questions_test

In [6]:
# Gather every question text (both columns, train and test) into one
# single-column frame so that each distinct question can get an integer id.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].rename(columns={'question2': 'question1'})
df1_test = test_orig[['question1']].copy()
df2_test = test_orig[['question2']].rename(columns={'question2': 'question1'})

# pd.concat replaces the chained DataFrame.append calls: DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0. Row order is the same
# (train q1, train q2, test q1, test q2), so the ids assigned below are too.
train_questions = (
    pd.concat([df1, df2, df1_test, df2_test])
    .drop_duplicates(subset=['question1'])
    .reset_index(drop=True)
)

# Question text -> its row position in the de-duplicated list (used as its id).
questions_dict = pd.Series(
    train_questions.index.values,
    index=train_questions.question1.values,
).to_dict()

# Bring train and test to the same columns so they can be stacked.
# Test rows get the sentinel is_duplicate = -1.
train_cp = train_orig.drop(['qid1', 'qid2'], axis=1)
test_cp = test_orig.copy()
test_cp['is_duplicate'] = -1
test_cp = test_cp.rename(columns={'test_id': 'id'})
comb = pd.concat([train_cp, test_cp])

# Replace each question text by its integer id.
comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

# Id -> number of rows in which it occurs in the question1 / question2 column.
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x, dict_to_apply):
    """Look up `x` in `dict_to_apply`, treating a missing key as a count of 0.

    Args:
        x: Key to look up.
        dict_to_apply: Dictionary mapping keys to values.

    Returns:
        The value stored under `x`, or 0 when `x` is not a key.
    """
    # dict.get expresses "value or default" directly, without using an
    # exception for control flow.
    return dict_to_apply.get(x, 0)

# Frequency space: a question's frequency is the number of rows where it occurs
# as question1 plus the number of rows where it occurs as question2.
def _total_occurrences(question_hash):
    """Sum the question1-side and question2-side counts for one question id."""
    return try_apply_dict(question_hash, q1_vc) + try_apply_dict(question_hash, q2_vc)

comb['q1_freq'] = comb['q1_hash'].map(_total_occurrences)
comb['q2_freq'] = comb['q2_hash'].map(_total_occurrences)

# Ratio of the two frequencies, and its natural logarithm.
freq_quotient = comb['q1_freq'] / comb['q2_freq']
comb['freq_ratio'] = freq_quotient
comb['freq_ratio_log'] = np.log(freq_quotient)

# Replace the raw question ids by one of 100 equal-width bins labelled 0..99.
hash_bin_labels = range(100)
comb['q1_hash'] = pd.cut(comb['q1_hash'], bins=100, labels=hash_bin_labels)
comb['q2_hash'] = pd.cut(comb['q2_hash'], bins=100, labels=hash_bin_labels)

# Columns exported as features; the commented-out ones are computed above but not exported.
column_names = [
    'q1_freq',
    'q2_freq',
    'freq_ratio',
#     'freq_ratio_log',
#     'q1_hash',
#     'q2_hash',
]

# Split the combined frame back apart: test rows carry is_duplicate = -1.
is_train_row = comb['is_duplicate'] >= 0
is_test_row = comb['is_duplicate'] < 0
train_comb = comb.loc[is_train_row, column_names]
test_comb = comb.loc[is_test_row, column_names]

In [7]:
# Preview the first rows only instead of rendering the whole combined train+test frame.
comb.head(10)

Unnamed: 0,id,question1,question2,is_duplicate,q1_hash,q2_hash,q1_freq,q2_freq,freq_ratio,freq_ratio_log
0,0,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0,0,6,1,2,0.500000,-0.693147
1,1,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0,0,0,8,3,2.666667,0.980829
2,2,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0,0,6,2,1,2.000000,0.693147
3,3,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0,0,6,1,1,1.000000,0.000000
4,4,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0,0,6,3,1,3.000000,1.098612
5,5,Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?,"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?",1,0,6,1,1,1.000000,0.000000
6,6,Should I buy tiago?,What keeps childern active and far from phone and video games?,0,0,6,1,1,1.000000,0.000000
7,7,How can I be a good geologist?,What should I do to be a great geologist?,1,0,6,1,1,1.000000,0.000000
8,8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,0,5,2,3,0.666667,-0.405465
9,9,Motorola (company): Can I hack my Charter Motorolla DCX3400?,How do I hack Motorola DCX3400 for free internet?,0,0,6,1,1,1.000000,0.000000


## Save feature names

In [8]:
# Names under which the exported features are saved: one per entry of
# `column_names`, in the same order, with a `magic_jt_` prefix.
# The commented-out names correspond to the columns that are not exported.
feature_names = [
    'magic_jt_q1_freq',
    'magic_jt_q2_freq',
    'magic_jt_freq_ratio',
#     'magic_jt_freq_ratio_log',
#     'magic_jt_q1_hash_bin',
#     'magic_jt_q2_hash_bin',
]

In [9]:
# Write the feature names to the features folder.
feature_names_path = features_data_folder + f'X_train_{feature_list_id}.names'
save_lines(feature_names, feature_names_path)

## Save features

In [10]:
# Train-row features as a float64 array, columns ordered as in `column_names`.
train_feature_values = train_comb.values
X_train = train_feature_values.astype('float64')

In [11]:
# Persist the train feature matrix to the features folder.
train_features_path = features_data_folder + f'X_train_{feature_list_id}.pickle'
save(X_train, train_features_path)

In [12]:
# Test-row features as a float64 array, columns ordered as in `column_names`.
test_feature_values = test_comb.values
X_test = test_feature_values.astype('float64')

In [13]:
# Persist the test feature matrix to the features folder.
test_features_path = features_data_folder + f'X_test_{feature_list_id}.pickle'
save(X_test, test_features_path)