In [1]:
import pandas as pd
import numpy as np
import sklearn
from tqdm import tqdm
import time

Data preprocessing for fast I/O

In [None]:
# Load the integer-encoded categorical training table as float64: NaN (missing
# value) cannot be stored in an integer column. np.float64 is used instead of
# the np.float alias, which was removed in NumPy 1.24.
start_time = time.time()
train_categorical = pd.read_csv('train_categorical_int.csv', dtype=np.float64)
print('Load train_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 4.6 minutes

# Replace missing values with the -999 sentinel, then downcast to int32 to save memory.
train_categorical = train_categorical.fillna(-999).astype(np.int32)
print('Process train_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 6.25 minutes

# Re-save as HDF5 so later loads are fast; `key` is passed by keyword because
# positional use is deprecated in recent pandas.
train_categorical.to_hdf('train_categorical_int.h5', key='table')
print('Resave train_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 8.19 minutes

In [None]:
# Load the integer-encoded categorical test table as float64: NaN (missing
# value) cannot be stored in an integer column. np.float64 is used instead of
# the np.float alias, which was removed in NumPy 1.24.
start_time = time.time()
test_categorical = pd.read_csv('test_categorical_int.csv', dtype=np.float64)
print('Load test_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Replace missing values with the -999 sentinel, then downcast to int32 to save memory.
test_categorical = test_categorical.fillna(-999).astype(np.int32)
print('Process test_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Re-save as HDF5 so later loads are fast; `key` is passed by keyword because
# positional use is deprecated in recent pandas.
test_categorical.to_hdf('test_categorical_int.h5', key='table')
print('Resave test_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

In [None]:
# Load the numeric training table (features plus the 'Response' column) as
# float64. np.float64 replaces the np.float alias removed in NumPy 1.24.
start_time = time.time()
train_numeric = pd.read_csv('train_numeric.csv', dtype=np.float64)
print('Load train_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 1.8 minutes

# 'Id' and 'Response' are cast back to int32; the feature columns stay float
# so that missing values remain NaN.
train_numeric['Id'] = train_numeric['Id'].astype(np.int32)
train_numeric['Response'] = train_numeric['Response'].astype(np.int32)
print('Process train_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Re-save as lightly compressed HDF5 (key passed by keyword; positional use is
# deprecated in recent pandas).
train_numeric.to_hdf('train_numeric.h5', key='table', complevel=1)
print('Resave train_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

In [None]:
# Load the numeric test table as float64. np.float64 replaces the np.float
# alias removed in NumPy 1.24.
start_time = time.time()
test_numeric = pd.read_csv('test_numeric.csv', dtype=np.float64)
print('Load test_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 1.8 minutes

# Only 'Id' is cast to int32 here; unlike the training cell, no 'Response'
# column is converted for the test table.
test_numeric['Id'] = test_numeric['Id'].astype(np.int32)
print('Process test_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Re-save as lightly compressed HDF5 (key passed by keyword; positional use is
# deprecated in recent pandas).
test_numeric.to_hdf('test_numeric.h5', key='table', complevel=1)
print('Resave test_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

In [None]:
# Load the training date table as float64. np.float64 replaces the np.float
# alias removed in NumPy 1.24.
start_time = time.time()
train_date = pd.read_csv('train_date.csv', dtype=np.float64)
print('Load train_date time: {} minutes'.format(round((time.time() - start_time)/60, 2))) 

# Cast this table's own 'Id' column. The previous version copied 'Id' from
# train_numeric, which only worked if that frame was still in memory and its
# rows happened to be in the same order as train_date.
train_date['Id'] = train_date['Id'].astype(np.int32)
print('Process train_date time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Re-save as lightly compressed HDF5 (key passed by keyword; positional use is
# deprecated in recent pandas).
train_date.to_hdf('train_date.h5', key='table', complevel=1)
print('Resave train_date time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

In [None]:
# Load the test date table as float64. np.float64 replaces the np.float alias
# removed in NumPy 1.24.
start_time = time.time()
test_date = pd.read_csv('test_date.csv', dtype=np.float64)
print('Load test_date time: {} minutes'.format(round((time.time() - start_time)/60, 2))) 

# Cast this table's own 'Id' column. The previous version copied 'Id' from
# test_numeric, which only worked if that frame was still in memory and its
# rows happened to be in the same order as test_date.
test_date['Id'] = test_date['Id'].astype(np.int32)
print('Process test_date time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Re-save as lightly compressed HDF5 (key passed by keyword; positional use is
# deprecated in recent pandas).
test_date.to_hdf('test_date.h5', key='table', complevel=1)
print('Resave test_date time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

Load data and do some basic analysis

In [None]:
# Reload the three preprocessed training tables from their HDF5 caches and
# report how long the combined load took.
start_time = time.time()
train_categorical, train_numeric, train_date = [
    pd.read_hdf(h5_path, 'table')
    for h5_path in ('train_categorical_int.h5', 'train_numeric.h5', 'train_date.h5')
]
elapsed_minutes = round((time.time() - start_time)/60, 2)
print('Load time: {} minutes'.format(elapsed_minutes))

In [2]:
# Reload the three preprocessed test tables from their HDF5 caches and report
# how long the combined load took.
start_time = time.time()
test_categorical, test_numeric, test_date = [
    pd.read_hdf(h5_path, 'table')
    for h5_path in ('test_categorical_int.h5', 'test_numeric.h5', 'test_date.h5')
]
elapsed_minutes = round((time.time() - start_time)/60, 2)
print('Load time: {} minutes'.format(elapsed_minutes))

Load time: 0.51 minutes


In [None]:
# Feature-name arrays for the training tables. The first column of each table
# is skipped (presumably 'Id' - confirm), and for the numeric table the last
# column is skipped as well (presumably 'Response' - confirm).
# The categorical/numeric merge into a single `train` frame that used to live
# in this cell is disabled, so no `train` variable is created here.
cat_names, dat_names = (
    frame.columns.values[1:] for frame in (train_categorical, train_date)
)
num_names = train_numeric.columns.values[1:-1]

In [3]:
# Feature-name arrays for the test tables: every column except the first one
# (presumably 'Id' - confirm). This rebinds the same three names that the
# training-table cell sets.
cat_names, num_names, dat_names = (
    frame.columns.values[1:]
    for frame in (test_categorical, test_numeric, test_date)
)

In [None]:
def mutual_entropy(a, l):
    """Empirical mutual information I(a; l) in nats.

    Computed as H(a) + H(l) - H(a, l) from relative frequencies, where every
    frequency is a count divided by ``len(a)``.

    Parameters
    ----------
    a : pandas.Series
        Discrete feature values.
    l : pandas.Series
        Discrete labels, index-aligned with ``a``. Any set of label values is
        supported; the previous version only accounted for labels 0 and 1 and
        silently dropped the joint-entropy terms of every other label.

    Returns
    -------
    float
        Estimated mutual information. NaN entries are ignored by
        ``value_counts`` but still count towards ``len(a)``.
    """
    n = len(a)

    def _sum_p_log_p(values):
        # Sum of p * log(p) over the distinct values, with p = count / n.
        p = values.value_counts().values / n
        return np.sum(p * np.log(p))

    a_ent = -_sum_p_log_p(a)
    l_ent = -_sum_p_log_p(l)
    # Joint entropy H(a, l): accumulate p(a, l) * log p(a, l) one label at a time.
    al_ent = -sum(_sum_p_log_p(a[l == label]) for label in l.dropna().unique())
    return a_ent + l_ent - al_ent
def entropy(a):
    """Empirical Shannon entropy (natural log) of the values in Series ``a``.

    Frequencies are ``value_counts()`` divided by ``len(a)``.
    """
    freqs = (a.value_counts() / len(a)).values
    return -np.sum(freqs * np.log(freqs))

# Rank categorical columns by their mutual information with 'Response'.
#   > 1e-5 : kept; per-level "count minus one" tables are stored in leaveoneout
#   > 1e-3 : additionally listed in onehot_categorical
leaveoneout = {}
onehot_categorical = []
leaveoneout_categorical = []
for cat in tqdm(cat_names):
    column = train_categorical[cat]
    ent = mutual_entropy(column, train_numeric['Response'])
    if not ent > 1E-5:
        continue
    if ent > 1E-3:
        onehot_categorical.append(cat)
    leaveoneout_categorical.append(cat)
    # Occurrences of each level minus one (the current row excluded).
    leaveoneout[cat] = {
        level: occurrences - 1
        for (level, occurrences) in dict(column.value_counts()).items()
    }
selected_cat_names = list(leaveoneout)

In [None]:
# Boolean mask over the training rows: True where Response equals 1.
HasResponse = train_numeric['Response'].values == 1

In [None]:
# Fraction of training rows whose Response is 1.
HasResponse.sum() / len(train_numeric)

In [None]:
# For every training row, count how many of the selected categorical features
# are present (i.e. not the -999 missing-value sentinel).
# The merged `train` frame is never built in this notebook (the merge is
# commented out), so the columns are read from train_categorical, the table
# selected_cat_names was derived from. dtype=int replaces the removed np.int.
all_filter = np.zeros(len(train_categorical), dtype=int)
for cat in selected_cat_names:
    all_filter += (train_categorical[cat] != -999).values
# The same counts restricted to the rows with Response == 1; this assumes
# train_categorical and train_numeric rows are in the same order.
positive_filter = all_filter[HasResponse]

In [None]:
# Among rows with exactly two selected categorical features present:
# number of positive rows divided by number of all rows.
(positive_filter == 2).sum() / (all_filter == 2).sum()

In [None]:
# Names of the selected categorical features that are present (not -999) in the
# fourth positive row that has exactly two such features.
# The original indexed a matrix `X` that no earlier cell defines (the `X`
# assigned further down is the numeric feature matrix, whose columns do not
# correspond to selected_cat_names), so the matrix is built here explicitly.
# NOTE(review): assumes `X` was meant to be the selected categorical columns - confirm.
selected_values = train_categorical[selected_cat_names].values
np.array(selected_cat_names)[selected_values[HasResponse][positive_filter == 2][3, :] != -999]

In [None]:
# Mutual information between one feature and the response. The merged `train`
# frame is never built (the merge is commented out), so the inputs are taken
# from the source tables, as in the feature-selection loop.
# NOTE(review): assumes 'L3_S32_F3854' is a column of train_categorical - confirm.
mutual_entropy(train_categorical['L3_S32_F3854'], train_numeric['Response'])

In [None]:
%matplotlib inline
%load_ext Cython
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt

In [None]:
%%cython
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
import numpy as np
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from the four confusion-matrix counts.

    Returns 0 when the denominator product is zero (any marginal count empty).
    """
    denominator_sq = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if denominator_sq == 0:
        return 0
    return (tp * tn - fp * fn) / np.sqrt(denominator_sq)

def eval_mcc(y_true, y_prob, show=False):
    """Find the best MCC over every cut point of the score ranking.

    Samples are ordered by ascending ``y_prob``. Starting from "everything is
    predicted positive", samples are flipped to "predicted negative" one at a
    time and the MCC is recomputed after each flip.

    Parameters
    ----------
    y_true : array of 0/1 labels (indexed with an integer array, so a NumPy
        array is expected).
    y_prob : array of scores, same length as ``y_true``.
    show : if True, also print the scikit-learn MCC at the chosen threshold
        next to the best MCC and plot the MCC curve.

    Returns
    -------
    ``best_mcc`` when ``show`` is False, otherwise the tuple
    ``(best_proba, best_mcc, y_pred)`` where ``y_pred = y_prob > best_proba``.
    """
    order = np.argsort(y_prob)
    labels_by_score = y_true[order]
    n = y_true.shape[0]

    # Initial state: every sample is predicted positive.
    tp = 1.0 * np.sum(y_true)  # number of positives
    fp = n - tp                # number of negatives
    tn = 0.0
    fn = 0.0

    best_mcc = 0.0
    best_id = -1
    mccs = np.zeros(n)
    for i, true_label in enumerate(labels_by_score):
        # Sample i, like every sample ranked below it, is now predicted negative.
        if true_label == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
        current_mcc = mcc(tp, tn, fp, fn)
        mccs[i] = current_mcc
        # ">=" keeps the last (highest-score) cut point among ties.
        if current_mcc >= best_mcc:
            best_mcc = current_mcc
            best_id = i

    if not show:
        return best_mcc

    best_proba = y_prob[order[best_id]]
    y_pred = (y_prob > best_proba).astype(int)
    reference_mcc = matthews_corrcoef(y_true, y_pred)
    print(reference_mcc, best_mcc)
    plt.plot(mccs)
    return best_proba, best_mcc, y_pred

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides train_test_split under the same name. The module is bound to the
# old name so cells that call cross_validation.train_test_split keep working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation
from sklearn import ensemble
from sklearn import cluster

In [None]:
# Three per-row summaries of the training date columns: earliest value, latest
# value and their difference. Rows without any date get the -999 sentinel in
# all three. The result has one row per sample and three columns.
start_date = train_date[dat_names].min(axis=1)
end_date = train_date[dat_names].max(axis=1)
# The difference is taken before the sentinel is filled in, so it is NaN
# (then -999) exactly where the row has no dates.
duration = (end_date - start_date).fillna(-999)
start_date = start_date.fillna(-999)
end_date = end_date.fillna(-999)
date_feature = np.vstack((start_date.values, end_date.values, duration.values)).T
np.save('date_feat.npy', date_feature)

In [4]:
# Three per-row summaries of the test date columns: earliest value, latest
# value and their difference. Rows without any date get the -999 sentinel in
# all three. The result has one row per sample and three columns.
start_date = test_date[dat_names].min(axis=1)
end_date = test_date[dat_names].max(axis=1)
# The difference is taken before the sentinel is filled in, so it is NaN
# (then -999) exactly where the row has no dates.
duration = (end_date - start_date).fillna(-999)
start_date = start_date.fillna(-999)
end_date = end_date.fillna(-999)
date_feature = np.vstack((start_date.values, end_date.values, duration.values)).T
np.save('lb_date_feat.npy', date_feature)

In [None]:
# Dump the training feature matrices as compressed .npz archives. Each array is
# passed positionally, so it is stored under the default key 'arr_0'.
# Only the numeric matrix has its NaNs replaced by -999; the date matrix keeps
# whatever missing values it has.
np.savez_compressed(
    'numeric_feat.npz',
    train_numeric.loc[:, num_names].fillna(-999).values,
)
np.savez_compressed(
    'categorical_feat.npz',
    train_categorical.loc[:, cat_names].values,
)
np.savez_compressed(
    'fulldate_feat.npz',
    train_date.loc[:, dat_names].values,
)

In [6]:
# Dump the test ("lb") feature matrices as compressed .npz archives. Each array
# is passed positionally, so it is stored under the default key 'arr_0'.
# Only the numeric matrix has its NaNs replaced by -999; the date matrix keeps
# whatever missing values it has.
np.savez_compressed(
    'lb_numeric_feat.npz',
    test_numeric.loc[:, num_names].fillna(-999).values,
)
np.savez_compressed(
    'lb_categorical_feat.npz',
    test_categorical.loc[:, cat_names].values,
)
np.savez_compressed(
    'lb_fulldate_feat.npz',
    test_date.loc[:, dat_names].values,
)

In [None]:
# Reload the cached training arrays. np.load on an .npz returns an NpzFile that
# keeps the archive open, so each one is opened in a `with` block and closed
# as soon as its single array ('arr_0') has been read into memory.
with np.load('numeric_feat.npz') as archive:
    numeric_feat = archive['arr_0']
with np.load('categorical_feat.npz') as archive:
    categorical_feat = archive['arr_0']
with np.load('fulldate_feat.npz') as archive:
    fulldate_feat = archive['arr_0']
date_feat = np.load('date_feat.npy')
# NOTE(review): no cell visible in this notebook writes 'label.npy'; it must
# already exist on disk (presumably the training 'Response' column - confirm).
label = np.load('label.npy')

In [None]:
# Reload the cached test ("lb") arrays into the same variable names the
# training loader uses. np.load on an .npz returns an NpzFile that keeps the
# archive open, so each one is opened in a `with` block and closed as soon as
# its single array ('arr_0') has been read into memory.
with np.load('lb_numeric_feat.npz') as archive:
    numeric_feat = archive['arr_0']
with np.load('lb_categorical_feat.npz') as archive:
    categorical_feat = archive['arr_0']
with np.load('lb_fulldate_feat.npz') as archive:
    fulldate_feat = archive['arr_0']
date_feat = np.load('lb_date_feat.npy')

In [None]:
from scipy.stats import ttest_ind
%matplotlib inline
import matplotlib.pyplot as plt

# Per numeric feature: two-sample t-tests between the negative and positive
# class, once on the raw values and once on the squared values, ignoring the
# -999 missing-value sentinel. `a` collects True when either p-value is < 0.01.
a = []
for i in tqdm(range(numeric_feat.shape[1])):
    column = numeric_feat[:, i]
    negatives = column[label == 0]
    negatives = negatives[negatives != -999]
    positives = column[label == 1]
    positives = positives[positives != -999]
    # ttest_ind returns (statistic, p-value); only the p-value is needed.
    p_raw = ttest_ind(negatives, positives)[1]
    p_squared = ttest_ind(negatives**2, positives**2)[1]
    a.append(bool(p_raw < 0.01 or p_squared < 0.01))


In [None]:
# Turn the per-feature t-test flags into an array and persist them.
a = np.asarray(a)
np.save(file='feature_filter_ttest.npy', arr=a)

In [None]:
# Model inputs: numeric features only. Alternative that also appends the date
# summaries: np.concatenate((numeric_feat, date_feat), axis=1)
X, y = numeric_feat, label

In [None]:
# Stratified 70/30 hold-out split with a fixed seed. The stray "..." that
# prefixed the continuation line (a pasted interactive-prompt marker) made the
# cell a SyntaxError and has been removed.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=777)

In [None]:
# Random forest with the positive class weighted 10x; fitting takes roughly
# ten minutes according to the original author's note.
clf = ensemble.RandomForestClassifier(
    n_estimators=100,
    class_weight={0: 1, 1: 10},
    oob_score=True,
    random_state=777,
    n_jobs=4,
    verbose=1,
)
clf.fit(X_train, y_train)

In [None]:
# Score the hold-out set with column 1 of predict_proba and report the best
# MCC over all thresholds.
class_probabilities = clf.predict_proba(X_test)
y_pred = class_probabilities[:, 1]
eval_mcc(y_test, y_pred)

In [None]:
# Find multimodal distributions: for every numeric feature, cluster the
# observed (non -999) values of the positive class with MeanShift and count the
# features that end up with more than one cluster centre.
# The loop runs over the columns of numeric_feat itself, so the cell also works
# when only the cached arrays have been loaded.
count = 0
for k in tqdm(range(numeric_feat.shape[1])):
    samples = numeric_feat[label == 1, k]
    samples = samples[samples != -999].reshape(-1, 1)
    if samples.shape[0] == 0:
        # MeanShift.fit raises on an empty array; a feature that is never
        # observed for the positive class cannot be multimodal, so skip it.
        continue
    ms = cluster.MeanShift(bandwidth=0.1, min_bin_freq=20)
    ms.fit(samples)
    if ms.cluster_centers_.shape[0] > 1:
        count = count + 1
        # Running tally: multimodal features found / features examined so far.
        print(str(count) + '/' + str(k+1))