In [1]:
import pandas as pd
import numpy as np
import sklearn
from tqdm import tqdm
import time

Data preprocessing for fast I/O

In [None]:
# Load the integer-encoded categorical CSV as float64: empty cells parse as NaN,
# which an integer dtype cannot hold. `np.float` was only an alias of the builtin
# float and has been removed from NumPy, so the dtype is spelled out as np.float64.
start_time = time.time()
train_categorical=pd.read_csv('train_categorical_int.csv', dtype=np.float64)
print('Load train_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 4.6 minutes

# Replace NaN with the -999 sentinel so the whole frame can be downcast to
# np.int32 to save memory.
train_categorical.fillna(-999, inplace=True)
train_categorical=train_categorical.astype(np.int32)
print('Process train_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 6.25 minutes

# Resave as HDF5 under the key 'table'; the analysis cells reload from this file.
train_categorical.to_hdf('train_categorical_int.h5', key='table')
print('Resave train_categorical time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 8.19 minutes

In [None]:
# Load numerical features and Response!
# Everything is read as float64 so missing measurements stay NaN. `np.float` was
# only an alias of the builtin float and has been removed from NumPy, hence np.float64.
start_time = time.time()
train_numeric=pd.read_csv('train_numeric.csv', dtype=np.float64)
print('Load train_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 1.8 minutes

# Id and Response are cast back to int32; the feature columns stay float.
train_numeric['Id']=train_numeric['Id'].astype(np.int32)
train_numeric['Response']=train_numeric['Response'].astype(np.int32)
print('Process train_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Resave as lightly compressed HDF5 under the key 'table'.
train_numeric.to_hdf('train_numeric.h5', key='table', complevel=1)
print('Resave train_numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

In [3]:
# Load dates!
# Read as float64 so missing timestamps stay NaN. `np.float` was only an alias of
# the builtin float and has been removed from NumPy, hence np.float64.
start_time = time.time()
train_date=pd.read_csv('train_date.csv', dtype=np.float64)
print('Load train_date time: {} minutes'.format(round((time.time() - start_time)/60, 2))) 

# Cast this table's own Id column. The previous version copied the Id from
# train_numeric, which made the cell depend on another cell having been run and
# silently relied on both files listing rows in the same order.
train_date['Id']=train_date['Id'].astype(np.int32)
print('Process train_date time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

# Resave as lightly compressed HDF5 under the key 'table'.
train_date.to_hdf('train_date.h5', key='table', complevel=1)
print('Resave train_date time: {} minutes'.format(round((time.time() - start_time)/60, 2)))

Load train_date time: 2.49 minutes
Process train_date time: 2.84 minutes
Resave train_date time: 4.36 minutes


Load data and do some basic analysis

In [4]:
# Reload the three HDF5 caches written by the preprocessing cells above and
# report the total wall-clock load time.
start_time = time.time()
train_categorical = pd.read_hdf('train_categorical_int.h5', key='table')
train_numeric = pd.read_hdf('train_numeric.h5', key='table')
train_date = pd.read_hdf('train_date.h5', key='table')
print('Load time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))

In [7]:
#start_time = time.time()
#train=pd.merge(train_categorical, train_numeric, on='Id')
# Feature-name arrays for each table. The bookkeeping columns are excluded by
# name rather than by position: the old positional slice `[1:-1]` on train_date
# dropped its last feature whenever that table has no trailing 'Response' column.
# errors='ignore' lets the same expression work whether or not a table carries
# 'Response'. `.values` keeps these as NumPy arrays so boolean-mask indexing
# (e.g. num_names[mask]) keeps working.
cat_names=train_categorical.columns.drop(['Id'], errors='ignore').values
num_names=train_numeric.columns.drop(['Id', 'Response'], errors='ignore').values
dat_names=train_date.columns.drop(['Id', 'Response'], errors='ignore').values
#print('Merge categorical and numeric time: {} minutes'.format(round((time.time() - start_time)/60, 2))) # at 2.85 minutes
#del(train_categorical)
#del(train_numeric)

In [8]:
def mutual_entropy(a, l):
    """Empirical mutual information (in nats) between two discrete Series.

    Computed as H(a) + H(l) - H(a, l), where every probability is an observed
    count divided by len(a).

    Parameters
    ----------
    a : pandas.Series
        Discrete feature values.
    l : pandas.Series
        Discrete labels, index-aligned with `a`. Any set of sortable label
        values is supported; the previous version only accounted for the
        labels 0 and 1 in the joint term, which gave wrong results otherwise.

    Returns
    -------
    float
        The mutual information estimate.
    """
    n = len(a)

    def _entropy_from_counts(counts):
        # -sum(p * log p) over the observed levels; value_counts() only
        # reports levels that occur, so p is normally strictly positive.
        p = counts.values / n
        return -np.sum(p * np.log(p))

    a_ent = _entropy_from_counts(a.value_counts())
    l_ent = _entropy_from_counts(l.value_counts())
    # Joint entropy H(a, l): add up the contribution of each label's slice of `a`.
    # Labels are visited in sorted order so binary 0/1 labels are summed in the
    # same order as before.
    al_ent = sum(_entropy_from_counts(a[l == label].value_counts())
                 for label in sorted(l.dropna().unique()))
    return a_ent + l_ent - al_ent
def entropy(a):
    """Return the empirical Shannon entropy (in nats) of the values in Series `a`.

    Probabilities are the value counts divided by len(a).
    """
    probabilities = a.value_counts() / len(a)
    return -np.sum([p * np.log(p) for p in probabilities])

# Screen every categorical column by its mutual information with Response.
#   > 1E-5 : kept for leave-one-out encoding; per-level counts minus one are stored
#   > 1E-3 : additionally flagged for one-hot encoding
leaveoneout = {}
onehot_categorical = []
leaveoneout_categorical = []
for cat in tqdm(cat_names):
    ent = mutual_entropy(train_categorical[cat], train_numeric['Response'])
    if not ent > 1E-5:
        # Not informative enough; skip the column entirely.
        continue
    if ent > 1E-3:
        onehot_categorical.append(cat)
    leaveoneout_categorical.append(cat)
    # Occurrences of each level, minus one.
    leaveoneout[cat] = {
        level: count - 1
        for level, count in dict(train_categorical[cat].value_counts()).items()
    }
selected_cat_names = list(leaveoneout)

100%|██████████| 2140/2140 [02:45<00:00, 12.65it/s]


In [9]:
# Boolean row mask: True where Response == 1.
HasResponse = (train_numeric['Response'] == 1).values

In [10]:
# Fraction of rows with Response == 1.
HasResponse.sum() / len(train_numeric)

0.0058112079692704604

In [71]:
# For every row, count how many of the selected categorical features are
# present, i.e. differ from the -999 missing-value sentinel.
# - Reads train_categorical directly: `train` is only produced by the merge that
#   is commented out above, so it does not exist on a fresh kernel.
# - Uses the builtin int as dtype: the `np.int` alias has been removed from NumPy.
all_filter = np.zeros(len(train_categorical), dtype=int)
for cat in selected_cat_names:
    all_filter += (train_categorical[cat] != -999).values
# The same per-row counts restricted to the rows with Response == 1.
positive_filter = all_filter[HasResponse]

In [95]:
# Among rows with exactly two selected categorical features present:
# (count within the Response == 1 rows) / (count over all rows).
(positive_filter == 2).sum() / (all_filter == 2).sum()

0.042234251383023023

In [112]:
# Names of the selected categorical features that are present (!= -999) in one
# example row: row index 3 among the Response == 1 rows that have exactly two
# present features.
# NOTE(review): `X` is not assigned by any earlier cell visible in this notebook
# (it is first assigned further down, from train_date). This expression looks like
# it expects one column per entry of selected_cat_names, e.g.
# train_categorical[selected_cat_names].values -- confirm before re-running.
np.array(selected_cat_names)[X[HasResponse][positive_filter == 2][3,:] != -999]

array(['L3_S32_F3854', 'L3_S32_F3851'], 
      dtype='<U12')

In [117]:
# Mutual information between one categorical feature and Response.
# Reads the source frames directly (as the screening loop does): `train` is only
# produced by the merge that is commented out above and does not exist on a
# fresh kernel.
mutual_entropy(train_categorical['L3_S32_F3854'], train_numeric['Response'])

0.0020189238473217186

In [11]:
%matplotlib inline
%load_ext Cython
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt

In [12]:
%%cython
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
import numpy as np
def mcc(tp, tn, fp, fn):
    """Matthews correlation coefficient from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int or float
        True-positive, true-negative, false-positive and false-negative counts.

    Returns
    -------
    float
        (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)), or 0.0 when the
        denominator is zero (some row or column of the confusion matrix is empty).
    """
    # Work in floating point. With integer counts the four-way product below can
    # exceed the int64 range: NumPy integers would wrap around silently and
    # np.sqrt cannot handle an oversized Python int.
    tp, tn, fp, fn = float(tp), float(tn), float(fp), float(fn)
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf == 0:
        return 0.0
    return sup / np.sqrt(inf)

def eval_mcc(y_true, y_prob, show=False):
    """Find the best Matthews correlation coefficient over all score cut-offs.

    Samples are sorted by ascending `y_prob`. Starting from "everything predicted
    positive", the cut is moved one sample at a time, so that after step i the
    first i + 1 sorted samples are predicted negative, and the MCC is evaluated
    at every step.

    Parameters
    ----------
    y_true : numpy.ndarray
        Labels; an entry equal to 1 counts as positive, anything else as negative.
    y_prob : numpy.ndarray
        Scores, same length as `y_true`.
    show : bool
        When True, also print and plot diagnostics and return extra values.

    Returns
    -------
    best_mcc when `show` is False; otherwise the tuple
    (best_proba, best_mcc, y_pred), where y_pred is `y_prob > best_proba` as ints.

    NOTE(review): the scan cuts by sorted position, while y_pred cuts by value, so
    when y_prob contains ties the printed `score` can differ from `best_mcc`.
    """
    order = np.argsort(y_prob)
    labels_sorted = y_true[order]
    n = y_true.shape[0]
    n_pos = 1.0 * np.sum(y_true)  # number of positives
    n_neg = n - n_pos             # number of negatives

    # Initial state: every sample is predicted positive.
    tp, fp = n_pos, n_neg
    tn = 0.0
    fn = 0.0

    best_mcc, best_id = 0.0, -1
    mccs = np.zeros(n)
    for i in range(n):
        # Sample i moves from the predicted-positive to the predicted-negative side.
        if labels_sorted[i] == 1:
            tp, fn = tp - 1.0, fn + 1.0
        else:
            fp, tn = fp - 1.0, tn + 1.0
        current = mcc(tp, tn, fp, fn)
        mccs[i] = current
        # `>=` keeps the last position among equally good cuts.
        if current >= best_mcc:
            best_mcc, best_id = current, i

    if not show:
        return best_mcc

    best_proba = y_prob[order[best_id]]
    y_pred = (y_prob > best_proba).astype(int)
    score = matthews_corrcoef(y_true, y_pred)
    print(score, best_mcc)
    plt.plot(mccs)
    return best_proba, best_mcc, y_pred

In [13]:
from sklearn import cross_validation
from sklearn import ensemble



In [None]:
# Feature matrix: every train_date column with NaN replaced by the -999 sentinel.
# Target vector: the Response column of train_numeric.
X = train_date.fillna(value=-999).values
y = train_numeric.loc[:, 'Response'].values

In [None]:
# Stratified 50/50 hold-out split with a fixed seed.
# (A doctest continuation prompt `...` had been pasted inside the call, which is
# a SyntaxError; it is removed here.)
# NOTE: the sklearn.cross_validation module was removed in scikit-learn 0.20; on
# newer versions train_test_split lives in sklearn.model_selection.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, stratify=y, test_size=0.5, random_state=777)

In [None]:
# Random forest with the positive class up-weighted 10x; out-of-bag scoring is
# enabled. ~10 minutes
clf = ensemble.RandomForestClassifier(
    n_estimators=100,
    random_state=777,
    verbose=1,
    n_jobs=4,
    oob_score=True,
    class_weight={1: 10, 0: 1},
)
clf.fit(X_train, y_train)

In [None]:
# Column 1 of predict_proba is used as the score for the hold-out half; eval_mcc
# then scans every cut-off and, with show=True, prints and plots diagnostics.
y_pred = clf.predict_proba(X_test)[:, 1]
eval_mcc(y_true=y_test, y_prob=y_pred, show=True)

In [201]:
# Number of numeric feature names.
num_names.shape[0]

968

In [196]:
# Keep the numeric features whose random-forest importance exceeds 0.0001.
# NOTE(review): in the cells visible above, `clf` is fit on a matrix built from
# train_date, whose width need not match len(num_names); this mask presumably
# expects a `clf` fit on the numeric features -- confirm before re-running.
selected_num_names = num_names[np.greater(clf.feature_importances_, 0.0001)]
selected_num_names.shape[0]

650