In [1]:
# LOAD LIBRARIES
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn.gaussian_process.kernels as ker
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np, pandas as pd, os, gc
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.covariance import EmpiricalCovariance
from sklearn.covariance import GraphicalLasso
from sklearn.mixture import GaussianMixture

In [2]:
def get_mean_cov(x,y):
    """Estimate per-class location vectors and precision matrices.

    A separate ``GraphicalLasso`` estimator is fitted on the rows of ``x``
    whose label equals 1 and on the rows whose label equals 0.

    Parameters
    ----------
    x : array indexable with a boolean row mask (the notebook passes the
        output of ``VarianceThreshold.transform``).
    y : array of labels, compared element-wise against 1 and 0.

    Returns
    -------
    ms : ``np.stack`` of the two ``location_`` vectors, class 1 first.
    ps : ``np.stack`` of the two ``precision_`` matrices, class 1 first.
    """
    locations = []
    precisions = []
    # Class 1 is processed first, so index 0 of both outputs describes the
    # target==1 rows and index 1 the target==0 rows.
    for label in (1, 0):
        row_mask = (y == label).astype(bool)
        estimator = GraphicalLasso()
        estimator.fit(x[row_mask])
        precisions.append(estimator.precision_)
        locations.append(estimator.location_)

    return np.stack(locations), np.stack(precisions)

In [3]:
# Load the train and test CSVs, announcing each read before it starts.
_csv_sources = (
    ('Reading Train Data...', '../input/train.csv'),
    ('Reading Test Data...', '../input/test.csv'),
)
_loaded_frames = []
for _message, _csv_path in _csv_sources:
    print(_message)
    _loaded_frames.append(pd.read_csv(_csv_path))
train, test = _loaded_frames
print('Finish Reading.')

Reading Train Data...
Reading Test Data...
Finish Reading.


In [4]:
# Number of stratified cross-validation folds used when fitting the per-subset models.
n_folds = 11

In [5]:
# INITIALIZE VARIABLES
# Out-of-fold (OOF) predictions for the train rows and fold-averaged
# predictions for the test rows, one array per first-stage model.
# The loop below fills them subset by subset.
oof_SVC = np.zeros(len(train))
oof_KNN = np.zeros(len(train))
oof_GMM = np.zeros(len(train))
oof_LR = np.zeros(len(train))
oof_MLP = np.zeros(len(train))

preds_SVC = np.zeros(len(test))
preds_KNN = np.zeros(len(test))
preds_GMM = np.zeros(len(test))
preds_LR = np.zeros(len(test))
preds_MLP = np.zeros(len(test))

# Feature columns: everything except the id, the label and the partition column.
cols = [c for c in train.columns if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]

# BUILD 512 SEPARATE NON-LINEAR MODELS
for i in tqdm_notebook(range(512)):

    # EXTRACT SUBSET OF DATASET WHERE WHEEZY-MAGIC EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    # Remember the original row labels so per-subset predictions can be written
    # back into the full-length oof_*/preds_* arrays.
    # NOTE(review): the labels are used as positions into numpy arrays, which
    # assumes train/test carry the default 0..n-1 index from read_csv.
    idx1 = train2.index; idx2 = test2.index
    # Rebind rather than reset in place: train2 is derived from `train`.
    train2 = train2.reset_index(drop=True)
    y2 = train2['target']

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    # Selector fitted on the train rows only; feeds the GMM and KNN models.
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])

    # PCA + standardisation fitted on train and test features together; feeds SVC.
    data = pd.concat([train2[cols], test2[cols]])
    data2 = StandardScaler().fit_transform(PCA(svd_solver='full',n_components='mle').fit_transform(data))
    train4 = data2[:train2.shape[0]]; test4 = data2[train2.shape[0]:]

    # Variance filter -> standardise -> degree-2 polynomial expansion, again
    # fitted on train and test features together; feeds LR and MLP.
    poly = PolynomialFeatures(2)
    sc = StandardScaler()
    data3 = poly.fit_transform(sc.fit_transform(VarianceThreshold(threshold=1.5).fit_transform(data)))
    train5 = data3[:train2.shape[0]]; test5 = data3[train2.shape[0]:]

    # STRATIFIED K FOLD (Using splits=25 scores 0.002 better but is slower)
    # random_state is intentionally not passed: without shuffle=True it has no
    # effect, and recent scikit-learn releases raise ValueError for that
    # combination. The resulting (unshuffled) folds are unchanged.
    skf = StratifiedKFold(n_splits=n_folds)
    for train_index, test_index in skf.split(train3, y2):
        # Labels of the rows this fold trains on (train2 has a 0..n-1 index).
        y_fold = y2.loc[train_index]

        # MODEL WITH GMM
        # Semi-supervised: initialised from the labelled fold, then fitted on
        # the fold's train rows plus all test rows of this subset.
        ms, ps = get_mean_cov(train3[train_index,:], y_fold.values)
        gm = GaussianMixture(n_components=2, init_params='random', covariance_type='full', tol=0.001,
                             reg_covar=0.001, max_iter=100, n_init=1, means_init=ms, precisions_init=ps,
                             random_state=42)  # seeded so reruns are reproducible
        gm.fit(np.concatenate([train3[train_index,:],test3],axis = 0))
        # get_mean_cov returns the target==1 statistics first, so mixture
        # component 0 is the one initialised from the positive class.
        oof_GMM[idx1[test_index]] = gm.predict_proba(train3[test_index,:])[:,0]
        preds_GMM[idx2] += gm.predict_proba(test3)[:,0] / skf.n_splits

        # OTHER MODELS
        # SVC (probability calibration) and saga LogisticRegression draw random
        # numbers internally; both are seeded for reproducibility.
        clf_SVC = SVC(probability=True,kernel='poly',degree=4,gamma='auto',random_state=42)
        clf_KNN = KNeighborsClassifier(n_neighbors=10,weights='distance',p=2)
        clf_LR = LogisticRegression(solver='saga',penalty='l2',C=0.01,tol=0.001,random_state=42)
        clf_MLP = MLPClassifier(activation='relu', solver='lbfgs', tol=1e-06, hidden_layer_sizes=(250,), random_state=42)

        clf_SVC.fit(train4[train_index,:], y_fold)
        clf_KNN.fit(train3[train_index,:], y_fold)
        clf_LR.fit(train5[train_index,:], y_fold)
        clf_MLP.fit(train5[train_index,:], y_fold)

        # Held-out rows of this fold -> OOF arrays.
        oof_SVC[idx1[test_index]] = clf_SVC.predict_proba(train4[test_index,:])[:,1]
        oof_KNN[idx1[test_index]] = clf_KNN.predict_proba(train3[test_index,:])[:,1]
        oof_LR[idx1[test_index]] = clf_LR.predict_proba(train5[test_index,:])[:,1]
        oof_MLP[idx1[test_index]] = clf_MLP.predict_proba(train5[test_index,:])[:,1]

        # Test rows of this subset -> running average over the folds.
        preds_SVC[idx2] += clf_SVC.predict_proba(test4)[:,1] / skf.n_splits
        preds_KNN[idx2] += clf_KNN.predict_proba(test3)[:,1] / skf.n_splits
        preds_LR[idx2] += clf_LR.predict_proba(test5)[:,1] / skf.n_splits
        preds_MLP[idx2] += clf_MLP.predict_proba(test5)[:,1] / skf.n_splits

    # Every 64th subset: report the OOF AUC of that single subset per model.
    if i%64==0:
        y_true = train['target'][idx1]
        for model_name, oof_pred in (('GMM', oof_GMM), ('SVC', oof_SVC), ('KNN', oof_KNN),
                                     ('LR', oof_LR), ('MLP', oof_MLP)):
            print(i, model_name + ' oof auc : ', round(roc_auc_score(y_true, oof_pred[idx1]), 5))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))

0 GMM oof auc :  0.96445
0 SVC oof auc :  0.94854
0 KNN oof auc :  0.93811
0 LR oof auc :  0.94131
0 MLP oof auc :  0.93307
64 GMM oof auc :  0.98259
64 SVC oof auc :  0.97359
64 KNN oof auc :  0.94103
64 LR oof auc :  0.97915
64 MLP oof auc :  0.97076




128 GMM oof auc :  0.97308
128 SVC oof auc :  0.95673
128 KNN oof auc :  0.93469
128 LR oof auc :  0.96028
128 MLP oof auc :  0.94864




192 GMM oof auc :  0.98051
192 SVC oof auc :  0.9637
192 KNN oof auc :  0.93334
192 LR oof auc :  0.96721
192 MLP oof auc :  0.95764




256 GMM oof auc :  0.98403
256 SVC oof auc :  0.97598
256 KNN oof auc :  0.94539
256 LR oof auc :  0.97192
256 MLP oof auc :  0.96092
320 GMM oof auc :  0.95287
320 SVC oof auc :  0.94078
320 KNN oof auc :  0.92212
320 LR oof auc :  0.93753
320 MLP oof auc :  0.93218




384 GMM oof auc :  0.96329
384 SVC oof auc :  0.9423
384 KNN oof auc :  0.91666
384 LR oof auc :  0.94985
384 MLP oof auc :  0.95015
448 GMM oof auc :  0.97794
448 SVC oof auc :  0.97629
448 KNN oof auc :  0.93805
448 LR oof auc :  0.95899
448 MLP oof auc :  0.94768







In [6]:
def _report_auc(label, oof_pred):
    """Compute, print and return the ROC AUC of one model's out-of-fold predictions over all train rows."""
    score = roc_auc_score(train['target'], oof_pred)
    print(label + ' auc: ', round(score, 5))
    return score

# Overall OOF AUC of each first-stage model, reported in the same order as before.
auc_GMM = _report_auc('GMM', oof_GMM)

auc_SVC = _report_auc('SVC', oof_SVC)

auc_KNN = _report_auc('KNN', oof_KNN)

auc_LR = _report_auc('LR', oof_LR)

auc_MLP = _report_auc('MLP', oof_MLP)

GMM auc:  0.96874
SVC auc:  0.95515
KNN auc:  0.92928
LR auc:  0.95003
MLP auc:  0.94012


In [7]:
# INITIALIZE VARIABLES
# The first-stage GMM test predictions are the source of the pseudo labels.
test['target'] = preds_GMM
oof_QDA = np.zeros(len(train))
preds_QDA = np.zeros(len(test))

oof_NuSVC = np.zeros(len(train))
preds_NuSVC = np.zeros(len(test))

# BUILD 512 SEPARATE MODELS
for k in tqdm_notebook(range(512)):
    # ONLY TRAIN WITH DATA WHERE WHEEZY-MAGIC EQUALS K
    train2 = train[train['wheezy-copper-turtle-magic']==k]
    idx1 = train2.index  # original row labels, used to write OOF values back
    test2 = test[test['wheezy-copper-turtle-magic']==k]

    # ADD PSEUDO LABEL DATA
    # Keep only test rows the GMM is very confident about and harden their
    # probabilities into 0/1 labels.
    test2p = test2[ (test2['target']<=0.01) | (test2['target']>=0.99) ].copy()
    test2p.loc[ test2p['target']>=0.5, 'target' ] = 1
    test2p.loc[ test2p['target']<0.5, 'target' ] = 0
    # Real train rows come first, pseudo-labelled rows after them; the fresh
    # 0..n-1 index makes positions and labels coincide.
    train2p = pd.concat([train2,test2p],axis=0,ignore_index=True)

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    sel = VarianceThreshold(threshold=1.5).fit(train2p[cols])
    train3p = sel.transform(train2p[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])

    # PCA + standardisation fitted on train + pseudo-labelled rows; feeds NuSVC.
    pca = PCA(svd_solver='full',n_components='mle').fit(train2p[cols])
    train4p = pca.transform(train2p[cols])
    train4 = pca.transform(train2[cols])
    test4 = pca.transform(test2[cols])
    sc1 = StandardScaler()
    train4p = sc1.fit_transform(train4p)
    train4 = sc1.transform(train4)
    test4 = sc1.transform(test4)

    # STRATIFIED K FOLD
    # Uses the notebook-wide n_folds instead of a second hard-coded 11.
    skf = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3p, train2p['target']):
        # Positions >= len(train3) are pseudo-labelled test rows: ignore pseudo labels in oof.
        test_index3 = test_index[ test_index<len(train3) ]
        y_fold = train2p.loc[train_index, 'target']

        # MODEL AND PREDICT WITH QDA
        clf_QDA = QuadraticDiscriminantAnalysis(reg_param=0.5)
        clf_QDA.fit(train3p[train_index,:], y_fold)
        oof_QDA[idx1[test_index3]] = clf_QDA.predict_proba(train3[test_index3,:])[:,1]
        preds_QDA[test2.index] += clf_QDA.predict_proba(test3)[:,1] / skf.n_splits

        # MODEL AND PREDICT WITH NuSVC
        # probability=True draws random numbers internally; seeded for reproducibility.
        clf_NuSVC = NuSVC(probability=True,kernel='poly',degree=4,gamma='auto',nu=0.59, coef0=0.053, random_state=42)
        clf_NuSVC.fit(train4p[train_index,:], y_fold)
        oof_NuSVC[idx1[test_index3]] = clf_NuSVC.predict_proba(train4[test_index3,:])[:,1]
        preds_NuSVC[test2.index] += clf_NuSVC.predict_proba(test4)[:,1] / skf.n_splits

    # Every 64th subset: report the OOF AUC of that single subset per model.
    if k%64==0:
        print(k, 'QDA oof auc : ', round(roc_auc_score(train['target'][idx1], oof_QDA[idx1]), 5))
        print(k, 'NuSVC oof auc : ', round(roc_auc_score(train['target'][idx1], oof_NuSVC[idx1]), 5))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))

0 QDA oof auc :  0.9633
0 NuSVC oof auc :  0.96699
64 QDA oof auc :  0.98281
64 NuSVC oof auc :  0.98025
128 QDA oof auc :  0.97261
128 NuSVC oof auc :  0.97108
192 QDA oof auc :  0.98081
192 NuSVC oof auc :  0.98671
256 QDA oof auc :  0.98388
256 NuSVC oof auc :  0.98239
320 QDA oof auc :  0.95147
320 NuSVC oof auc :  0.95096
384 QDA oof auc :  0.96377
384 NuSVC oof auc :  0.9533
448 QDA oof auc :  0.97891
448 NuSVC oof auc :  0.97996



In [8]:
# PRINT CV AUC
# Overall out-of-fold AUC of the two pseudo-labelled models over all train rows.
auc_QDA = roc_auc_score(train['target'],oof_QDA)
print('Pseudo Labeled QDA scores CV =',round(auc_QDA,5))

auc_NuSVC = roc_auc_score(train['target'],oof_NuSVC)
# This line reports the NuSVC score; it was previously mislabelled as QDA.
print('Pseudo Labeled NuSVC scores CV =',round(auc_NuSVC,5))

Pseudo Labeled QDA scores CV = 0.96953
Pseudo Labeled QDA scores CV = 0.96918


In [9]:
# Stacking design matrices: one column per base model, in the order
# SVC, NuSVC, KNN, QDA, LR, MLP. train_new holds the out-of-fold predictions,
# test_new the corresponding test predictions.
train_new = pd.DataFrame(np.column_stack([oof_SVC, oof_NuSVC, oof_KNN, oof_QDA, oof_LR, oof_MLP]))
test_new = pd.DataFrame(np.column_stack([preds_SVC, preds_NuSVC, preds_KNN, preds_QDA, preds_LR, preds_MLP]))

In [10]:
import lightgbm as lgb

# Hyper-parameters of the LightGBM stacker trained on the base-model predictions.
param = {
    'bagging_freq': 3,
    'bagging_fraction': 0.8,
    'boost_from_average': False,  # real boolean instead of the string 'False'
    'boost': 'gbdt',
    'feature_fraction': 1,
    'learning_rate': 0.05,
    'max_depth': 10,
    'metric':'auc',
    'min_data_in_leaf': 82,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 10,
    'objective': 'binary',
    'verbosity': 1,
    'seed': 42
}

N = 5
# random_state is intentionally not passed: without shuffle=True it has no
# effect, and recent scikit-learn releases raise ValueError for that
# combination. The resulting (unshuffled) folds are unchanged.
skf_lgb = StratifiedKFold(n_splits=N)

# The periodic-logging callback is called log_evaluation in newer LightGBM
# releases and print_evaluation in older ones; use whichever is available.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or getattr(lgb, 'print_evaluation')

oof_lgb = np.zeros(train_new.shape[0])   # out-of-fold stacker predictions
pred_stack = np.zeros(len(test_new))     # fold-averaged stacker test predictions

for fold_, (trn_idx, val_idx) in enumerate(skf_lgb.split(train_new, train['target'])):
    print("Fold {}".format(fold_+1))
    x_train, y_train = train_new.iloc[trn_idx], train['target'].iloc[trn_idx]
    x_val, y_val = train_new.iloc[val_idx], train['target'].iloc[val_idx]

    trn_data = lgb.Dataset(x_train, label=y_train)
    val_data = lgb.Dataset(x_val, label=y_val)
    # Early stopping (patience 200) and logging every 200 rounds are passed as
    # callbacks: the verbose_eval= / early_stopping_rounds= keyword arguments
    # of lgb.train are not accepted by current LightGBM releases.
    classifier = lgb.train(
        param, trn_data, 100000,
        valid_sets = [trn_data, val_data],
        callbacks = [lgb.early_stopping(stopping_rounds=200), _log_evaluation(period=200)],
    )

    val_pred = classifier.predict(x_val, num_iteration=classifier.best_iteration)
    oof_lgb[val_idx] = val_pred
    pred_stack += classifier.predict(test_new, num_iteration=classifier.best_iteration) / N
    print(roc_auc_score(y_val, val_pred))

auc_lgb = roc_auc_score(train['target'],oof_lgb)
print('LGB auc: ',round(auc_lgb,5))

Fold 1
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.973928	valid_1's auc: 0.970491
Early stopping, best iteration is:
[86]	training's auc: 0.971876	valid_1's auc: 0.970889
0.9708892341936597
Fold 2
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.973569	valid_1's auc: 0.972654
[400]	training's auc: 0.975669	valid_1's auc: 0.972724
Early stopping, best iteration is:
[339]	training's auc: 0.975164	valid_1's auc: 0.972766
0.9727664634422418
Fold 3
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.973633	valid_1's auc: 0.972336
[400]	training's auc: 0.975738	valid_1's auc: 0.972277
Early stopping, best iteration is:
[222]	training's auc: 0.973936	valid_1's auc: 0.972399
0.9723989866598901
Fold 4
Training until validation scores don't improve for 200 rounds.
[200]	training's auc: 0.974155	valid_1's auc: 0.970047
Early stopping, best iteration is:
[109]	training's auc: 0.97242

In [11]:
# Fill the sample-submission template with the stacked test predictions and save it.
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv - TODO confirm.
submission_stack = pd.read_csv("../input/sample_submission.csv")
submission_stack['target'] = pred_stack
submission_stack.to_csv('submission.csv', index=False)
# Preview goes last so the notebook actually renders it; mid-cell it displayed nothing.
submission_stack.head()

In [12]:
# Persist the stacking feature matrices (base-model predictions) without the index column.
for _frame, _out_path in ((train_new, 'train_new.csv'), (test_new, 'test_new.csv')):
    _frame.to_csv(_out_path, index=False)