1. [Santander Fast Compact Solution](https://www.kaggle.com/gpreda/santander-fast-compact-solution)

In [None]:
import warnings
import numpy as np
import pandas as pd
#import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [None]:
import seaborn as sns
sns.set()

In [None]:
import matplotlib.pyplot as plt


%matplotlib inline

In [None]:
# Load the training and test tables from the local `input/` directory.
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [None]:
# Summary statistics of the training table's numeric columns.
train_df.describe()

Features
---
1. `target`: the label to be predicted.
2. `var_n`, n = 0, ..., 199: the 200 given features.

In [None]:
# Label column name, and the 200 right-most columns of the training table,
# which are used as the model features everywhere below.
target_feature=['target']
var_features=list(train_df.columns)[-200:]

In [None]:
def var_plot(df,m=0,n=3):
    """Plot histograms of ``var_features[m]`` .. ``var_features[n]`` (inclusive).

    df : DataFrame that contains the selected feature columns.
    m, n : first and last position in the global ``var_features`` list.
    """
    selected = var_features[m:n+1]
    df[selected].hist(figsize=[12,6])

In [None]:
def box_plot(df,m=0,n=3):
    """Draw one box plot per feature, grouped by the ``target`` column.

    df : DataFrame holding ``target`` and the selected feature columns.
    m, n : first and last position (inclusive) in the global
        ``var_features`` list; one subplot is drawn per selected feature.
    """
    names = var_features[m:n+1]
    # squeeze=False always yields a 2-D array of axes, so a single-feature
    # request (m == n) is indexed exactly like a multi-feature one.
    f, axes = plt.subplots(nrows=1, ncols=len(names), figsize=(12,6), squeeze=False)

    # Pair each feature with its own subplot by position within the selection.
    # Indexing the axes with the absolute feature index breaks when m != 0.
    for ax, name in zip(axes[0], names):
        sns.boxplot(x="target", y=name, data=df, ax=ax)
    

In [None]:
# Histograms of the first four features (defaults m=0, n=3) of the training data as loaded.
var_plot(train_df)

In [None]:
# Box plots of the first four features (defaults m=0, n=3), split by target value.
box_plot(train_df)

Each `var_n` seems to be normally distributed, but not standard normal. Let's bring them to the same scale with `sklearn.preprocessing`, which offers:
1. scale,
- normalize,
- MinMax.

In [None]:
def df_scale(df,kind='scale'):
    """Return a copy of ``df`` with its last 200 numeric columns rescaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the rescaling is applied to a copy,
        so the caller keeps the raw values for later use.
    kind : {'scale', 'normalize', 'minmax'}
        'scale' uses ``preprocessing.scale``, 'normalize' uses
        ``preprocessing.normalize`` and 'minmax' uses
        ``preprocessing.MinMaxScaler``. With scikit-learn's defaults,
        ``normalize`` works per row (sample) while the other two work per
        column (feature).

    Returns
    -------
    pandas.DataFrame
        The rescaled copy.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported names (previously such a
        value was ignored and the data came back unscaled).
    """
    if kind not in ('scale', 'normalize', 'minmax'):
        raise ValueError(
            "kind must be 'scale', 'normalize' or 'minmax', got %r" % (kind,))

    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()
    # The statistics are computed from the frame that is passed in.
    f_num = df.select_dtypes(include=[np.number]).columns.tolist()[-200:]
    if kind == 'scale':
        df[f_num] = preprocessing.scale(df[f_num])
    elif kind == 'normalize':
        df[f_num] = preprocessing.normalize(df[f_num])
    else:
        scaler = preprocessing.MinMaxScaler()
        df[f_num] = scaler.fit_transform(df[f_num])
    return df

In [None]:
# Scale the training features. No `del df_s` here: that name is never defined
# in this notebook, so deleting it raised NameError on a fresh run.
train_df_s=df_scale(train_df,kind='scale')

In [None]:
# Histograms of the first four features of the scaled training frame.
var_plot(train_df_s)

In [None]:
# Box plots of the first four scaled features, split by target value.
box_plot(train_df_s)

In [None]:
# Scale the test features with the same helper.
# NOTE(review): the scaling statistics come from the test frame itself, not
# from the training frame — confirm this is intended.
test_df_s=df_scale(test_df,kind='scale')

In [None]:
# Histograms of the first four features of the scaled test frame.
var_plot(test_df_s)

Naive Bayes Scheme
---
Naïve Bayes is a simple, yet effective and commonly-used, machine learning classifier.

1. Features: $X_0, X_1, \ldots, X_{199}$, i.e. 200 features $\{X_i\}$;
2. Class: $\{C_i\}$; here there is only one, `target` $=C_1$, which takes the value 0 or 1.
3. $$ P(C_1=c \mid X_0=x_0, X_1=x_1, \cdots) = \frac{P(X_0=x_0, X_1=x_1, \cdots \mid C_1=c)\,P(C_1=c)}{P(X_0=x_0, X_1=x_1, \cdots)} \propto P(C_1=c)\prod_i P(X_i=x_i \mid C_1=c)$$
4. Naive Bayes simply picks the value $c$ that has the largest probability given the data point's features:
   $$y= \text{argmax}_{c}\,P(C_1=c)\prod_i P(X_i=x_i \mid C_1=c)$$


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

Prior to making predictions, we have to train the model:
1. Split the data into train and valid sets, one for training and one for validating the model;
-  What belongs to the `training` set and what belongs to the `valid` one? Without loss of generality, choose these two parts randomly.
-  Repeat the above and make a prediction with each individual model. The average of these predictions is the final one.

In [None]:
# split data into train/valid randomly
def split_data(df,ratio):
    """Randomly partition the rows of ``df`` into two frames.

    Each row gets an independent uniform draw from ``np.random.rand``; rows
    whose draw is below ``ratio`` go to the first frame, the rest to the
    second. ``ratio`` is therefore the expected share of the first frame,
    not an exact one.

    Returns a tuple ``(first, second)``.
    """
    draws = np.random.rand(len(df))
    in_first = draws < ratio
    first = df[in_first]
    second = df[~in_first]
    return first, second

In [None]:
# Naive Bayes, first attempt: fit on a random ~60% of the scaled training
# rows and measure plain accuracy on the remaining rows.
ratio=0.6
train,test=split_data(train_df_s,ratio)
clf=GaussianNB().fit(train[var_features],train.target)
pred=clf.predict(test[var_features])
pred_acc=(test.target==pred).mean()
print(" Accuracy %4.3f within %4.2f set." %(pred_acc,ratio))

Conclusion
---
 Accuracy is more than 92% on the validation set, i.e. the 40% split off from the original training set.  
 


In [None]:
from sklearn.metrics import roc_curve, auc
title_config = {'fontsize': 20, 'y': 1.05}

# ROC curve of the GaussianNB model `clf`, which was fitted on `train`.
# (`pipeline` is only created further down, so it cannot be used here.)
# The curve is scored on the rows the model was trained on.
fpr, tpr, thr = roc_curve(train.target, clf.predict_proba(train[var_features])[:,1])
plt.figure(figsize=[12,6])
plt.plot(fpr, tpr)
# x axis: 1 - specificity
plt.xlabel('False Positive Rate, $FP/(FP+TN)$')
# y axis: sensitivity
plt.ylabel('True Positive Rate, $TP/(TP+FN)$')
plt.title('Receiver Operating Characteristic Plot', **title_config)
# Area under the curve; as the last expression it is the cell's displayed value.
auc(fpr, tpr)

In [None]:
def NB_Classify_1(df_train,df_test,folds=5,ratio=0.6):
    """Fit GaussianNB on several random splits and collect test predictions.

    For each of ``folds`` rounds, ``df_train`` is split with ``split_data``.
    A fresh GaussianNB is fitted on the first part, its accuracy on the second
    part is printed, and its class predictions for ``df_test`` are stored.

    df_train : frame with ``ID_code``, ``target`` and the feature columns.
    df_test : frame with ``ID_code`` and the feature columns.

    Returns a frame with ``ID_code`` plus one ``target_<i>`` column per round.
    """
    non_feature_cols = ['ID_code','target']
    collected = df_test[['ID_code']].copy()
    for round_no in range(folds):
        print("Fold %s" %round_no)
        fit_part, held_out = split_data(df_train,ratio)
        model = GaussianNB()
        model.fit(fit_part.drop(non_feature_cols,axis=1),fit_part.target)
        held_out_pred = model.predict(held_out.drop(non_feature_cols,axis=1))
        accuracy = np.mean(held_out.target==held_out_pred)
        print(" Accuracy %4.3f within %4.2f set." %(accuracy,ratio))
        column = 'target_'+str(round_no)
        collected[column] = model.predict(df_test.drop(['ID_code'],axis=1))
    return collected

In [None]:
# Five random-split GaussianNB rounds; `sub` holds ID_code plus one prediction column per round.
sub=NB_Classify_1(train_df_s,test_df_s,folds=5)

In [None]:
# Average every per-round prediction column (all columns after ID_code) into
# `target`, then keep only ID_code and that average.
sub['target']=sub.iloc[:,1:].mean(axis=1)
sub=sub.drop(columns=sub.columns[1:-1])

sub.to_csv("output/2019-03-16-NB_1.csv",index=False)

Advantage from Scikit-learn
---
We have scaled (transformed) the training data set manually. Scikit-learn provides more flexible utilities for EDA, including:
1. data set transformation,
- pipeline support,
- score evaluation,
- cross-validation functions,

     

In [None]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline

In [None]:
#train,test=split_data(train_df_s,ratio)
# Feature matrix and labels taken from the full training table.
X_train=train_df.loc[:,var_features]
y_train=train_df.target

# Map every feature to a normal distribution, then fit Gaussian Naive Bayes.
pipeline = make_pipeline(
    QuantileTransformer(output_distribution='normal'),
    GaussianNB(),
)
pipeline.fit(X_train,y_train)

In [None]:
from sklearn.metrics import roc_curve, auc
title_config = {'fontsize': 20, 'y': 1.05}

# ROC curve of the fitted pipeline, scored on X_train / y_train — the same
# rows it was fitted on, so this is a training-set curve.
fpr, tpr, thr = roc_curve(y_train, pipeline.predict_proba(X_train)[:,1])
plt.figure(figsize=[12,6])
plt.plot(fpr, tpr)
# x axis: 1 - specificity
plt.xlabel('False Positive Rate, $FP/(FP+TN)$')
# y axis: sensitivity
plt.ylabel('True Positive Rate, $TP/(TP+FN)$')
plt.title('Receiver Operating Characteristic Plot', **title_config)
# Area under the curve; as the last expression it is the cell's displayed value.
auc(fpr, tpr)

In [None]:
from sklearn.model_selection import cross_val_score

# Mean ROC AUC of the pipeline over 10 cross-validation folds of the training data.
cross_val_score(pipeline, X_train,y_train, scoring='roc_auc', cv=10).mean()

In [None]:
# Score the test rows with the fitted pipeline and store the probability of
# the second class (column 1 of predict_proba) in the sample submission.
# NOTE(review): presumably sample_submission.csv lists the rows in the same
# order as test.csv — confirm.
X_test=test_df[var_features]
submission = pd.read_csv('input/sample_submission.csv')

submission['target'] = pipeline.predict_proba(X_test)[:,1]

In [None]:
# Write the pipeline predictions without the index column.
submission.to_csv('output/2019-03-16-NB_pipeline_.csv', index=False)

Conclusion
---
 Accuracy is more than 92% on the validation set, i.e. the 40% split off from the original training set.  
 


In [None]:
def NB_Classify_1(df_train,df_test,folds=5,ratio=0.6):
    """Fit GaussianNB on several random splits and collect test predictions.

    For each of ``folds`` rounds, ``df_train`` is split with ``split_data``.
    A fresh GaussianNB is fitted on the first part, its accuracy on the second
    part is printed, and its class predictions for ``df_test`` are stored.

    df_train : frame with ``ID_code``, ``target`` and the feature columns.
    df_test : frame with ``ID_code`` and the feature columns.

    Returns a frame with ``ID_code`` plus one ``target_<i>`` column per round.
    """
    non_feature_cols = ['ID_code','target']
    collected = df_test[['ID_code']].copy()
    for round_no in range(folds):
        print("Fold %s" %round_no)
        fit_part, held_out = split_data(df_train,ratio)
        model = GaussianNB()
        model.fit(fit_part.drop(non_feature_cols,axis=1),fit_part.target)
        held_out_pred = model.predict(held_out.drop(non_feature_cols,axis=1))
        accuracy = np.mean(held_out.target==held_out_pred)
        print(" Accuracy %4.3f within %4.2f set." %(accuracy,ratio))
        column = 'target_'+str(round_no)
        collected[column] = model.predict(df_test.drop(['ID_code'],axis=1))
    return collected

In [None]:
# Five random-split GaussianNB rounds; `sub` holds ID_code plus one prediction column per round.
sub=NB_Classify_1(train_df_s,test_df_s,folds=5)

In [None]:
# How many test rows the first round assigned to each predicted class.
sub.target_0.value_counts()

In [None]:
# Average all per-round predictions into `target`, then keep only ID_code and
# that average. The round columns are selected by name: a positional [1:-1]
# slice taken before `target` exists leaves out the last round (target_4).
fold_cols=[c for c in sub.columns if c.startswith('target_')]
sub['target']=sub[fold_cols].mean(axis=1)
sub=sub.drop(fold_cols,axis=1)

sub.to_csv("output/2019-03-16-NB_1.csv",index=False)

In [None]:
# Load a previously written submission file to blend with the NB predictions.
sub_1=pd.read_csv("output/submission-1.09-2.csv")

In [None]:
# Blend: 0.8 x the loaded submission's target + 0.2 x the NB average.
# NOTE(review): the two Series are combined by index label — presumably both
# frames list the test rows in the same order; confirm.
# NOTE(review): the result is later saved as "lgb_0.9_NB_0.1", which does not
# match these 0.8/0.2 weights — confirm which is intended.
sub['target']=sub_1.target*0.8+sub['target']*0.2


In [None]:
# Write the blended submission.
# NOTE(review): the file name says 0.9/0.1, but the blend is computed with
# weights 0.8/0.2 — confirm which is intended.
sub.to_csv("output/2019-03-16-lgb_0.9_NB_0.1.csv",index=False)

In [None]:
# Sum of the loaded submission's target column.
sub_1.target.sum()

 Questions
 ---
 1. Since the data were split randomly, is the result always the same? 
          Redo the above repeatedly to get the final prediction.
 -  This data set uses 200 features, var_0 to var_199, to predict the output, `target`. Presumably every feature is related to the output to some degree; however, is there any noise feature that we can ignore? Furthermore, could such a feature make the prediction worse?    