# Exploratory Data Analysis + Features

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from math import pi

import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns
sns.set(style="white", color_codes=True)
%matplotlib inline 
import scipy
from scipy.stats import describe
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb

import sys
from fastai.structured import *
from fastai.column_data import *
from sklearn.model_selection import *

train = pd.read_csv("../input/train.csv") # the train dataset is now a Pandas DataFrame
test = pd.read_csv("../input/test.csv") # the test dataset is now a Pandas DataFrame

# Let's see what's in the training data - Jupyter notebooks print the result of the last thing you do
train.head()

## Shape of the data

In [None]:
# Report the number of rows and columns of both data sets.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print("Santander Value Prediction Challenge train -  rows:", train_rows, " columns:", train_cols)
print("Santander Value Prediction Challenge test -  rows:", test_rows, " columns:", test_cols)

In [None]:
train.head()

In [None]:
test.head()

## Missing values

In [None]:
train.isnull().values.any()

In [None]:
test.isnull().values.any()

## Types of Feature

In [None]:
# One row per train column ("Count" holds the column name, "Column Type" its dtype),
# then count how many columns share each dtype.
dtype_df = train.dtypes.rename_axis("Count").reset_index(name="Column Type")
dtype_df.groupby("Column Type").aggregate('count').reset_index()

## Distribution of Target Variable

In [None]:
# Histogram with KDE overlay of the target column (missing values removed).
target_values = train['target'].dropna()
plt.title("Distribution of Target")
sns.distplot(target_values, bins=100, kde=True, color='blue')
plt.show()

### Violin distribution of target

In [None]:
# Violin plot of the target column.
sns.set_style("whitegrid")
target_array = train.target.values
ax = sns.violinplot(x=target_array)
plt.show()

In [None]:
# Histogram with KDE overlay of log1p(target) (missing values removed).
log_target_values = np.log1p(train['target']).dropna()
plt.title("Distribution of log(target)")
sns.distplot(log_target_values, bins=100, kde=True, color='blue')
plt.show()

In [None]:
# Violin plot of log(1 + target).
sns.set_style("whitegrid")
log_target_array = np.log(1+train.target.values)
ax = sns.violinplot(x=log_target_array)
plt.show()

## Identifying features that are highly correlated with target

In [None]:
# Pearson correlation of every feature column with the target.
target_values = train["target"].values  # looked up once instead of once per column
labels = [col for col in train.columns if col not in ["ID", "target"]]
values = [np.corrcoef(train[col].values, target_values)[0,1] for col in labels]
corr_df = pd.DataFrame({'columns_labels':labels, 'corr_values':values})
corr_df = corr_df.sort_values(by='corr_values')

# Keep only features with |r| > 0.25; NaN correlations fail the comparison and are dropped too.
corr_df = corr_df[corr_df['corr_values'].abs() > 0.25]
ind = np.arange(corr_df.shape[0])
fig, ax = plt.subplots(figsize=(10,6))
ax.barh(ind, np.array(corr_df.corr_values.values), color='black')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.columns_labels.values, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation coefficient of the variables")
plt.show()

## Correlation matrix of the most highly correlated features

In [None]:
# Pairwise Pearson correlations among the features selected in corr_df, drawn as a heatmap.
selected_features = corr_df.columns_labels.tolist()
temp_df = train[selected_features]
corrmat = temp_df.corr(method='pearson')
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat, vmax=1., square=True, cmap=plt.cm.BrBG, ax=ax)
plt.title("Important variables correlation map", fontsize=15)
plt.show()

## Sparsity

In [None]:
# Fraction of zero entries in each feature column. 'ID' and 'target' are left out:
# the plots below are labelled as feature sparsity, and neither of those is a feature.
feature_cols = [col for col in train.columns if col not in ("ID", "target")]
sparsity = (train[feature_cols] == 0).mean()

fig = plt.figure(figsize=[7,12])
# Top panel: full [0, 1] range.
ax = fig.add_subplot(211)
ax.hist(sparsity, range=(0,1), bins=100)
ax.set_title('Sparsity of features (full range)')
ax.set_xlabel('Sparsity of Features')
ax.set_ylabel('Number of Features')
# Bottom panel: same data, zoomed into [0.8, 1].
ax = fig.add_subplot(212)
ax.hist(sparsity, range=(0.8,1), bins=100)
ax.set_title('Sparsity of features (0.8 to 1)')
ax.set_xlabel('Sparsity of Features')
ax.set_ylabel('Number of Features')
plt.show()

In [None]:
# Settings passed to ColumnarModelData.from_data_frames further down.
cat_flds = []  # no categorical fields are declared
bs = 64  # presumably the batch size — confirm against the fastai signature

In [None]:
# Move 'ID' into the index of the test frame so only feature columns remain as columns.
# Guarded so that re-running this cell does not raise KeyError once 'ID' is already the index.
if 'ID' in test.columns:
    test = test.set_index('ID')
# Log-transform the target on a copy. Overwriting train['target'] in place would make
# every later cell that reads train.target (histograms, log(1+target), the modelling
# target) work on already-logged values, and would log the column again on each re-run.
train_logged = train.assign(target=np.log(train['target']))
x, y, nas = proc_df(train_logged, 'target', skip_flds=['ID'])
# Hold out 10% of the rows for validation, with a fixed seed.
df_train_x, df_val_x, df_train_y, df_val_y = train_test_split(x, y, test_size=0.1, random_state=42)

In [None]:
# Build the fastai columnar data object from the train/validation split and the test frame.
# NOTE(review): is_reg=True / is_multi=False presumably select a single-output regression
# target — confirm against the fastai ColumnarModelData documentation.
model_data = ColumnarModelData.from_data_frames(
    '.', df_train_x, df_val_x, df_train_y, df_val_y, cat_flds, bs, is_reg=True, is_multi=False, test_df=test)

In [None]:
# Arguments for model_data.get_learner in the next cell.
emb_szs = []  # empty, matching the empty cat_flds list
n_cont = len(df_train_x.columns)  # every training column is counted as a continuous input
emb_drop = 0.0
out_sz = 1
szs = [400, 50]  # presumably the hidden layer sizes — TODO confirm
drops = [0.0,0.0]  # presumably one dropout rate per entry of szs — TODO confirm

In [None]:
learner = model_data.get_learner(emb_szs, n_cont, emb_drop, out_sz, szs, drops)

In [None]:
# Run the learner's lr_find2 with lr bounds 1 and 1000 over 500 iterations, then plot the
# recorded schedule. NOTE(review): presumably used to choose the learning rate below — confirm.
learner.lr_find2(start_lr=1, end_lr=1000, num_it=500)
learner.sched.plot()

In [None]:
learner.unfreeze()

In [None]:
# NOTE(review): the learning rate is hard-coded; presumably read off the lr_find2 plot — confirm.
lr = 0.105
learner.fit(lr, 10, cycle_len=2)

In [None]:
preds = learner.predict(is_test = True)

In [None]:
train_describe = train.describe()

In [None]:
train_describe

In [None]:
test_describe = test.describe()

In [None]:
test_describe

In [None]:
# Histogram of the target column: target values on the x-axis, bin counts on the y-axis.
plt.figure(figsize=(12, 5))
plt.hist(train.target.values, bins=100)
plt.title('Histogram target counts')
plt.xlabel('Target')
plt.ylabel('Count')
plt.show()

In [None]:
# Histogram of the feature values in a single training row (positional row 1).
# 'ID' and 'target' are dropped so that only feature values are binned, and the row is
# stored under its own name instead of overwriting `x` from the proc_df cell.
plt.figure(figsize=(30, 5))
row_values = train.iloc[1].drop(['ID', 'target']).astype(float)
plt.hist(row_values.values)
plt.title('Histogram of feature values in one train row')
plt.xlabel('Value')
plt.ylabel('Count')
plt.show()

*This is a highly skewed distribution, so let's re-plot it with a log transform of the target.*



In [None]:
# Histogram of log(1 + target): transformed values on the x-axis, bin counts on the y-axis.
plt.figure(figsize=(12, 5))
plt.hist(np.log(1+train.target.values), bins=100)
plt.title('Histogram target counts')
plt.xlabel('Log 1+Target')
plt.ylabel('Count')
plt.show()

In [None]:
# Violin plot of log(1 + target).
sns.set_style("whitegrid")
log_target_array = np.log(1+train.target.values)
ax = sns.violinplot(x=log_target_array)
plt.show()

*Let's take a look at the statistics of the Log(1+target)*

In [None]:
# Summary statistics of log(1 + target). The one-column frame is an explicit copy, so
# assigning into it cannot raise SettingWithCopyWarning or write through to `train`.
train_log_target = train[['target']].copy()
train_log_target['target'] = np.log(1+train['target'].values)
train_log_target.describe()

*We see that the statistical properties of the Log(1+Target) distribution are much more amenable to modelling.*

*Now let's take a look at columns with constant value.*

In [None]:
# A column is constant when every value equals the value in the first row.
train_is_constant = (train == train.iloc[0]).all()
test_is_constant = (test == test.iloc[0]).all()
constant_train = train.columns[train_is_constant.values].tolist()
constant_test = test.columns[test_is_constant.values].tolist()

In [None]:
print('Number of constant columns in the train set:', len(constant_train))
print('Number of constant columns in the test set:', len(constant_test))

> So this is interesting: there are 256 constant columns in the train set, but none in the test set. These constant columns are thus most likely an artifact of the way that the train and test sets were constructed, and not necessarily irrelevant in their own right. This is yet another byproduct of having a very small dataset. For most problems it would be useful to take a look at the description of these columns, but in this competition they are anonymized, and thus would not yield any useful information.

So let's restrict the columns that we'll use to just those that are not constant.

In [None]:
# Feature columns to model on: every test column except the identifier and the columns
# that are constant in the train set.
# 'ID' is excluded by name rather than by position: an earlier cell moves 'ID' into the
# index of `test`, in which case deleting element 0 would silently drop a real feature.
excluded_columns = set(constant_train) | {'ID'}
columns_to_use = [col for col in test.columns if col not in excluded_columns]
len(columns_to_use)

> So we have a total of 4735 columns to work with. However, as mentioned earlier, most of these columns seem to be filled predominantly with zeros. Let's try to get a better sense of this data.

In [None]:
describe(train[columns_to_use].values, axis=None)

> If we treat all the train matrix values as if they belonged to a single row vector, we see a huge amount of variance, far exceeding the corresponding variance of the target variable.

Now let's plot it to see how diverse the numerical values are.

In [None]:
# Histogram of every value in the selected train columns, flattened into one vector:
# values on the x-axis, bin counts on the y-axis.
plt.figure(figsize=(12, 5))
plt.hist(train[columns_to_use].values.flatten(), bins=50)
plt.title('Histogram all train counts')
plt.xlabel('Value')
plt.ylabel('Count')
plt.show()

> Most of the values are heavily concentrated around 0.

Let's see how this looks on a log scale.

In [None]:
# Histogram of log(value + 1) over every entry of the selected train columns:
# transformed values on the x-axis, bin counts on the y-axis.
plt.figure(figsize=(12, 5))
plt.hist(np.log(train[columns_to_use].values.flatten()+1), bins=50)
plt.title('Log Histogram all train counts')
plt.xlabel('Log value')
plt.ylabel('Count')
plt.show()

> Only a marginal improvement - there is a very small bump close to 15.

Let's try out with violin plot

In [None]:
# Violin plot of log(value + 1) over every entry of the selected train columns.
sns.set_style("whitegrid")
train_log_values = np.log(train[columns_to_use].values.flatten()+1)
ax = sns.violinplot(x=train_log_values)
plt.show()

*Not really - the plot looks nicer, but the overall shape is almost the same.*

Let's take a look at the distribution of non-zero values.

In [None]:
# log(value + 1) of every train entry, keeping only the entries whose transformed value
# is non-zero. Transformed values go on the x-axis, bin counts on the y-axis.
train_nz = np.log(train[columns_to_use].values.flatten()+1)
train_nz = train_nz[np.nonzero(train_nz)]
plt.figure(figsize=(12, 5))
plt.hist(train_nz, bins=50)
plt.title('Log Histogram nonzero train counts')
plt.xlabel('Log value')
plt.ylabel('Count')
plt.show()

In [None]:
# Violin plot of the non-zero log-transformed train values.
sns.set_style("whitegrid")
ax = sns.violinplot(
    x=train_nz,
)
plt.show()

In [None]:
describe(train_nz)

Let's do the same thing with the test data.

In [None]:
# log(value + 1) of every test entry, keeping only the entries whose transformed value
# is non-zero. Transformed values go on the x-axis, bin counts on the y-axis.
test_nz = np.log(test[columns_to_use].values.flatten()+1)
test_nz = test_nz[np.nonzero(test_nz)]
plt.figure(figsize=(12, 5))
plt.hist(test_nz, bins=50)
plt.title('Log Histogram nonzero test counts')
plt.xlabel('Log value')
plt.ylabel('Count')
plt.show()

In [None]:
# Violin plot of the non-zero log-transformed test values.
sns.set_style("whitegrid")
ax = sns.violinplot(
    x=test_nz,
)
plt.show()

In [None]:
describe(test_nz)

*Again, we see that these distributions look similar, but they are definitely not the same.*

Let's take a closer look at the shape and content of the train data. We want to get a better numerical grasp of the true extent of zeros.

In [None]:
train[columns_to_use].values.flatten().shape

In [None]:
((train[columns_to_use].values.flatten())==0).mean()

*Almost 97% of all values in the train dataframe are zeros. That looks pretty sparse to me, but let's see how much variation is there between different columns.*

In [None]:
# Per-column fraction of zero entries in the train set (stored under the 'Percentile' key).
train_zero_fraction = (train[columns_to_use].values == 0).mean(axis=0)
train_zeros = pd.DataFrame({'Percentile': train_zero_fraction,
                            'Column': columns_to_use})
train_zeros.head()

In [None]:
describe(train_zeros.Percentile.values)

*It seems that the vast majority of columns have 95+ percent of zeros in them. Let's see how that looks on a plot.*

In [None]:
# Histogram of the per-column zero fraction: the fraction on the x-axis and the number
# of columns falling in each bin on the y-axis.
plt.figure(figsize=(12, 5))
plt.hist(train_zeros.Percentile.values, bins=50)
plt.title('Histogram percentage zeros train counts')
plt.xlabel('Fraction of zeros per column')
plt.ylabel('Number of columns')
plt.show()

In [None]:
describe(np.log(train[columns_to_use].values+1), axis=None)

In [None]:
describe(test[columns_to_use].values, axis=None)

In [None]:
describe(np.log(test[columns_to_use].values+1), axis=None)

In [None]:
# Per-column fraction of zero entries in the test set (stored under the 'Percentile' key).
# Zeros are tested on the raw values, the same way train_zeros is built: log(1 + v) == 0
# selects the same entries but needlessly transforms the whole matrix first.
test_zero_fraction = (test[columns_to_use].values == 0).mean(axis=0)
test_zeros = pd.DataFrame({'Percentile': test_zero_fraction,
                           'Column': columns_to_use})
test_zeros.head()

In [None]:
describe(test_zeros.Percentile.values)

In [None]:
# Modelling target for the boosted-tree cells below: log(1 + target) as a NumPy array.
# Note this rebinds `y`, which earlier held the target returned by proc_df.
y = np.log(1+train.target.values)
y.shape

In [None]:
y

## Predictive Modeling

In [None]:
train_1 = lgb.Dataset(train[columns_to_use],y ,feature_name = "auto")

In [None]:
# LightGBM hyper-parameters shared by the single fit below and the K-fold cells that follow.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect when
# bagging_freq is non-zero, and bagging_freq is not set here — confirm whether bagging was intended.
# NOTE(review): with max_depth 5 a tree can hold at most 2**5 = 32 leaves, so num_leaves 100
# is presumably not the binding limit at this depth.
params = {'boosting_type': 'gbdt', 
          'objective': 'regression', 
          'metric': 'rmse', 
          'learning_rate': 0.0105, 
          'num_leaves': 100, 
          'feature_fraction': 0.4, 
          'bagging_fraction': 0.6, 
          'max_depth': 5, 
          'min_child_weight': 10}


# Fit on the full training set for 400 rounds. No validation set is passed, so there is
# nothing for per-iteration evaluation logging to print; the verbose_eval flag is omitted
# (it had no effect here and is no longer accepted by lgb.train in LightGBM 4.x).
clf = lgb.train(params,
        train_1,
        num_boost_round = 400)

In [None]:
preds = clf.predict(test[columns_to_use])
preds

In [None]:
# Write the single-model predictions into the sample submission, undoing the
# log(1 + target) transform with exp(pred) - 1.
sample_submission = pd.read_csv("../input/sample_submission.csv")
target_predictions = np.exp(preds)-1
sample_submission['target'] = target_predictions
sample_submission.to_csv('simple_lgbm.csv', index=False)
sample_submission.head()

In [None]:
# 5-fold cross-validation of the LightGBM model with the current `params`.
# y_oof collects out-of-fold predictions; total_preds averages the test predictions over folds.
nr_splits = 5
random_state = 1054

y_oof = np.zeros((y.shape[0]))
total_preds = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof)))

In [None]:
# Same 5-fold cross-validation as before, with max_depth set to 4.
# y_oof_2 collects out-of-fold predictions; total_preds_2 averages the test predictions.
params['max_depth'] = 4

y_oof_2 = np.zeros((y.shape[0]))
total_preds_2 = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds_2 += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof_2[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_2)))

In [None]:
# Same 5-fold cross-validation as before, with max_depth set to 6.
# y_oof_3 collects out-of-fold predictions; total_preds_3 averages the test predictions.
params['max_depth'] = 6

y_oof_3 = np.zeros((y.shape[0]))
total_preds_3 = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds_3 += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof_3[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_3)))

In [None]:
# Same 5-fold cross-validation as before, with max_depth set to 7.
# y_oof_4 collects out-of-fold predictions; total_preds_4 averages the test predictions.
params['max_depth'] = 7

y_oof_4 = np.zeros((y.shape[0]))
total_preds_4 = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds_4 += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof_4[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_4)))

In [None]:
# Same 5-fold cross-validation as before, with max_depth set to 8.
# y_oof_5 collects out-of-fold predictions; total_preds_5 averages the test predictions.
params['max_depth'] = 8

y_oof_5 = np.zeros((y.shape[0]))
total_preds_5 = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds_5 += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof_5[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_5)))

In [None]:
# Same 5-fold cross-validation as before, with max_depth set to 10.
# y_oof_6 collects out-of-fold predictions; total_preds_6 averages the test predictions.
params['max_depth'] = 10

y_oof_6 = np.zeros((y.shape[0]))
total_preds_6 = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds_6 += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof_6[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_6)))

In [None]:
# Same 5-fold cross-validation as before, with max_depth set to 12.
# y_oof_7 collects out-of-fold predictions; total_preds_7 averages the test predictions.
params['max_depth'] = 12

y_oof_7 = np.zeros((y.shape[0]))
total_preds_7 = 0

# Slice the feature matrices once; they are the same for every fold.
train_features = train[columns_to_use]
test_features = test[columns_to_use]

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train, X_val = train_features.iloc[train_index], train_features.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Only a training Dataset is built: lgb.train receives no validation set, so a
    # validation Dataset and the verbose_eval flag would both be unused.
    train_1 = lgb.Dataset(X_train, y_train, feature_name="auto")
    clf = lgb.train(params, train_1, num_boost_round=400)

    total_preds_7 += clf.predict(test_features)/nr_splits
    pred_oof = clf.predict(X_val)
    y_oof_7[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_7)))

In [None]:
# Out-of-fold RMSE of three hand-weighted blends of the LightGBM runs:
# the max_depth 8/10/12 runs, the max_depth 5/4/6/7 runs, and a 75/25 mix of the two.
deep_blend_oof = 1.4*(1.6*y_oof_7-0.6*y_oof_6)-0.4*y_oof_5
shallow_blend_oof = -0.5*y_oof-0.5*y_oof_2-y_oof_3+3*y_oof_4
print('Total error', np.sqrt(mean_squared_error(y, deep_blend_oof)))
print('Total error', np.sqrt(mean_squared_error(y, shallow_blend_oof)))
print('Total error', np.sqrt(mean_squared_error(y, 0.75*deep_blend_oof+0.25*shallow_blend_oof)))

In [None]:
# Apply the 75/25 blend weights to the fold-averaged test predictions, undo the
# log(1 + target) transform and write the submission file.
deep_blend_test = 1.4*(1.6*total_preds_7-0.6*total_preds_6)-0.4*total_preds_5
shallow_blend_test = -0.5*total_preds-0.5*total_preds_2-total_preds_3+3*total_preds_4
sub_preds = 0.75*deep_blend_test+0.25*shallow_blend_test
sample_submission['target'] = np.exp(sub_preds)-1
sample_submission.to_csv('submission_1.csv', index=False)
sample_submission.head()

In [None]:
# XGBoost hyper-parameters; this rebinds `params`, replacing the LightGBM dictionary.
params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.01,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}

# y_oof_8 collects out-of-fold predictions; total_preds_8 averages the test predictions.
y_oof_8 = np.zeros((y.shape[0]))
total_preds_8 = 0

train_features = train[columns_to_use]
dtest = xgb.DMatrix(test[columns_to_use])

kf = KFold(n_splits=nr_splits, shuffle=True, random_state=random_state)
for i, (train_index, val_index) in enumerate(kf.split(y)):
    print('Fitting fold', i+1, 'out of', nr_splits)
    X_train = train_features.iloc[train_index]
    X_val = train_features.iloc[val_index]
    y_train = y[train_index]
    y_val = y[val_index]

    train_1 = xgb.DMatrix(X_train, y_train)
    val = xgb.DMatrix(X_val, y_val)

    # Up to 1000 rounds, evaluated on both sets, with early stopping after 60 rounds.
    watchlist = [(train_1, 'train'), (val, 'val')]
    clf = xgb.train(params, train_1, 1000, watchlist,
                    maximize=False, early_stopping_rounds = 60, verbose_eval=100)

    # Predict with the tree limit recorded by early stopping.
    best_limit = clf.best_ntree_limit
    total_preds_8 += clf.predict(dtest, ntree_limit=best_limit)/nr_splits
    pred_oof = clf.predict(val, ntree_limit=best_limit)
    y_oof_8[val_index] = pred_oof
    print('Fold error', np.sqrt(mean_squared_error(y_val, pred_oof)))

print('Total error', np.sqrt(mean_squared_error(y, y_oof_8)))

In [None]:
# Out-of-fold RMSE of the final blend: 70% LightGBM blend, 30% XGBoost.
lgb_blend_oof = 0.75*(1.4*(1.6*y_oof_7-0.6*y_oof_6)-0.4*y_oof_5)+0.25*(-0.5*y_oof-0.5*y_oof_2-y_oof_3+3*y_oof_4)
print('Total error', np.sqrt(mean_squared_error(y, 0.7*lgb_blend_oof+0.3*y_oof_8)))

In [None]:
# Apply the final 70/30 blend weights to the fold-averaged test predictions, undo the
# log(1 + target) transform and write the submission file.
lgb_blend_test = 0.75*(1.4*(1.6*total_preds_7-0.6*total_preds_6)-0.4*total_preds_5)+0.25*(-0.5*total_preds-0.5*total_preds_2-total_preds_3+3*total_preds_4)
sub_preds = 0.7*lgb_blend_test+0.3*total_preds_8

sample_submission['target'] = np.exp(sub_preds)-1
sample_submission.to_csv('blended_submission_2.csv', index=False)
sample_submission.head()