### Load libraries 

In [159]:
import numpy as np
import scipy as sp
import pandas as pd
import re
from IPython.display import display

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 135

from collections import namedtuple

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture

import xgboost as xgb

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Merge, Reshape
from keras.layers.embeddings import Embedding

### Load data files 

In [6]:
# Lightweight records used throughout the notebook:
#   Files     - paths of the train / test CSV files
#   RawData   - the corresponding raw DataFrames
#   ModelData - the train / validation / test split handed to the models
Files = namedtuple('Files', 'train test')
RawData = namedtuple('RawData', 'train test')
ModelData = namedtuple('ModelData', 'X_train y_train X_val y_val X_test')

In [7]:
# Paths are relative to the notebook's working directory; read_csv is handed
# the .csv.gz files directly.
datafiles = Files(train='Data/train.csv.gz', test='Data/test.csv.gz')

# Read train first, then test, and keep both frames together.
rawdata = RawData(*(pd.read_csv(path) for path in datafiles))

In [8]:
# Preview the first rows of the training frame: id, cat1..cat116, cont1..cont14 and the loss target.
rawdata.train.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat23,cat24,cat25,cat26,cat27,cat28,cat29,cat30,cat31,cat32,cat33,cat34,cat35,cat36,cat37,cat38,cat39,cat40,cat41,cat42,cat43,cat44,cat45,cat46,cat47,cat48,cat49,cat50,cat51,cat52,cat53,cat54,cat55,cat56,cat57,cat58,cat59,cat60,cat61,cat62,cat63,cat64,cat65,cat66,cat67,cat68,cat69,cat70,cat71,cat72,cat73,cat74,cat75,cat76,cat77,cat78,cat79,cat80,cat81,cat82,cat83,cat84,cat85,cat86,cat87,cat88,cat89,cat90,cat91,cat92,cat93,cat94,cat95,cat96,cat97,cat98,cat99,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,cat108,cat109,cat110,cat111,cat112,cat113,cat114,cat115,cat116,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,A,B,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,D,B,B,D,D,B,D,C,B,D,B,A,A,A,A,A,D,B,C,E,A,C,T,B,G,A,A,I,E,G,J,G,BU,BC,C,AS,S,A,O,LB,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,D,B,B,D,D,A,B,C,B,D,B,A,A,A,A,A,D,D,C,E,E,D,T,L,F,A,A,E,E,I,K,K,BI,CQ,A,AV,BM,A,O,DP,0.330514,0.737068,0.592681,0.614134,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,B,B,B,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,D,B,B,B,D,B,D,C,B,B,B,A,A,A,A,A,D,D,C,E,E,A,D,L,O,A,B,E,F,H,F,A,AB,DK,A,C,AF,A,I,GK,0.261841,0.358319,0.484196,0.236924,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,D,B,B,D,D,D,B,C,B,D,B,A,A,A,A,A,D,D,C,E,E,D,T,I,D,A,A,E,E,I,K,K,BI,CS,C,N,AE,A,O,DJ,0.321594,0.555782,0.527991,0.373816,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,B,A,B,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,A,B,A,A,A,A,D,B,D,B,D,B,B,C,B,B,C,A,A,A,B,H,D,B,D,E,E,A,P,F,J,A,A,D,E,K,G,B,H,C,C,Y,BM,A,K,CK,0.273204,0.15999,0.527991,0.473202,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


### Evaluate raw data

In [53]:
# Profile every categorical column (name starting with "cat"):
#   First_Freq       - share of rows taken by the most frequent value
#   Logical_Freq     - share each value would have if all were equally likely
#   Distinct_Val_Cnt - number of distinct values
analysis_rows = []
for c in rawdata.train.columns.values:
    if re.match(r'^cat.*', c):
        # value_counts() is sorted by descending frequency and indexed by the
        # category labels, so the top entry must be read positionally: an
        # integer key such as v[0] relies on a deprecated positional fallback.
        v = rawdata.train[c].value_counts()
        first_pct = v.iloc[0] * 1.0 / v.sum()

        # get count of distinct values (missing values count as one value)
        d_cnt = rawdata.train[c].nunique(dropna=False)

        # frequency each value would have under a uniform distribution
        logical_pct = 1.0 / d_cnt

        analysis_rows.append((c, first_pct, logical_pct, d_cnt))

categorical_field_analysis = pd.DataFrame(analysis_rows,
                                 columns=['Cat_Col',
                                          'First_Freq',
                                          'Logical_Freq',
                                          'Distinct_Val_Cnt',])
display(categorical_field_analysis)

Unnamed: 0,Cat_Col,First_Freq,Logical_Freq,Distinct_Val_Cnt
0,cat1,0.751654,0.5,2
1,cat2,0.566706,0.5,2
2,cat3,0.945173,0.5,2
3,cat4,0.681799,0.5,2
4,cat5,0.657064,0.5,2
5,cat6,0.699312,0.5,2
6,cat7,0.975711,0.5,2
7,cat8,0.941355,0.5,2
8,cat9,0.600697,0.5,2
9,cat10,0.850758,0.5,2


In [50]:
# Names of the continuous columns (those starting with "cont"), followed by
# their summary statistics on the training set.
continuous_fields = list(filter(lambda name: re.match(r'^cont.*', name),
                                rawdata.train.columns.values))
rawdata.train[continuous_fields].describe()

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,0.493861,0.507188,0.498918,0.491812,0.487428,0.490945,0.48497,0.486437,0.485506,0.498066,0.493511,0.49315,0.493138,0.495717
std,0.18764,0.207202,0.202105,0.211292,0.209027,0.205273,0.17845,0.19937,0.18166,0.185877,0.209737,0.209427,0.212777,0.222488
min,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.179722
25%,0.34609,0.358319,0.336963,0.327354,0.281143,0.336105,0.350175,0.3128,0.35897,0.36458,0.310961,0.311661,0.315758,0.29461
50%,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.438285,0.44106,0.44145,0.46119,0.457203,0.462286,0.363547,0.407403
75%,0.623912,0.681761,0.634224,0.652072,0.643315,0.655021,0.591045,0.62358,0.56682,0.61459,0.678924,0.675759,0.689974,0.724623
max,0.984975,0.862654,0.944251,0.954297,0.983674,0.997162,1.0,0.9802,0.9954,0.99498,0.998742,0.998484,0.988494,0.844848


In [173]:
# Partition the categorical columns by cardinality: exactly two distinct
# values ("binary") versus more than two ("complex").
binary_filter = categorical_field_analysis['Distinct_Val_Cnt'] == 2
complex_filter = categorical_field_analysis['Distinct_Val_Cnt'] > 2
field_sets = dict(
    binary=categorical_field_analysis.loc[binary_filter, 'Cat_Col'].values,
    complex=categorical_field_analysis.loc[complex_filter, 'Cat_Col'].values,
)

### Label encode

In [215]:
# One LabelEncoder per multi-valued categorical column, fitted on the
# training values and keyed by column name (fit() returns the encoder itself).
label_encoders = {
    column: LabelEncoder().fit(rawdata.train[column])
    for column in field_sets['complex']
}

### Entity Embedding

In [234]:
# Build one input sub-model per feature group; their outputs are concatenated
# by the Merge layer below, so the order of `models` defines the order of the
# input arrays expected at fit time (binary block first, then one array per
# complex column in field_sets['complex'] order).
# NOTE(review): Merge, `init=` and keras.layers.core look like Keras 1.x-era
# APIs — confirm the Keras version this notebook is pinned to.
models = []

# binary fields: a single dense layer over all binary columns at once
# (m = number of binary columns; n, the row count, is not used in this cell)
n, m = rawdata.train.loc[:,field_sets['binary']].shape
bin_model = Sequential()
bin_model.add(Dense(m, input_dim=m))
models.append(bin_model)

# complex fields: one embedding per multi-valued categorical column
for c in field_sets['complex']:
    # m = number of distinct values seen in the training data for this column
    m = len(set(rawdata.train.loc[:, c].values))
    new_m = m//2 # blindly halve: embedding width is half the cardinality
    comp_model = Sequential()
    comp_model.add(Embedding(m, new_m, input_length=1))
    # flatten the length-1 sequence so the output is a plain (new_m,) vector
    comp_model.add(Reshape(target_shape=(new_m,)))
    models.append(comp_model)

# NN model: concatenated sub-model outputs -> two hidden layers -> 1 output
model = Sequential()
model.add(Merge(models, mode='concat'))
model.add(Dropout(0.02))
model.add(Dense(1000, init='uniform'))
model.add(Activation('relu'))
model.add(Dense(500, init='uniform'))
model.add(Activation('relu'))
model.add(Dense(1))
# Sigmoid bounds the prediction to (0, 1); the targets passed to fit are
# scaled as log(loss) / max_y by preprocessing_Y.
model.add(Activation('sigmoid'))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [235]:
# Largest log-loss in the training set; preprocessing_Y divides by it to scale targets.
max_y = np.log(rawdata.train.loss).max()

def preprocessing_X(X_dat):
    """Convert a raw feature frame into the list of arrays the merged model expects.

    Parameters
    ----------
    X_dat : pandas.DataFrame
        Frame containing every column named in ``field_sets['binary']`` and
        ``field_sets['complex']``.

    Returns
    -------
    list of numpy.ndarray
        First the 2-D block of binary columns with 'A' mapped to 1 and 'B' to 0,
        then one 1-D array of label-encoded integers per complex column, in
        ``field_sets['complex']`` order.

    Notes
    -----
    ``.values`` is used instead of ``DataFrame.as_matrix()``, which has been
    removed from pandas.
    """
    X_out = []

    # All binary columns form a single 2-D input block.
    binary_block = X_dat.loc[:, field_sets['binary']].replace(['A', 'B'], [1, 0])
    X_out.append(binary_block.values)

    # Each complex column becomes its own integer-coded input.
    for c in field_sets['complex']:
        raw_values = X_dat.loc[:, c].values
        X_out.append(label_encoders[c].transform(raw_values))
    return X_out

def preprocessing_Y(y_dat):
    """Return the log of the target values divided by the module-level ``max_y``."""
    log_target = np.log(y_dat.values)
    return log_target / max_y

# Hold-out split by position: the first n rows train the network, the rest validate it.
n = 100000
X_train, X_val = rawdata.train.iloc[:n, :], rawdata.train.iloc[n:, :]
y_train, y_val = rawdata.train.loss[:n], rawdata.train.loss[n:]

In [None]:
# Train the embedding network on the first split and report validation loss
# on the remainder after every epoch.
model.fit(
    preprocessing_X(X_train),
    preprocessing_Y(y_train),
    batch_size=128,
    nb_epoch=10,
    validation_data=(preprocessing_X(X_val), preprocessing_Y(y_val)),
)

### OHE encode data

In [9]:
def preprocess_features(X):
    '''Yield a model-ready version of every column of ``X``, in column order.

    Object-dtype columns first have the literal values 'yes'/'no' replaced by
    1/0. Any column that is still object-dtype after that is expanded into
    dummy (one-hot) columns named ``<column>_<value>``. All other columns are
    yielded unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame.

    Yields
    ------
    pandas.Series or pandas.DataFrame
        The untouched/recoded column, or its one-hot DataFrame. Concatenate
        the results along ``axis=1`` to obtain the full feature matrix.
    '''
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    for col, col_data in X.items():

        # If data type is non-numeric, replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # If the column is still non-numeric, convert it to dummy variables
        if col_data.dtype == object:
            # Example: 'cat1' => 'cat1_A' and 'cat1_B'
            col_data = pd.get_dummies(col_data, prefix=col)

        # Collect the revised columns
        yield col_data

In [10]:
# split out ids 
# Train and test are stacked so that one-hot encoding sees every category
# value and produces identical columns for both. ignore_index=True gives the
# stacked frames a unique 0..N-1 index; without it the train and test row
# labels overlap, which makes any label-based lookup or reindex ambiguous.
# Downstream cells slice by position (.iloc), so row order is what matters.

# split out ids
ids = pd.concat((rawdata.train.id, rawdata.test.id), axis=0, ignore_index=True)

# concatenate all data (train rows first, then test rows)
X_all = pd.concat((rawdata.train.drop(['id','loss'], axis=1),
                  rawdata.test.drop(['id'], axis=1)),
                  axis=0, ignore_index=True)

# process all data: one-hot encode column by column, then glue back together
X_processed = pd.concat(list(preprocess_features(X_all)), axis=1)

# set target variable (training rows only)
Y = rawdata.train.loss

In [11]:
# (rows, columns) of the one-hot encoded train + test feature matrix.
X_processed.shape

(313864, 1190)

In [12]:
# Number of training rows: rows before this position in X_processed are train, the rest are test.
rawdata.train.shape[0]

188318

In [13]:
# Positional boundaries inside X_processed:
#   [0, train_rows)         -> training rows
#   [train_rows, val_rows)  -> validation rows
#   [val_rows, end)         -> test rows (no target available)
train_rows = 100000
val_rows = len(rawdata.train)

# Every labelled row, used later to refit on the full training set.
fulltrain = X_processed.iloc[:val_rows]

modeldata = ModelData(
    X_train=X_processed.iloc[:train_rows],
    y_train=Y.iloc[:train_rows],
    X_val=X_processed.iloc[train_rows:val_rows],
    y_val=Y.iloc[train_rows:val_rows],
    X_test=X_processed.iloc[val_rows:],
)

In [14]:
# Summary statistics of the encoded training features (dummy columns and cont1..cont14).
modeldata.X_train.describe()

Unnamed: 0,cat1_A,cat1_B,cat2_A,cat2_B,cat3_A,cat3_B,cat4_A,cat4_B,cat5_A,cat5_B,cat6_A,cat6_B,cat7_A,cat7_B,cat8_A,cat8_B,cat9_A,cat9_B,cat10_A,cat10_B,cat11_A,cat11_B,cat12_A,cat12_B,cat13_A,cat13_B,cat14_A,cat14_B,cat15_A,cat15_B,cat16_A,cat16_B,cat17_A,cat17_B,cat18_A,cat18_B,cat19_A,cat19_B,cat20_A,cat20_B,cat21_A,cat21_B,cat22_A,cat22_B,cat23_A,cat23_B,cat24_A,cat24_B,cat25_A,cat25_B,cat26_A,cat26_B,cat27_A,cat27_B,cat28_A,cat28_B,cat29_A,cat29_B,cat30_A,cat30_B,cat31_A,cat31_B,cat32_A,cat32_B,cat33_A,cat33_B,cat34_A,...,cat116_LJ,cat116_LK,cat116_LL,cat116_LM,cat116_LN,cat116_LO,cat116_LP,cat116_LQ,cat116_LR,cat116_LS,cat116_LT,cat116_LU,cat116_LV,cat116_LW,cat116_LX,cat116_LY,cat116_M,cat116_MA,cat116_MB,cat116_MC,cat116_MD,cat116_ME,cat116_MF,cat116_MG,cat116_MH,cat116_MI,cat116_MJ,cat116_MK,cat116_ML,cat116_MM,cat116_MN,cat116_MO,cat116_MP,cat116_MQ,cat116_MR,cat116_MS,cat116_MT,cat116_MU,cat116_MV,cat116_MW,cat116_MX,cat116_N,cat116_O,cat116_P,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.75433,0.24567,0.56543,0.43457,0.94539,0.05461,0.6835,0.3165,0.65775,0.34225,0.69885,0.30115,0.97504,0.02496,0.94134,0.05866,0.59915,0.40085,0.84997,0.15003,0.89268,0.10732,0.84838,0.15162,0.89552,0.10448,0.98773,0.01227,0.99981,0.00019,0.96579,0.03421,0.99276,0.00724,0.99467,0.00533,0.9907,0.0093,0.99899,0.00101,0.99778,0.00222,0.99977,0.00023,0.83637,0.16363,0.96611,0.03389,0.9035,0.0965,0.94104,0.05896,0.89424,0.10576,0.96028,0.03972,0.98064,0.01936,0.98123,0.01877,0.97149,0.02851,0.99351,0.00649,0.99507,0.00493,0.99684,...,0.00208,8e-05,0.00118,0.01693,0.0159,0.01077,0.0,0.00315,0.00026,0.0,0.00024,2e-05,0.00164,0.00348,0.00196,0.00675,3e-05,0.00037,0.0,0.00194,0.01061,0.00147,1e-05,0.002,0.0,0.00055,0.00364,0.00011,9e-05,5e-05,2e-05,0.00016,0.00048,0.00016,0.00013,0.0,0.0,0.0002,1e-05,0.00018,0.0,0.0,1e-05,1e-05,2e-05,1e-05,1e-05,1e-05,8e-05,1e-05,1e-05,2e-05,1e-05,0.49326,0.50717,0.498687,0.491165,0.487172,0.490604,0.484356,0.48555,0.485119,0.497532,0.492876,0.492545,0.493294,0.496731
std,0.430486,0.430486,0.495703,0.495703,0.227219,0.227219,0.465113,0.465113,0.474465,0.474465,0.45876,0.45876,0.156004,0.156004,0.234988,0.234988,0.490073,0.490073,0.357103,0.357103,0.309521,0.309521,0.358654,0.358654,0.305884,0.305884,0.110089,0.110089,0.013783,0.013783,0.181769,0.181769,0.08478,0.08478,0.072812,0.072812,0.095988,0.095988,0.031765,0.031765,0.047065,0.047065,0.015164,0.015164,0.369941,0.369941,0.180947,0.180947,0.295277,0.295277,0.235551,0.235551,0.307532,0.307532,0.195302,0.195302,0.137787,0.137787,0.135712,0.135712,0.166426,0.166426,0.080299,0.080299,0.070041,0.070041,0.056125,...,0.04556,0.008944,0.034331,0.12901,0.125089,0.103219,0.0,0.056037,0.016122,0.0,0.01549,0.004472,0.040464,0.058889,0.044229,0.081881,0.005477,0.019232,0.0,0.044003,0.102457,0.038313,0.003162,0.044677,0.0,0.023446,0.060223,0.010488,0.009486,0.007071,0.004472,0.012648,0.021904,0.012648,0.011401,0.0,0.0,0.014141,0.003162,0.013415,0.0,0.0,0.003162,0.003162,0.004472,0.003162,0.003162,0.003162,0.008944,0.003162,0.003162,0.004472,0.003162,0.187697,0.207296,0.202301,0.211479,0.208837,0.205241,0.178139,0.198707,0.181771,0.185748,0.209513,0.209156,0.212668,0.222586
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6e-05,0.001149,0.002634,0.176921,0.281143,0.012683,0.069503,0.23688,8e-05,0.0,0.035321,0.036232,0.000228,0.180268
25%,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34609,0.358319,0.336963,0.327354,0.281143,0.335056,0.350175,0.3128,0.35897,0.36458,0.310961,0.308395,0.315758,0.294839
50%,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.475784,0.555782,0.527991,0.452887,0.422268,0.440945,0.437799,0.44106,0.43731,0.46119,0.457203,0.462286,0.363547,0.410939
75%,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.623912,0.681761,0.634224,0.652072,0.635304,0.655021,0.590568,0.61795,0.55855,0.61459,0.678924,0.675759,0.689974,0.724797
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.984452,0.862654,0.944251,0.952482,0.983674,0.997162,1.0,0.9802,0.99379,0.99498,0.998742,0.998484,0.988494,0.844848


In [15]:
# Distribution of the raw (untransformed) loss target on the training split.
modeldata.y_train.describe()

count    100000.00000
mean       3040.33884
std        2922.19542
min           5.25000
25%        1200.77750
50%        2119.06500
75%        3869.95250
max      121012.25000
Name: loss, dtype: float64

In [16]:
# Spot-check the first encoded training rows.
modeldata.X_train.head()

Unnamed: 0,cat1_A,cat1_B,cat2_A,cat2_B,cat3_A,cat3_B,cat4_A,cat4_B,cat5_A,cat5_B,cat6_A,cat6_B,cat7_A,cat7_B,cat8_A,cat8_B,cat9_A,cat9_B,cat10_A,cat10_B,cat11_A,cat11_B,cat12_A,cat12_B,cat13_A,cat13_B,cat14_A,cat14_B,cat15_A,cat15_B,cat16_A,cat16_B,cat17_A,cat17_B,cat18_A,cat18_B,cat19_A,cat19_B,cat20_A,cat20_B,cat21_A,cat21_B,cat22_A,cat22_B,cat23_A,cat23_B,cat24_A,cat24_B,cat25_A,cat25_B,cat26_A,cat26_B,cat27_A,cat27_B,cat28_A,cat28_B,cat29_A,cat29_B,cat30_A,cat30_B,cat31_A,cat31_B,cat32_A,cat32_B,cat33_A,cat33_B,cat34_A,...,cat116_LJ,cat116_LK,cat116_LL,cat116_LM,cat116_LN,cat116_LO,cat116_LP,cat116_LQ,cat116_LR,cat116_LS,cat116_LT,cat116_LU,cat116_LV,cat116_LW,cat116_LX,cat116_LY,cat116_M,cat116_MA,cat116_MB,cat116_MC,cat116_MD,cat116_ME,cat116_MF,cat116_MG,cat116_MH,cat116_MI,cat116_MJ,cat116_MK,cat116_ML,cat116_MM,cat116_MN,cat116_MO,cat116_MP,cat116_MQ,cat116_MR,cat116_MS,cat116_MT,cat116_MU,cat116_MV,cat116_MW,cat116_MX,cat116_N,cat116_O,cat116_P,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.330514,0.737068,0.592681,0.614134,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496
2,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261841,0.358319,0.484196,0.236924,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321594,0.555782,0.527991,0.373816,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273204,0.15999,0.527991,0.473202,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606


In [85]:
# Classify the one-hot encoded columns.
#   categorical_cols / continuous_cols - encoded column names, in frame order
#   ohe_binary / ohe_complex           - source categorical columns, grouped by
#                                        the cardinality classes in field_sets
#   ohe_other                          - source categorical columns that are in
#                                        neither class
ohe_binary = set()
ohe_complex = set()
ohe_other = set()
categorical_cols = list()
continuous_cols = list()

for c in modeldata.X_train.columns.values:
    if re.match('^cat.*', c):
        categorical_cols.append(c)
        # Dummy columns are named '<source column>_<value>'; split only on the
        # first underscore so a value that itself contains '_' cannot break
        # the parse.
        col = c.split('_', 1)[0]

        if col in field_sets['binary']:
            ohe_binary.add(col)

        elif col in field_sets['complex']:
            # Previously these were added to ohe_other, which left
            # ohe_complex (and ohe_field_sets['complex']) permanently empty.
            ohe_complex.add(col)

        else:
            ohe_other.add(col)
    else:
        continuous_cols.append(c)

ohe_field_sets = {
    'binary': ohe_binary,
    'complex': ohe_complex,
}

### XGBoost

In [None]:
# XGBoost is trained on log(loss), so both labelled matrices carry
# log-transformed targets; the test matrix has features only.
dtrain, dval = (
    xgb.DMatrix( features, label=np.log(target) )
    for features, target in ((modeldata.X_train, modeldata.y_train),
                             (modeldata.X_val, modeldata.y_val))
)
dtest = xgb.DMatrix( modeldata.X_test )

In [None]:
# Booster hyper-parameters.
d = 2      # max tree depth
e = 0.01   # learning rate (eta)
t = 600    # boosting rounds; t+1 rounds are run below
param = {'max_depth':d,
         'eta':e,
         'subsample':0.5,
         'colsample_bytree':0.5,
         'colsample_bylevel':0.5,
         'silent':1,
         'alpha':0.2,
         'objective':'reg:linear',
         'eval_metric':'mae',
         'nthread':2 }
# NOTE(review): 'silent' and 'reg:linear' are the names used by older xgboost
# releases — confirm against the installed version before renaming them.

# Evaluate on both splits during training.
# NOTE: 'mae' here is measured on log(loss), since the labels are log-transformed.
evallist  = [(dval,'eval'), (dtrain,'train')]

# Pass the dict itself: on Python 3 param.items() is a dict view, which is
# neither a dict nor a list and is not a supported `params` argument.
xgb_model = xgb.train(param, dtrain, t+1, evallist, verbose_eval=t//10)

#### Re-train with full dataset

In [None]:
# Refit on every labelled row (train + validation) with the same settings,
# then predict the test set. Predictions are on the log(loss) scale.
# Note: this rebinds `dtrain` to the full training matrix.
dtrain = xgb.DMatrix( fulltrain, label=np.log(Y) )
# Pass the parameter dict directly; a dict_items view is not a supported
# `params` argument.
xgb_model = xgb.train(param, dtrain, t+1)
predictions = xgb_model.predict(dtest)

In [None]:
# Pair each test id with its prediction, mapped back from log scale with exp(),
# and write the result as a two-column CSV.
# NOTE(review): confirm the header names ('Id', 'loss') against the expected submission format.
submission = pd.DataFrame(
    np.vstack([rawdata.test.id.astype(str), np.exp(predictions)]).transpose(),
    columns=['Id','loss'],
)
submission.to_csv('submission.csv', index=False)