In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn import preprocessing, ensemble, linear_model
from scipy.stats import skew, boxcox
from IPython.core.pylabtools import figsize
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

from os import path
def to_filename(name):
    """Return the relative path ../data/allstate/<name>.csv for a dataset name."""
    csv_name = name + ".csv"
    return path.join("..", "data", "allstate", csv_name)

import seaborn as sns
sns.set_style("whitegrid")



In [2]:
# Read both competition files; the first CSV column becomes the row index.
train, test = (
    pd.read_csv(to_filename(split), index_col=0) for split in ("train", "test")
)
shape_report = "shape: train {}, test {}".format(train.shape, test.shape)
print(shape_report)
print(train.head(2))

shape: train (188318, 131), test (125546, 130)
   cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10   ...        cont6  \
id                                                      ...                
1     A    B    A    B    A    A    A    A    B     A   ...     0.718367   
2     A    B    A    A    A    A    A    A    B     B   ...     0.438917   

       cont7    cont8    cont9   cont10    cont11    cont12    cont13  \
id                                                                      
1   0.335060  0.30260  0.67135  0.83510  0.569745  0.594646  0.822493   
2   0.436585  0.60087  0.35127  0.43919  0.338312  0.366307  0.611431   

      cont14     loss  
id                     
1   0.714843  2213.18  
2   0.304496  1283.60  

[2 rows x 131 columns]


In [3]:
# Modelling target: natural log of the loss, shifted so that its mean is zero.
# `mean_resp` keeps the shift so predictions can be mapped back later.
response = np.log(train["loss"])
mean_resp = response.mean()
response = response - mean_resp

def restore_pred(y, offset=None):
    """Map values from the centred log scale back to the original loss scale.

    Computes ``exp(y + offset)``, the inverse of "take the log, then subtract
    the mean".

    Parameters
    ----------
    y : scalar or array-like
        Values on the centred log scale (e.g. model predictions).
    offset : float, optional
        The amount that was subtracted from the log values. When omitted, the
        notebook-level ``mean_resp`` is used, which preserves the original
        one-argument behaviour.

    Returns
    -------
    Same shape as ``y``: the values on the original scale.
    """
    if offset is None:
        # Fall back to the shift computed when the response was built.
        offset = mean_resp
    return np.exp(y + offset)

In [4]:
# Categorical predictors are recognised by the "cat" prefix of their column name.
cat_features = [name for name in train.columns if name.startswith("cat")]
print("Categorical columns:", cat_features)

('Categorical columns:', ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'c

In [5]:
# Method 2: target (mean) encoding of the categorical features.
# Every category level is replaced by the mean of the centred log-loss
# `response` observed for that level in the training set.
#
# NOTE: the level means are computed from the *whole* training set, so any
# cross-validation run on these columns afterwards has already seen the
# held-out targets through the encoding.
for col in cat_features:
    # level -> mean response, e.g. {"A": 0.12, "B": -0.40}
    key_map = response.groupby(train[col]).mean().to_dict()
    # Series.map performs a plain per-row lookup; levels that are absent from
    # key_map (categories that only occur in the test set) come out as NaN,
    # so no explicit NaN back-filling of the mapping is needed.
    train[col] = train[col].map(key_map)
    test[col] = test[col].map(key_map)

In [6]:
# Impute missing values and standardise every feature column.
# Iterating over test.columns leaves the target column `loss` (present only in
# train) untouched.
for col in test.columns:
    # Every preprocessing statistic comes from the training data: the same
    # train mean fills gaps in both frames, consistent with the scaler below,
    # which is also fitted on train only.
    train_mean = train[col].mean()
    train.loc[train[col].isnull(), col] = train_mean
    test.loc[test[col].isnull(), col] = train_mean

    sc = preprocessing.StandardScaler()
    sc.fit(train[[col]])
    # transform() is given a one-column frame and returns a 2-D array;
    # flatten it before assigning it back to a single column.
    train[col] = sc.transform(train[[col]]).ravel()
    test[col] = sc.transform(test[[col]]).ravel()

In [7]:
# Spot-check the first two training rows after the encoding and scaling cells.
train.head(2)

Unnamed: 0_level_0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.574804,1.143636,-0.240848,1.463785,-0.722441,0.655727,-0.157776,-0.249598,1.226524,-0.418835,...,1.107908,-0.84007,-0.922092,1.023032,1.813218,0.363476,0.484637,1.547892,0.984894,2213.18
2,0.574804,1.143636,-0.240848,-0.68316,-0.722441,0.655727,-0.157776,-0.249598,1.226524,2.387575,...,-0.253457,-0.271142,0.573972,-0.738944,-0.316748,-0.739973,-0.605672,0.555951,-0.859471,1283.6


In [8]:
# Feature matrix: every column except the raw target.
# `axis` is passed by keyword because newer pandas releases no longer accept
# it positionally.
X = train.drop("loss", axis=1).values
# Target vector: the centred log-loss computed earlier in the notebook.
y = response.values

In [None]:
# model = linear_model.LassoLarsCV(cv=5)

LassoLarsCV: ~1257

In [None]:
# K-fold cross-validation of a random forest on the centred log-loss target.
# Scores are reported as MAE on the original loss scale (via restore_pred).
folds = 3
seed = 0  # fixed so that fold assignment and the forest are repeatable
kf = KFold(folds, shuffle=True, random_state=seed)

fold_mae = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    x_train, y_train = X[train_index], y[train_index]
    x_test, y_test = X[test_index], y[test_index]
    # Alternative estimator tried here earlier: linear_model.LassoLarsCV(cv=5)
    # NOTE(review): newer scikit-learn releases name this criterion
    # "absolute_error"; adjust the string if the installed version rejects "mae".
    model = ensemble.RandomForestRegressor(
        n_estimators=100, n_jobs=-1, criterion="mae", random_state=seed
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Conventional argument order is (y_true, y_pred).
    fold_mae.append(mean_absolute_error(restore_pred(y_test), restore_pred(y_pred)))
    print("mae", i, fold_mae[-1])

print("mae mean over {} folds: {}".format(folds, np.mean(fold_mae)))

In [None]:
# Refit the last estimator on the full training set.
# The target is the centred log-loss `response` (the same target used in the
# cross-validation loop), so predictions can be mapped back with restore_pred;
# fitting on the raw `train.loss` would put them on a different scale.
model.fit(train.drop("loss", axis=1), response)

In [None]:
# Bind a second name to the same estimator object (no copy is made).
res = model

In [None]:
# Plot the cross-validated error along the LassoLars regularisation path.
# The CV-path attributes used here exist only on LassoLarsCV, whereas `model`
# is whichever estimator was fitted last (a random forest after the CV cell),
# so a dedicated estimator is fitted for this figure.
lasso_cv = linear_model.LassoLarsCV(cv=5).fit(X, y)

# Newer scikit-learn exposes the per-fold errors as `mse_path_`; older
# releases call the same array `cv_mse_path_`.
mse_path = getattr(lasso_cv, "mse_path_", None)
if mse_path is None:
    mse_path = lasso_cv.cv_mse_path_

m_log_alphas = -np.log10(lasso_cv.cv_alphas_)

plt.figure()
# One dotted line per fold, plus the fold average in solid black.
plt.plot(m_log_alphas, mse_path, ':')
plt.plot(m_log_alphas, mse_path.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
# Dashed vertical line marks the alpha selected by cross-validation.
plt.axvline(-np.log10(lasso_cv.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean square error')
plt.show()

In [None]:
# Display the -log10(alpha) grid that was computed for the plot.
m_log_alphas