## Basic exploration

### Import modules

In [None]:
import os, sys, time, random, math
import tarfile, zipfile  # Work with compressed files

import numpy as np     # Linear algebra
import pandas as pd    # Data processing

from IPython.display import display, Image  # Nice print statements
from ggplot import *     # yhat/ggplot for plots

from subprocess import check_output
# Show which files are available in the shared input directory.
input_listing = check_output(["ls", "../../input"])
print(input_listing.decode("utf8"))

### Load data

In [None]:
# Read the training and test CSVs (in that order) into DataFrames.
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in ("../../input/train.csv", "../../input/test.csv")
]

### Examine shape of datasets

In [None]:
# For the training set and then the test set: print the row/column counts,
# the per-column dtype/non-null summary, and the first five rows.
for frame in (train_data, test_data):
    n_samples, n_features = frame.shape
    print("Dataset has {} samples with {} features each.".format(n_samples, n_features))
    frame.info()
    display(frame.head(5))

### Examine distributions of datasets

In [None]:
# Split the training columns into groups by name:
#   features      - everything except the "id" and "loss" columns
#   cat_features  - columns whose name contains "cat"
#   cont_features - columns whose name contains "cont"
cols = train_data.columns
features = [name for name in cols if name not in ("id", "loss")]
cat_features = [name for name in cols if "cat" in name]
cont_features = [name for name in cols if "cont" in name]

summary_template = 'Total {} features. {} category features, {} continuous features'
print(summary_template.format(len(features), len(cat_features), len(cont_features)))

# Summary statistics of the target column.
display(train_data['loss'].describe())

print("Mean/Variance values for cont_features:")
# Column 0 is the per-feature mean, column 1 the per-feature variance.
# Left as the cell's last expression so the notebook renders it as a table.
cont_frame = train_data[cont_features]
pd.concat([cont_frame.mean(), cont_frame.var()], axis=1)

### Plot loss distributions

In [None]:
# Density plot of the `loss` column of the training data.
loss_density_plot = ggplot(aes(x='loss'), train_data) + geom_density()
loss_density_plot

### Check if category values of test_data appear in train_data

In [None]:
# For every categorical column, print the values that occur in the test set
# but never in the training set. Columns with no such values print nothing.
#
# The loop runs over `cat_features` and compares against `train_data`, the
# names defined by the earlier cells; the previous `cats` / `data` names are
# not defined anywhere in this notebook and raised NameError on a fresh kernel.
for c in cat_features:
    vals_d = train_data[c].unique()   # values seen in training
    vals_t = test_data[c].unique()    # values seen in test
    missing_vals = [v for v in vals_t if v not in vals_d]
    if missing_vals:
        print(c, missing_vals)
            

### Combine categories
Combine categories from test and train data

In [None]:
# Row counts, used below to split the stacked frame back into train and test.
ntrain, ntest = len(train_data), len(test_data)

# Stack train on top of test (feature columns only) so each categorical
# column is integer-coded over the values of both sets together.
train_test = pd.concat((train_data[features], test_data[features])).reset_index(drop=True)
for cat_col in cat_features:
    train_test[cat_col] = train_test[cat_col].astype('category').cat.codes

# The first `ntrain` rows are the training rows; the remainder are the test rows.
X = train_test.iloc[:ntrain, :]
X_test = train_test.iloc[ntrain:, :]
# The model is trained on the natural log of the loss.
y = np.log(train_data['loss'])

### Let's start with some scikit-learn!
How about a fast `RandomForestRegressor`?

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A deliberately small forest (6 trees, 3 parallel jobs) as a quick baseline.
#
# `criterion` is left at its default, which is mean squared error in every
# scikit-learn release. The explicit 'mse' spelling was deprecated in 1.0 and
# removed in 1.2, where passing it raises an error.
#
# `random_state` fixes the forest's random sampling so that a fresh
# Restart & Run All reproduces the same predictions.
rfr = RandomForestRegressor(
    n_jobs=3,
    n_estimators=6,
    min_samples_split=10,
    random_state=0,
)
rfr.fit(X, y)

# `y` is log(loss), so these predictions are on the log scale as well.
y_test = rfr.predict(X_test)

Quickly look at the outputs.

In [None]:
# Raw model outputs (log scale), followed by the mean prediction after
# mapping back through exp().
display(y_test)
print(np.mean(np.exp(y_test)))

### Save to a csv file.


In [None]:
# Build the submission from the test ids themselves instead of reading a
# separate sample-submission file. `X_test` keeps the row order of
# `test_data`, so each prediction is paired with the id of the row it was
# computed from. This removes the dependency on '../../sample_submission.csv',
# which sat outside the '../../input' directory used for the other data files
# and whose row order was never checked against test.csv.
# NOTE(review): assumes the expected submission columns are `id` and `loss` -
# confirm against the competition's sample file.
submission = pd.DataFrame(
    {
        'id': test_data['id'].values,
        # Predictions are log(loss); exponentiate to get back to loss.
        'loss': np.exp(y_test),
    },
    columns=['id', 'loss'],
)
submission.to_csv('submission_rfr_mse_estimators_6_min_sample_split_10.csv', index=False)