In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Hinge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.iloc[:,1:2].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 1 columns):
target    4459 non-null float64
dtypes: float64(1)
memory usage: 34.9 KB


In [4]:
len(train)

4459

In [5]:
len(test)

49342

In [6]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [7]:
test.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X = train.iloc[:, 2:].copy()
y = train['target']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)

In [19]:
ss = StandardScaler()
ss.fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [80]:
X_ss = ss.transform(X)

## Random Forest

In [92]:
rf = RandomForestRegressor(n_estimators=200)
lasso = Lasso()
rf_params = {
    'max_depth': [5,100,None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['auto'],
    'max_leaf_nodes': [5,100,None]
}

In [71]:
rf_gs = GridSearchCV(rf, rf_params, n_jobs=-1, cv=3, verbose=1)
rf_gs.fit(X_train_ss, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed: 106.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [5, 100, None], 'min_samples_split': [2], 'min_samples_leaf': [1], 'max_features': ['auto'], 'max_leaf_nodes': [5, 100, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [22]:
rf.fit(X_train_ss, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [72]:
rf_gs.best_score_

0.22199274526159243

In [73]:
rf_gs.score(X_train_ss, y_train)

0.7120383636790844

In [74]:
rf_gs.score(X_test_ss, y_test)

0.2051973640046284

In [75]:
y_train_hat = rf_gs.predict(X_train_ss)

In [78]:
mean_squared_log_error(y_train, y_train_hat)**0.5

1.6928886704597534

In [79]:
rf_best = rf_gs.best_estimator_

In [81]:
rf_best.fit(X_ss, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=100,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [83]:
y_hat = rf_best.predict(X_ss)

In [84]:
mean_squared_log_error(y, y_hat)**0.5

1.7232895662597036

In [10]:
testa = test.iloc[:, 1:].copy()
# testa_ss = ss.transform(testa)

In [86]:
test_predict = rf_best.predict(testa_ss)

In [87]:
submission = pd.DataFrame({'ID': test['ID'], 'target': test_predict})

In [91]:
submission.to_csv('results/submission.csv', index=False)

In [89]:
submission.head()

Unnamed: 0,ID,target
0,000137c73,6281280.0
1,00021489f,5288888.0
2,0004d7953,4743197.0
3,00056a333,9710091.0
4,00056d8eb,3619312.0


In [90]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Data columns (total 2 columns):
ID        49342 non-null object
target    49342 non-null float64
dtypes: float64(1), object(1)
memory usage: 771.0+ KB


## Polynomial Features

In [None]:
rf_pl = Pipeline([('pf', PolynomialFeatures(include_bias=False)),
                  ('ss', StandardScaler()),
                  ('rfr', RandomForestRegressor(n_estimators=100))])

rf_pl_params = {
    'rfr__max_depth': [5,100,200,400,None],
    'rfr__max_features': [2,200,400,'log2','sqrt',None],
    'rfr__max_leaf_nodes': [5,100,200,400,None]
}

rf_pl_gs = GridSearchCV(rf_pl, rf_pl_params, n_jobs=2, cv=3, verbose=1)
rf_pl_gs.fit(X_train,y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
