In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [10]:
train_price = pd.read_csv('train_price.csv')
train_price.head()

Unnamed: 0,item_condition_id,price,shipping,brand_mean_price,brand_std_dev,branded,upper_price_limit,lower_price_limit
0,3,10.0,1,21.133453,27.36126,0,48.494714,6.227807
1,3,52.0,0,45.021277,26.420902,1,71.442179,18.600375
2,1,10.0,1,15.047109,10.483385,1,25.530495,4.563724
3,1,35.0,1,21.133453,27.36126,0,48.494714,6.227807
4,1,44.0,0,21.133453,27.36126,0,48.494714,6.227807


In [62]:
test_price = pd.read_csv('test_price.csv')
test_price.head()

Unnamed: 0,item_condition_id,shipping,brand_mean_price,brand_std_dev,branded,upper_price_limit,lower_price_limit
0,1,1,21.133453,27.36126,0,48.494714,6.227807
1,1,1,21.133453,27.36126,0,48.494714,6.227807
2,1,1,41.14255,36.52768,1,77.67023,4.61487
3,2,0,21.133453,27.36126,0,48.494714,6.227807
4,3,1,21.133453,27.36126,0,48.494714,6.227807


In [12]:
train_price.shape

(1482535, 8)

In [13]:
test_price.shape

(693359, 7)

In [18]:
X_cols = train_price.columns.tolist()
X_cols.remove('price')
X = train_price[X_cols]
y = train_price['price']

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, shuffle=True)

In [26]:
def rmsle_func(y_pred,y_true):
    return np.sqrt(((np.log(y_pred + 1) - np.log(y_true + 1))**2).mean())
rmsle_score = make_scorer(rmsle_func, greater_is_better=False)

In [27]:
rf_grid = {
        'n_estimators': [20,25,50,75,100],
        'min_samples_leaf': [50,75,100,125,150],
        'n_jobs': [-1],
        'verbose': [1]
    }
rfr = RandomForestRegressor()

In [28]:
rfr_gs = GridSearchCV(rfr, param_grid = rf_grid, n_jobs=-1, cv=3,  scoring = rmsle_score, verbose=1)

In [29]:
rfr_gs.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    2.3s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    3.0s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    3.0s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    5.1s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    5.1s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    3.4s finished
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    6.8s finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.4min finished
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.4min finished
[Parallel(n_jo

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    9.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.8s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:   20.5s finished
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    2.4s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.7min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   12.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   24.0s finished
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.0min finished

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   10.4s finished
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    5.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   11.2s finished
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  3.9min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    8.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:   17.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.2min
[Parallel(n_jobs=

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [20, 25, 50, 75, 100], 'min_samples_leaf': [50, 75, 100, 125, 150], 'n_jobs': [-1], 'verbose': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(rmsle_func, greater_is_better=False), verbose=1)

In [33]:
gs = [x[1] for x in rfr_gs.grid_scores_]
sorted(gs,reverse=True)[:5]



[-0.67632298586381523,
 -0.67634900027023492,
 -0.67635886127705314,
 -0.67637893839219154,
 -0.67643318147847342]

In [34]:
gs_model = list()
for x  in rfr_gs.grid_scores_:
    if x[1] > -0.6764:
        gs_model.append(x)
gs_model



[mean: -0.67632, std: 0.00022, params: {'min_samples_leaf': 50, 'n_estimators': 50, 'n_jobs': -1, 'verbose': 1},
 mean: -0.67635, std: 0.00016, params: {'min_samples_leaf': 50, 'n_estimators': 75, 'n_jobs': -1, 'verbose': 1},
 mean: -0.67638, std: 0.00020, params: {'min_samples_leaf': 50, 'n_estimators': 100, 'n_jobs': -1, 'verbose': 1},
 mean: -0.67636, std: 0.00020, params: {'min_samples_leaf': 75, 'n_estimators': 25, 'n_jobs': -1, 'verbose': 1}]

In [38]:
rfr_t = RandomForestRegressor(min_samples_leaf=50, n_estimators=50, n_jobs=-1)
rfr_t.fit(X_train,y_train)
predict = rfr_t.predict(X_val)
print(rmsle_func(predict,y_val))

0.676612128055


In [58]:
rfr_s = RandomForestRegressor(min_samples_leaf=50, n_estimators=50, n_jobs=-1)
rfr_s.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [59]:
sub_file = pd.read_csv('sample_submission.csv')

In [63]:
sub_file['price'] = rfr_s.predict(test_price)

In [64]:
sub_file.head()

Unnamed: 0,test_id,price
0,0,17.572564
1,1,17.572564
2,2,58.620546
3,3,23.81631
4,4,18.725606
