In [5]:
import numpy as np
import pandas as pd
import math
from sklearn.datasets import load_boston

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv(r'C:\Users\student\Desktop\forestfires.csv')

In [9]:
def func(x):
    if x != 0:
        return math.log(x)
    else:
        return x

df['area'] = df['area'].apply(func)

In [12]:
df['area'].max()

6.9947033206327935

In [14]:
y = df['area']

In [15]:
df = df.drop(['area'], axis=1)

In [23]:
df = pd.get_dummies(df, ['month', 'day'])

In [24]:
df.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_nov,month_oct,month_sep,day_fri,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,1,0,0,0,0,0,0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,1,0,0,0,0,0,0,1,0
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,1,0,0,0,1,0,0,0,0
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,1,0,0,0,0,0,0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,0,0,1,0,0,0


In [25]:
df.columns

Index(['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain',
       'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'day_fri', 'day_mon', 'day_sat', 'day_sun',
       'day_thu', 'day_tue', 'day_wed'],
      dtype='object')

In [26]:
x_train, x_test, y_train, y_test = train_test_split(df, y)

In [27]:
x_train_a, x_train_b, y_train_a, y_train_b = train_test_split(x_train, y_train)

In [28]:
gs_ridge = RidgeCV(alphas=np.logspace(-2, 2))
gs_ridge.fit(x_train_a, y_train_a)
meta1 = gs_ridge.predict(x_train_b)

In [29]:
gs_knn1 = GridSearchCV(KNeighborsRegressor(), {'n_neighbors':np.arange(1, 20)})
gs_knn1.fit(x_train_a, y_train_a)
meta2 = gs_knn1.predict(x_train_b)
print(gs_knn1.best_params_)

{'n_neighbors': 17}


In [30]:
gs_knn2 = GridSearchCV(KNeighborsRegressor(), {'n_neighbors':np.arange(21, 100)})
gs_knn2.fit(x_train_a, y_train_a)
meta3 = gs_knn1.predict(x_train_b)
print(gs_knn2.best_params_)

{'n_neighbors': 99}


In [31]:
gs_rf1 = GridSearchCV(RandomForestRegressor(n_estimators=30, n_jobs=-1), 
                      {'max_depth':np.arange(2, 15)})
gs_rf1.fit(x_train_a, y_train_a)
meta4 = gs_rf1.predict(x_train_b)
print(gs_rf1.best_params_)

{'max_depth': 2}


In [32]:
gs_rf2 = GridSearchCV(RandomForestRegressor(n_estimators=100, n_jobs=-1), 
                      {'max_depth':np.arange(2, 15)})
gs_rf2.fit(x_train_a, y_train_a)
meta5 = gs_rf2.predict(x_train_b)
print(gs_rf2.best_params_)

{'max_depth': 2}


In [33]:
from sklearn.metrics import mean_absolute_error

In [34]:
print('ridge', mean_absolute_error(y_test, gs_ridge.predict(x_test)))
print('knn1', mean_absolute_error(y_test, gs_knn1.predict(x_test)))
print('knn2', mean_absolute_error(y_test, gs_knn2.predict(x_test)))
print('rf1', mean_absolute_error(y_test, gs_rf1.predict(x_test)))
print('rf2', mean_absolute_error(y_test, gs_rf2.predict(x_test)))

ridge 1.14746746661
knn1 1.13375097921
knn2 1.16967139961
rf1 1.15935985535
rf2 1.15328181316


In [35]:
new_x_train = np.vstack((meta1, meta2, meta3, meta4, meta5)).T

In [36]:
meta_ridge = RidgeCV(np.logspace(-2, 2))
meta_ridge.fit(new_x_train, y_train_b)

new_x_test = np.vstack((
            gs_ridge.predict(x_test),
            gs_knn1.predict(x_test),
            gs_knn2.predict(x_test),
            gs_rf1.predict(x_test),
            gs_rf2.predict(x_test))).T
y_pred = meta_ridge.predict(new_x_test)
mean_absolute_error(y_test, y_pred)

In [37]:
mean_absolute_error(y_test, y_pred)

1.1719305351681257

In [45]:
gs_nrf = GridSearchCV(RandomForestRegressor(n_estimators=30), {'max_depth':np.arange(2, 15)})
gs_nrf.fit(new_x_train, y_train_b)
#y_pred = meta_rf.predict(new_x_test)
#mean_absolute_error(y_test, y_pred)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [46]:
mean_absolute_error(y_test, gs_nrf.predict(new_x_test))

1.2293261301126204

In [48]:
nknn = KNeighborsRegressor(n_neighbors=17)
nknn.fit(new_x_train, y_train_b)
mean_absolute_error(y_test, nknn.predict(new_x_test))

1.1853325194422724