In [10]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.formula.api as smf

from statsmodels.graphics.gofplots import qqplot
from statsmodels.graphics.tsaplots import plot_acf

from sklearn import linear_model as lm, model_selection as ms, preprocessing
from sklearn.pipeline import Pipeline

from sklearn import ensemble, model_selection as ms, tree

from sklearn.model_selection import train_test_split

In [6]:
Boston = pd.read_csv('datasets/train.csv')

In [69]:
X = Boston.drop(columns=['medv', 'ID'])
y = Boston['medv']

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [71]:
model = lm.LinearRegression()

In [72]:
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [73]:
model.predict(X_test)

array([27.09122013, 22.72624815, 19.47812946, 41.98073777, 20.20440595,
       27.11211212, 13.89686579, 10.99446962, 23.78972413, 19.50071046,
       22.51033358, 24.22490983, 33.33311934, 26.81369821, 31.66896121,
       22.80293415, 15.99956826, 20.10037311, 36.47971812, 25.34439817,
       20.8143324 , 23.28212846, 20.90526522, 17.63357727, 37.70774208,
       23.87687407,  1.76661231, 13.80594229, 24.21528222, 28.89106839,
       22.74387479, 29.72979756,  6.24458098, 10.25016401, 20.91884105,
       17.88675502, 28.55082646, 14.19848606, 24.65401157, 35.35724622,
       26.05047503,  8.78591086, 22.18479524, 15.58922643, 14.43819343,
       23.57346461, 29.46439338, 23.83206832, 19.22138153, 13.65455114,
       22.04291859, 19.95632897, 21.94706386, 13.96786157, 31.13038958,
       19.62900596, 40.85736643, 22.92562916, 23.1807064 , 23.32263841,
       17.52097536, 24.49530993, 20.5395353 , 15.16367295, 36.57852378,
       13.10630553, 28.99475204])

In [74]:
import sklearn.metrics

In [75]:
sklearn.metrics.mean_squared_error(y_test, model.predict(X_test))

17.513553107264528

In [76]:
cv = ms.KFold(n_splits=5, shuffle=True, random_state=42)

In [77]:
mses = ms.cross_val_score(lm.LinearRegression(), X_train, y_train, scoring='neg_mean_squared_error', cv=cv)
np.mean(-mses)

29.26611058338679

In [78]:
rf1 = ensemble.RandomForestRegressor(n_estimators=20, random_state=42)
rf1.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [79]:
for n_trees in [2, 5, 10, 20, 50, 100]:
    estimator = ensemble.RandomForestRegressor(n_estimators=n_trees, random_state=42)
    aucs = ms.cross_val_score(estimator, X, y, scoring='neg_mean_squared_error', cv=cv)
    print('{:>3} trees: mean mses {:.2}'.format(n_trees, np.mean(mses)))

  2 trees: mean mses -2.9e+01
  5 trees: mean mses -2.9e+01
 10 trees: mean mses -2.9e+01
 20 trees: mean mses -2.9e+01
 50 trees: mean mses -2.9e+01
100 trees: mean mses -2.9e+01


In [80]:
sklearn.metrics.mean_squared_error(y_test, rf1.predict(X_test))

8.619934701492538

In [81]:
gbt1 = ensemble.GradientBoostingRegressor(n_estimators=20, random_state=42)
gbt1.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=20, n_iter_no_change=None, presort='auto',
             random_state=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [85]:
sklearn.metrics.mean_squared_error(y_test, gbt1.predict(X_test))

10.908231434573095

In [86]:
gbt1.feature_importances_

array([0.02487661, 0.        , 0.        , 0.00472473, 0.00968452,
       0.31653344, 0.00655149, 0.10365855, 0.        , 0.00870953,
       0.00869065, 0.00161927, 0.51495123])

In [87]:
list(zip(X_test.columns, gbt1.feature_importances_))

[('crim', 0.024876608243593654),
 ('zn', 0.0),
 ('indus', 0.0),
 ('chas', 0.0047247265939148525),
 ('nox', 0.00968452152851787),
 ('rm', 0.31653344188420435),
 ('age', 0.0065514852640180055),
 ('dis', 0.10365854783367912),
 ('rad', 0.0),
 ('tax', 0.008709526636295106),
 ('ptratio', 0.008690646096897988),
 ('black', 0.0016192690383395464),
 ('lstat', 0.5149512268805395)]