<h1>03 Error Estimation for Linear Regression and 3NN</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

<h1>Introductory Remarks</h1>
<ul>
    <li>We're going to predict the strength of concrete!</li>
    <li>We have a labeled dataset, originally described in<br />
        I-Cheng Yeh, "Modeling of strength of high performance concrete using artificial neural networks," Cement and Concrete Research, Vol. 28, No. 12, pp. 1797-1808 (1998).
    </li>
    <li>Nowadays, it is available from the <a href="http://archive.ics.uci.edu/ml/index.php">UC Irvine Machine Learning Repository</a>. I have taken a copy and made it available to you as a CSV file called <code>dataset_concrete.csv</code>.
    </li>
    <li>Use error estimation to compare linear regression and 3NN.
    </li>
 </ul>

In [3]:
df = pd.read_csv('../datasets/dataset_concrete.csv')
df

Unnamed: 0,cement,slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cement            1030 non-null   float64
 1   slag              1030 non-null   float64
 2   fly_ash           1030 non-null   float64
 3   water             1030 non-null   float64
 4   superplasticizer  1030 non-null   float64
 5   coarse_aggregate  1030 non-null   float64
 6   fine_aggregate    1030 non-null   float64
 7   age               1030 non-null   int64  
 8   strength          1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [5]:
features = list(df.columns.values)[:-1]
features

['cement',
 'slag',
 'fly_ash',
 'water',
 'superplasticizer',
 'coarse_aggregate',
 'fine_aggregate',
 'age']

In [6]:
dev_df, test_df = train_test_split(df, train_size=0.8, random_state=2)

In [7]:
dev_X = dev_df[features]
test_X = test_df[features]

dev_y = dev_df["strength"].values
test_y = test_df["strength"].values

In [8]:
preprocessor = ColumnTransformer([("scaler", StandardScaler(), features)], remainder="passthrough")
preprocessor

In [9]:
knn_model = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", KNeighborsRegressor())
])
knn_model

In [10]:
param_grid = {"predictor__n_neighbors": [x for x in range(1, 11)]}
param_grid

{'predictor__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

In [11]:
ss = ShuffleSplit(n_splits=1, train_size=0.75, random_state=2)
ss

ShuffleSplit(n_splits=1, random_state=2, test_size=None, train_size=0.75)

In [12]:
gs = GridSearchCV(knn_model, param_grid, scoring="neg_mean_absolute_error", cv=ss, error_score='raise', refit=True)
gs

In [13]:
gs.fit(dev_X, dev_y)

In [14]:
gs.best_params_, gs.best_score_

({'predictor__n_neighbors': 2}, -6.72631067961165)

In [None]:
knn_model = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", KNeighborsRegressor(n_neighbors=12))
])
knn_model

In [15]:
mean_absolute_error(test_y, gs.predict(test_X))

NameError: name 'mean_absolute_error' is not defined