### Practice

#### Q1

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import explained_variance_score

In [2]:
# Explained variance of the predictions [8, 7, 12, 5] against the targets [7, 4, 9, 4].
explained_variance_score(y_true=[7, 4, 9, 4], y_pred=[8, 7, 12, 5])

0.7777777777777778

#### Q2

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing data as pandas objects and hold out 30% for testing.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn the scaling statistics from the training split only, then apply
# the same fitted scaler to both splits.
ss = StandardScaler(with_mean=True, with_std=True).fit(X_train)
X_train_norm = ss.transform(X_train)
X_test_norm = ss.transform(X_test)

In [4]:
from sklearn.linear_model import SGDRegressor

# Squared-error SGD regressor with an L1 penalty, seeded for repeatable results,
# trained on the standardized training features.
sgd = SGDRegressor(
    loss='squared_error',
    penalty='l1',
    random_state=1,
).fit(X_train_norm, y_train)

# Score the fitted model on the standardized test features.
sgd.score(X_test_norm, y_test)

0.5996851134711898

#### Alternatively, using a pipeline

In [5]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline

X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Scaling is a pipeline step, so fitting the pipeline on the training split
# fits the scaler on that split and reuses it when scoring the test split.
sgd_grid_pipeline = Pipeline(steps=[
    ('feature_scaling', StandardScaler(with_mean=True, with_std=True)),
    ('sgd', SGDRegressor(loss='squared_error', penalty='l1', random_state=1)),
])

sgd_grid_pipeline.fit(X_train, y_train)
sgd_grid_pipeline.score(X_test, y_test)

0.5996851134711898

### Graded

#### Q1

In [6]:
from sklearn.model_selection import KFold, RepeatedKFold
# Four samples, split into 2 folds, with the whole 2-fold split repeated twice.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
data = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)

# Each iteration yields a (train indices, test indices) pair; print it as a tuple.
for train_idx, test_idx in data.split(X):
    print((train_idx, test_idx))

(array([0, 1]), array([2, 3]))
(array([2, 3]), array([0, 1]))
(array([1, 3]), array([0, 2]))
(array([0, 2]), array([1, 3]))


#### Q2

In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reload the California housing data and hold out 30% of it for testing.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [8]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [9]:
# Standardize the features, then fit a seeded SGD regressor.
sgd_grid_pipeline = Pipeline(steps=[
    ('feature_scaling', StandardScaler(with_mean=True, with_std=True)),
    ('sgd', SGDRegressor(random_state=1)),
])

# Grid keys use the `<step name>__<parameter>` form to address the 'sgd' step.
param_grid = {
    'sgd__loss': ['squared_error', 'huber'],
    'sgd__penalty': ['l1', 'l2'],
    'sgd__alpha': [0.1, 0.01, 0.001],
    'sgd__max_iter': [1000, 2000, 5000],
}

# Display the parameter names the pipeline exposes for tuning.
sgd_grid_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'feature_scaling', 'sgd', 'feature_scaling__copy', 'feature_scaling__with_mean', 'feature_scaling__with_std', 'sgd__alpha', 'sgd__average', 'sgd__early_stopping', 'sgd__epsilon', 'sgd__eta0', 'sgd__fit_intercept', 'sgd__l1_ratio', 'sgd__learning_rate', 'sgd__loss', 'sgd__max_iter', 'sgd__n_iter_no_change', 'sgd__penalty', 'sgd__power_t', 'sgd__random_state', 'sgd__shuffle', 'sgd__tol', 'sgd__validation_fraction', 'sgd__verbose', 'sgd__warm_start'])

In [10]:
# `scoring` is intentionally not passed here.
sgd_grid_search = GridSearchCV(
    estimator=sgd_grid_pipeline,
    param_grid=param_grid,
    cv=4,
    return_train_score=True,
)

In [11]:
# Run the grid search over the SGD pipeline using the training split only.
sgd_grid_search.fit(X_train, y_train)
# Position of the best parameter combination within the search results.
sgd_grid_search.best_index_

12

In [12]:
# Score the fitted search object on the held-out test split
# (no `scoring` was passed, so the estimator's default score is reported).
sgd_grid_search.score(X_test, y_test)

0.5951040704728553

In [13]:
# Mean cross-validated score of the best parameter combination, i.e. the
# `mean_test_score` entry of `cv_results_` at `best_index_`.
sgd_grid_search.best_score_

0.5925872328967452

In [14]:
# Report the parameter combination selected by the SGD grid search.
print(f"Best value of hyper-parameters is {sgd_grid_search.best_params_}")

Best value of hyper-parameters is {'sgd__alpha': 0.01, 'sgd__loss': 'squared_error', 'sgd__max_iter': 1000, 'sgd__penalty': 'l1'}


#### Q3

In [15]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reload the California housing data and hold out 30% of it for testing.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [16]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [17]:
# Standardize the features, then fit a ridge regressor.
ridge_grid_pipeline = Pipeline(steps=[
    ('feature_scaling', StandardScaler(with_mean=True, with_std=True)),
    ('ridge', Ridge()),
])

# Grid keys use the `<step name>__<parameter>` form to address the 'ridge' step.
param_grid = {
    'ridge__alpha': [0.5, 0.1, 0.05, 0.01, 0.005, 0.001],
    'ridge__fit_intercept': [True, False],
}

In [18]:
# 4-fold grid search over the ridge pipeline, keeping training scores as well.
ridge_grid_search = GridSearchCV(
    estimator=ridge_grid_pipeline,
    param_grid=param_grid,
    cv=4,
    return_train_score=True,
)

In [19]:
# Run the grid search over the ridge pipeline using the training split only.
ridge_grid_search.fit(X_train, y_train)
# Position of the best parameter combination within the search results.
ridge_grid_search.best_index_

0

In [20]:
# Score the fitted search object on the held-out test split
# (no `scoring` was passed, so the estimator's default score is reported).
ridge_grid_search.score(X_test, y_test)

0.597145061224877

In [21]:
# Mean cross-validated score of the best parameter combination, i.e. the
# `mean_test_score` entry of `cv_results_` at `best_index_`.
ridge_grid_search.best_score_

0.6047106636809509

In [22]:
# Report the parameter combination selected by the ridge grid search.
print(f"Best value of hyper-parameters is {ridge_grid_search.best_params_}")

Best value of hyper-parameters is {'ridge__alpha': 0.5, 'ridge__fit_intercept': True}


#### Q4

In [23]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reload the California housing data; this question holds out 40% for testing.
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

In [24]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [25]:
# Standardize the features, then fit a lasso regressor.
lasso_grid_pipeline = Pipeline(steps=[
    ('feature_scaling', StandardScaler(with_mean=True, with_std=True)),
    ('lasso', Lasso()),
])

# Grid keys use the `<step name>__<parameter>` form to address the 'lasso' step.
param_grid = {
    'lasso__alpha': [0.5, 0.1, 0.05, 0.01, 0.005, 0.001],
    'lasso__fit_intercept': [True, False],
}

In [26]:
# 6-fold grid search over the lasso pipeline, keeping training scores as well.
lasso_grid_search = GridSearchCV(
    estimator=lasso_grid_pipeline,
    param_grid=param_grid,
    cv=6,
    return_train_score=True,
)

In [27]:
# Run the grid search over the lasso pipeline using the training split only.
lasso_grid_search.fit(X_train, y_train)
# Position of the best parameter combination within the search results.
lasso_grid_search.best_index_

10

In [28]:
# Score the fitted search object on the held-out test split
# (no `scoring` was passed, so the estimator's default score is reported).
lasso_grid_search.score(X_test, y_test)

0.6065831805608592

In [29]:
# Mean cross-validated score of the best parameter combination, i.e. the
# `mean_test_score` entry of `cv_results_` at `best_index_`.
lasso_grid_search.best_score_

0.5997671812675263

In [30]:
# Report the parameter combination selected by the lasso grid search.
print(f"Best value of hyper-parameters is {lasso_grid_search.best_params_}")

Best value of hyper-parameters is {'lasso__alpha': 0.001, 'lasso__fit_intercept': True}
