# Workshop 3

Starter code for workshop 3. You should have seen most of it before, but make sure you understand what it is doing!

In [7]:
# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Larger fonts for axis labels and tick labels on all figures
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [8]:
# Load the housing dataset from the CSV file in the working directory
import pandas as pd

# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# looks like a row index that was saved into the CSV. Only median_house_value
# is dropped later, so this column is passed to every model as a feature.
# Consider pd.read_csv("workshop3.csv", index_col=0) and re-running the
# notebook -- confirm against the CSV first.
housing = pd.read_csv("workshop3.csv")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200


Split the available data 80/20 for training and testing. Don't use the test data until the very end!

In [9]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=20,
)

`median_house_value` is the value we want to predict, so separate it from the other features.

In [10]:
# Separate the training set into the target to predict (y) and the model inputs (X)
training_labels = train_set["median_house_value"].copy()  # y
training_features = train_set.drop(columns=["median_house_value"])  # X

In [11]:
# Fit a linear regression model on the training data
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training can be chained
lr = LinearRegression().fit(training_features, training_labels)
# Predictions on the same rows the model was trained on
prediction_lr = lr.predict(training_features)

In [12]:
# Evaluate the linear regression predictions on the training data
from sklearn.metrics import mean_squared_error

# Mean squared error (in squared label units)
mse = mean_squared_error(training_labels, prediction_lr)  # MSE
# Root mean squared error: take the square root of the MSE directly instead of
# passing squared=False, which newer scikit-learn releases deprecate and remove.
rmse = np.sqrt(mse)  # RMSE
print(mse)
print(rmse)

4883215704.094342
69880.00933095488


In [13]:
# Baseline predictor: always predict the mean of the training labels (median_house_value)
baseline = np.full(shape=training_labels.shape, fill_value=training_labels.mean())

In [14]:
# Display the baseline predictions (every entry is the training-label mean)
baseline

array([207484.99204698, 207484.99204698, 207484.99204698, ...,
       207484.99204698, 207484.99204698, 207484.99204698])

In [15]:
# RMSE of the mean-value baseline on the training labels.
# np.sqrt of the MSE is used instead of squared=False, which newer
# scikit-learn releases deprecate and remove.
rmse_bl = np.sqrt(mean_squared_error(training_labels, baseline))  # RMSE
print(rmse_bl)

115666.22344961535


In [16]:
# Try a different model: k-nearest-neighbours regression
from sklearn.neighbors import KNeighborsRegressor

# n_neighbors is the k value; here each prediction uses only the single closest point
knn = KNeighborsRegressor(n_neighbors=1)

# Train on the training data, then predict on those same rows
# (fit() returns the fitted estimator, so the two calls can be chained)
pred_knn = knn.fit(training_features, training_labels).predict(training_features)

In [17]:
# RMSE of the KNN predictions on the training data.
# np.sqrt of the MSE is used instead of squared=False, which newer
# scikit-learn releases deprecate and remove.
rmse_knn = np.sqrt(mean_squared_error(training_labels, pred_knn))  # RMSE
print(rmse_knn)

0.0


In [18]:
# Next, evaluate the model with cross-validation
from sklearn.model_selection import cross_val_score

# The model to evaluate
knn = KNeighborsRegressor(n_neighbors=5)
# cv is the number of folds; each fold is scored with the negated RMSE
cv_scores = cross_val_score(
    knn,
    training_features,
    training_labels,
    cv=5,
    scoring='neg_root_mean_squared_error',
)

In [19]:
# One negated-RMSE score per fold
cv_scores

array([ -98917.06418581,  -99422.00966907, -100110.50308413,
       -100079.4708306 , -100161.97417681])

In [20]:
# The scores are negated RMSE values (note the "neg_" scoring name), so negate
# the mean across folds to report a positive RMSE.
print('this is my rmse for cv: ', -cv_scores.mean())

this is my rmse for cv:  99738.2043892845


In [21]:
# A similar function, cross_validate, can be used as well
from sklearn.model_selection import cross_validate

# The model to evaluate
knn = KNeighborsRegressor(n_neighbors=5)
# Same folds and scoring as before; the result is kept for inspection
cv_results = cross_validate(
    knn,
    training_features,
    training_labels,
    cv=5,
    scoring='neg_root_mean_squared_error',
)

In [22]:
# cross_validate returns a dict with per-fold fit_time, score_time and test_score
cv_results

{'fit_time': array([0.03590274, 0.02789354, 0.03092718, 0.02792573, 0.0309267 ]),
 'score_time': array([0.03494239, 0.02496409, 0.02591968, 0.02592897, 0.02588415]),
 'test_score': array([ -98917.06418581,  -99422.00966907, -100110.50308413,
        -100079.4708306 , -100161.97417681])}

In [23]:
# Negate the mean of the per-fold test scores to get the cross-validated RMSE
-cv_results['test_score'].mean()

99738.2043892845

In [24]:
# Next, use GridSearchCV to find the best parameters for our model
from sklearn.model_selection import GridSearchCV

# The estimator to tune: a KNN regressor with default settings
knn = KNeighborsRegressor()
# get_params() lists the parameters of the model that can be changed
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [25]:
# Candidate values to try for each parameter; every combination is evaluated
parameters = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
}

# Build the grid search around the KNN estimator, scored with the negated RMSE
clf = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring='neg_root_mean_squared_error',
)

# Run the search. This can take a long time when there are many candidate values
clf.fit(training_features, training_labels)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [3, 5, 7, 9, 11],
                         'weights': ['uniform', 'distance']},
             scoring='neg_root_mean_squared_error')

In [67]:
# The "best_params_" attribute of the fitted GridSearchCV object holds the
# best-scoring combination of parameters
clf.best_params_

{'n_neighbors': 11, 'weights': 'distance'}

In [68]:
# best_score_ is the cross-validated score of the best parameter combination
# (GridSearchCV uses cross validation internally). The scoring is negated RMSE,
# so negate it to read it as a positive RMSE.
-clf.best_score_

96473.95790063369

In [69]:
# How can you use the best parameters in a model? Option 1: call predict on the
# fitted search object, which uses the estimator GridSearchCV refit with
# best_params_ (its default refit=True behaviour).
clf.predict(training_features)

array([216600., 187700., 122500., ...,  67500., 150000., 239100.])

In [70]:
# Option 2: create a new instance of the model
knn = KNeighborsRegressor()
# set_params applies the best parameters found by the grid search to this model
knn.set_params(**clf.best_params_)

KNeighborsRegressor(n_neighbors=11, weights='distance')

In [71]:
# Train the tuned model on all of the training data
knn.fit(training_features, training_labels)

# Separate the held-out test set into its target (y) and its model inputs (X)
testing_labels = test_set["median_house_value"].copy()  # y
testing_features = test_set.drop(columns=["median_house_value"])  # X

# Predict on the test set
prediction_test = knn.predict(testing_features)

### **Now, you can implement this inside a Pipeline**

In [72]:
# Building a Pipeline that chains preprocessing and the model
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Steps run in order: median imputation, standard scaling, then the KNN model
first_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
    ('model', KNeighborsRegressor()),
])

# Apply the grid-search parameters to the step named 'model'.
# NOTE(review): these parameters were selected on features that had not been
# imputed or scaled; consider running the grid search over the pipeline itself.
first_pipeline['model'].set_params(**clf.best_params_)

KNeighborsRegressor(n_neighbors=11, weights='distance')

In [73]:
# Display the pipeline to confirm that its model step now carries the selected parameters
first_pipeline

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('std_scaler', StandardScaler()),
                ('model',
                 KNeighborsRegressor(n_neighbors=11, weights='distance'))])

In [74]:
# You can keep going with the next steps