In [99]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

In [100]:
# Read the Boston housing CSV and separate the target from the predictors.
data = pd.read_csv('BostonHousing.csv')

# 'MEDV' is the column being predicted; every other column is kept as a feature.
prices = data['MEDV']
features = data.drop(columns='MEDV')

In [101]:
# Report the dataset dimensions: rows are data points, columns are variables.
n_points, n_variables = data.shape
print("Boston housing dataset has {} data points with {} variables each.".format(n_points, n_variables))

Boston housing dataset has 489 data points with 4 variables each.


In [102]:
# Quick look at the smallest value in `prices`; as the cell's last expression it is
# rendered as the cell output. The same value is stored as `minPrice` in the next cell.
np.amin(prices)

105000.0

In [103]:
# Descriptive statistics of the target prices. Each value keeps its own
# module-level name so later cells can still refer to it.
minPrice = np.amin(prices)
maxPrice = np.amax(prices)
meanPrice = np.mean(prices)
medianPrice = np.median(prices)
stdPrice = np.std(prices)  # called with NumPy's default arguments

# Print the summary: one (template, value) pair per output line.
# NOTE(review): the "Median price" label has no colon, unlike its siblings; it is
# left untouched here so the printed text stays exactly as before.
print("Statistics for Boston housing dataset:\n")
for template, value in (
    ("Minimum price: ${}", minPrice),
    ("Maximum price: ${}", maxPrice),
    ("Mean price: ${}", meanPrice),
    ("Median price ${}", medianPrice),
    ("Standard deviation of prices: ${}", stdPrice),
):
    print(template.format(value))

Statistics for Boston housing dataset:

Minimum price: $105000.0
Maximum price: $1024800.0
Mean price: $454342.9447852761
Median price $438900.0
Standard deviation of prices: $165171.13154429474


In [104]:
#Import 'r2_score'
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Score predictions against the true values using ``r2_score``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_predict : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    The value returned by ``r2_score(y_true, y_predict)``.
    """
    return r2_score(y_true, y_predict)

In [105]:
#Import 'train_test_split'
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    prices,
    test_size=0.2,
    random_state=42,
)

# Printed unconditionally once the split call has returned without raising.
print("Training and testing split was successful.")

Training and testing split was successful.


In [106]:
#Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def fit_model(X, y, random_state=0):
    """Grid-search the depth of a decision tree regressor and return the best tree.

    Candidate ``max_depth`` values 1 through 10 are compared with
    ``performance_metric`` as the score, over ten shuffled 80/20 splits of the
    data passed in.

    Parameters
    ----------
    X : array-like
        Training features, handed to ``GridSearchCV.fit`` unchanged.
    y : array-like
        Training targets aligned with ``X``.
    random_state : int, optional
        Seed shared by the cross-validation splitter and the tree. Defaults to
        0, the seed the splitter was already using, so existing two-argument
        calls keep the same folds.

    Returns
    -------
    DecisionTreeRegressor
        The ``best_estimator_`` of the fitted grid search.
    """
    # Ten random splits, each holding out 20% of the rows for validation.
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=random_state)

    # Seed the tree as well: an unseeded tree may break ties between equally good
    # splits differently on every run, which makes the selected depth (and the
    # predictions of the returned model) irreproducible.
    regressor = DecisionTreeRegressor(random_state=random_state)

    # Depths 1..10 inclusive.
    params = {'max_depth': list(range(1, 11))}

    # Wrap the notebook's own metric so the grid search can use it as a scorer.
    scoring_fnc = make_scorer(performance_metric)

    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    # fit() returns the search object itself, so no rebinding is needed.
    grid.fit(X, y)

    return grid.best_estimator_

In [107]:
# Run the grid search on the training split; `reg` is the estimator it returns.
reg = fit_model(X_train, y_train)

# Report the tree depth the search settled on.
optimal_depth = reg.get_params()['max_depth']
print("Parameter 'max_depth' is {} for the optimal model.".format(optimal_depth))

Parameter 'max_depth' is 4 for the optimal model.


In [108]:
#Create a matrix for client data
# NOTE(review): the three values per client are assumed to be in the same order
# as the training feature columns — confirm against the CSV header.
client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# The model was fitted on a DataFrame, so give the client rows the same column
# labels; predicting on a bare list makes recent scikit-learn versions warn that
# X does not have valid feature names. Values and their order are unchanged.
client_frame = pd.DataFrame(client_data, columns=X_train.columns)

#Show predictions
for i, price in enumerate(reg.predict(client_frame)):
    print("Predicted price for Client {}'s house: ${:,.2f}".format(i+1, price))

Predicted price for Client 1's house: $403,025.00
Predicted price for Client 2's house: $237,478.72
Predicted price for Client 3's house: $931,636.36




In [117]:
from sklearn.metrics import mean_squared_error

# Toy example: mean squared error between four true values and their predictions.
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]

# Score first, then round to two decimals purely for display.
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
mse_formatted = round(mse, 2)

print("Mean Squared Error (MSE):", mse_formatted)

Mean Squared Error (MSE): 0.38


In [118]:
from sklearn.metrics import mean_absolute_error

# Toy example: mean absolute error on five value pairs.
# Note that this rebinds the module-level `y_true` / `y_pred` names.
y_true = [1.2, 2.3, 3.4, 4.5, 5.6]
y_pred = [1.5, 2.6, 3.7, 4.8, 5.9]

# Score first, then round to two decimals purely for display.
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
mae_formatted = round(mae, 2)

print(f"Mean Absolute Error: {mae_formatted}")

Mean Absolute Error: 0.3
