In [None]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit

In [None]:
#Import supplementary visuals
import visuals as vs

In [None]:
%matplotlib inline

In [None]:
#Read the Boston housing CSV, then split it into the target column ('MEDV') and everything else
data = pd.read_csv('BostonHousing.csv')
features = data.drop(columns='MEDV')
prices = data['MEDV']

In [None]:
#Report the dataset's dimensions. The placeholders must be auto-numbered ('{}'):
#an explicit index such as '{489}' looks up positional argument 489 and raises IndexError,
#because data.shape only supplies two values (rows, columns).
print("Boston housing dataset has {} data points with {} variables each.".format(*data.shape))

In [None]:
#Quick look at the smallest value in 'prices' (shown as the cell's output)
np.min(prices)

In [None]:
#Descriptive statistics of the target column 'prices'.
#Each value is computed with the corresponding numpy reduction using its default arguments.

#Minimum price
minPrice = np.amin(prices)

#Maximum price
maxPrice = np.amax(prices)

#Mean price
meanPrice = np.mean(prices)

#Median price
medianPrice = np.median(prices)

#Standard deviation of prices
stdPrice = np.std(prices)

#Print Stats
#Values are rendered as currency (thousands separator, two decimals), the same
#format the prediction cell uses, and every label ends with a colon.
print("Statistics for Boston housing dataset:\n")
print("Minimum price: ${:,.2f}".format(minPrice))
print("Maximum price: ${:,.2f}".format(maxPrice))
print("Mean price: ${:,.2f}".format(meanPrice))
print("Median price: ${:,.2f}".format(medianPrice))
print("Standard deviation of prices: ${:,.2f}".format(stdPrice))

In [None]:
#Import 'r2_score'
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Score predictions against true values with 'r2_score'.

    Both arguments are forwarded unchanged to r2_score(y_true, y_predict)
    and its result is returned as-is.
    """
    return r2_score(y_true, y_predict)

In [None]:
#Import 'train_test_split'
from sklearn.model_selection import train_test_split

#Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    prices,
    test_size=0.2,
    random_state=42,
)

print("Training and testing split was successful.")

In [None]:
#Produce learning curves for varying training set sizes and maximum depths
#(done by the local 'visuals' helper module; note that it is given the full dataset, not the training split)
vs.ModelLearning(features, prices)

In [None]:

#Run the 'visuals' helper's model-complexity routine on the training split only
#NOTE(review): presumably plots score against tree depth -- confirm in visuals.py
vs.ModelComplexity(X_train, y_train)

In [None]:
#Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def fit_model(X, y):
    """Grid-search the 'max_depth' of a decision tree regressor and return the best model.

    Parameters
    ----------
    X : feature data, passed unchanged to GridSearchCV.fit
    y : target values, passed unchanged to GridSearchCV.fit

    Returns
    -------
    The 'best_estimator_' of the fitted grid search: the DecisionTreeRegressor
    whose 'max_depth' (1 through 10) scored best under 'performance_metric'.
    """
    #Cross-validation splitter: 10 shuffled splits, 20% held out in each, fixed seed
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    #Seed the tree as well: an unseeded tree breaks ties between equally good
    #splits at random, so the selected model could differ from run to run
    regressor = DecisionTreeRegressor(random_state=0)

    #Candidate tree depths 1..10
    params = {'max_depth': list(range(1, 11))}

    #Wrap the notebook's scoring function so the grid search can call it
    scoring_fnc = make_scorer(performance_metric)

    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)

    #fit() trains in place, so there is no need to rebind the result
    grid.fit(X, y)

    #Return the optimal model after fitting the data
    return grid.best_estimator_

In [None]:
#Run the grid search on the training split and keep the estimator it selects
reg = fit_model(X_train, y_train)

#Report the tree depth of the selected estimator
depth_message = "Parameter 'max_depth' is {} for the optimal model."
print(depth_message.format(reg.get_params()['max_depth']))

In [None]:
#One row of feature values per client, in the column order the model was trained on
client_data = [
    [5, 17, 15],  # Client 1
    [4, 32, 22],  # Client 2
    [8, 3, 12],   # Client 3
]

#Predict all clients in one call, then print one line per client (numbered from 1)
predicted_prices = reg.predict(client_data)
for client_number, price in enumerate(predicted_prices, start=1):
    print("Predicted price for Client {}'s house: ${:,.2f}".format(client_number, price))

In [None]:
#Run Predictions
#Hands the full dataset, the fit_model function itself and the client rows to the 'visuals' helper
#NOTE(review): presumably refits on several different splits to show how much a prediction varies -- confirm in visuals.py
vs.PredictTrials(features, prices, fit_model, client_data)