# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,root_mean_squared_error, confusion_matrix, balanced_accuracy_score,accuracy_score,f1_score,precision_score,recall_score
from sklearn.linear_model import LinearRegression, LogisticRegression

## Regression Model Evaluation

In [4]:
# The raw file has no header row, so the column labels are supplied explicitly.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
# Fields are separated by runs of whitespace, hence the regex delimiter.
data = pd.read_csv(
    'housing.csv',
    delimiter=r"\s+",
    header=None,
    names=column_names,
)

In [6]:
"""
CRIM - per capita crime rate by town
ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS - proportion of non-retail business acres per town.
CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
NOX - nitric oxides concentration (parts per 10 million)
RM - average number of rooms per dwelling
AGE - proportion of owner-occupied units built prior to 1940
DIS - weighted distances to five Boston employment centres
RAD - index of accessibility to radial highways
TAX - full-value property-tax rate per $10,000
PTRATIO - pupil-teacher ratio by town
B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
LSTAT - % lower status of the population
MEDV - Median value of owner-occupied homes in $1000's"""

"\nCRIM - per capita crime rate by town\nZN - proportion of residential land zoned for lots over 25,000 sq.ft.\nINDUS - proportion of non-retail business acres per town.\nCHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)\nNOX - nitric oxides concentration (parts per 10 million)\nRM - average number of rooms per dwelling\nAGE - proportion of owner-occupied units built prior to 1940\nDIS - weighted distances to five Boston employment centres\nRAD - index of accessibility to radial highways\nTAX - full-value property-tax rate per $10,000\nPTRATIO - pupil-teacher ratio by town\nB - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\nLSTAT - % lower status of the population\nMEDV - Median value of owner-occupied homes in $1000's"

In [8]:
# Bare expression: the notebook renders the housing DataFrame as this cell's output.
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


## 1. Split this data set into training (80%) and testing (20%) sets.

The `MEDV` field represents the median value of owner-occupied homes (in $1000's) and is the target variable that we will want to predict.

In [11]:
# Your code here :
# Features: every column except the target.
X = data.drop(columns="MEDV")
# Target: MEDV, the column the regression model will predict.
y = data["MEDV"]

In [13]:
# 80/20 split; the fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Report the (rows, columns) shape of each feature split as a sanity check on the split above.
print("Training Dataset Size : {}".format(X_train.shape))
print("Test Dataset Size : {}".format(X_test.shape))

Training Dataset Size : (404, 13)
Test Dataset Size : (102, 13)


## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [18]:
# Your code here :
# Instantiate, then fit on the training split. fit() returns the estimator
# itself, so this is equivalent to chaining the two calls.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the rows the model was fitted on.
y_train_preds = model.predict(X_train)

# Predictions on the held-out rows.
y_test_preds = model.predict(X_test)

## 3. Calculate and print R-squared for both the training and the testing set.

In [21]:
# Your code here :
# R-squared on the training and held-out splits (ground truth first, predictions second).
print("R2 Score for Training Set: {}".format(
    r2_score(y_true=y_train, y_pred=y_train_preds)
))
print("R2 Score for Test Set: {}".format(
    r2_score(y_true=y_test, y_pred=y_test_preds)
))

R2 Score for Training Set: 0.7508856358979672
R2 Score for Test Set: 0.6687594935356314


## 4. Calculate and print mean squared error for both the training and the testing set.

In [24]:
# Your code here :
# Mean squared error on the training and held-out splits.
print("MSE Score for Training Set: {}".format(
    mean_squared_error(y_true=y_train, y_pred=y_train_preds)
))
print("MSE Score for Test Set: {}".format(
    mean_squared_error(y_true=y_test, y_pred=y_test_preds)
))

MSE Score for Training Set: 21.641412753226316
MSE Score for Test Set: 24.29111947497357


## 5. Calculate and print mean absolute error for both the training and the testing set.

In [27]:
# Your code here :
# Mean absolute error on the training and held-out splits.
print("MAE Score for Training Set: {}".format(
    mean_absolute_error(y_true=y_train, y_pred=y_train_preds)
))
print("MAE Score for Test Set: {}".format(
    mean_absolute_error(y_true=y_test, y_pred=y_test_preds)
))

MAE Score for Training Set: 3.314771626783228
MAE Score for Test Set: 3.1890919658878496


## 5b. Calculate and print root mean squared error for both the training and the testing set.

In [30]:
# Root mean squared error on the training and held-out splits.
print("RMSE Score for Training Set: {}".format(
    root_mean_squared_error(y_true=y_train, y_pred=y_train_preds)
))
print("RMSE Score for Test Set: {}".format(
    root_mean_squared_error(y_true=y_test, y_pred=y_test_preds)
))

RMSE Score for Training Set: 4.6520331848801675
RMSE Score for Test Set: 4.928602182665342


## Classification Model Evaluation

In [33]:
# NOTE(review): this import is local to the classification section; the other
# imports live in the first cell — consider moving it there.
from sklearn.datasets import load_iris
# Rebinds `data` (the housing DataFrame in the regression section) to the iris dataset object.
data = load_iris()

In [35]:
# Print the description text bundled with the dataset (attributes, classes, summary statistics).
print(data.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [37]:
# Use the dataset's own feature names as column labels; this overwrites the
# housing `column_names` list defined in the regression section.
column_names = data.feature_names

In [39]:
# Wrap the raw feature matrix in a DataFrame labelled with the feature names.
df = pd.DataFrame(data.data, columns=column_names)

In [41]:
# Bare expression: the notebook renders the iris feature table as this cell's output.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [43]:
# Keep the class labels one-dimensional. scikit-learn classifiers expect y of
# shape (n_samples,); a single-column DataFrame is a column vector, which makes
# LogisticRegression.fit() raise a DataConversionWarning (silenced in this
# notebook by the blanket warnings filter). A Series splits, fits and scores
# identically without that conversion.
target = pd.Series(data.target, name="target")

In [45]:
# List the fields available on the loaded dataset object.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [47]:
# Human-readable class names; presumably index-aligned with the integer labels
# in data.target — confirm against the scikit-learn documentation.
data['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

## 6. Split this data set into training (80%) and testing (20%) sets.

The `class` field represents the type of flower and is the target variable that we will want to predict.

In [50]:
# Your code here :
# 80/20 split of the iris features and labels; the fixed random_state keeps it
# reproducible. These names replace the regression splits created earlier.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=42,
)
# Shape check on the resulting feature splits.
print("Training Dataset Size : {}".format(X_train.shape))
print("Test Dataset Size : {}".format(X_test.shape))

Training Dataset Size : (120, 4)
Test Dataset Size : (30, 4)


## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [55]:
# Your code here :
# Instantiate with default hyperparameters, then fit on the training split.
# fit() returns the estimator itself, so this matches the chained form.
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

# Predicted classes for the rows the model was fitted on.
y_train_preds = log_model.predict(X_train)

# Predicted classes for the held-out rows.
y_test_preds = log_model.predict(X_test)

## 8. Calculate and print the accuracy score for both the training and the testing set.

In [58]:
# Your code here :
# Fraction of exactly matching labels on each split.
print("Accuracy Score for Training Set: {}".format(
    accuracy_score(y_true=y_train, y_pred=y_train_preds)
))
print("Accuracy Score for Test Set: {}".format(
    accuracy_score(y_true=y_test, y_pred=y_test_preds)
))

Accuracy Score for Training Set: 0.975
Accuracy Score for Test Set: 1.0


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [61]:
# Your code here :
# Balanced accuracy on each split.
print("Balanced Accuracy Score for Training Set: {}".format(
    balanced_accuracy_score(y_true=y_train, y_pred=y_train_preds)
))
print("Balanced Accuracy Score for Test Set: {}".format(
    balanced_accuracy_score(y_true=y_test, y_pred=y_test_preds)
))

Balanced Accuracy Score for Training Set: 0.975609756097561
Balanced Accuracy Score for Test Set: 1.0


## 10. Calculate and print the precision score for both the training and the testing set.

In [63]:
# Your code here :
# Multiclass precision; average='weighted' is passed because the target has three classes.
print("Precision Score for Training Set: {}".format(
    precision_score(y_true=y_train, y_pred=y_train_preds, average='weighted')
))
print("Precision Score for Test Set: {}".format(
    precision_score(y_true=y_test, y_pred=y_test_preds, average='weighted')
))

Precision Score for Training Set: 0.9767857142857144
Precision Score for Test Set: 1.0


## 11. Calculate and print the recall score for both the training and the testing set.

In [66]:
# Your code here :
# Multiclass recall, using the same 'weighted' averaging as the precision cell.
print("Recall Score for Training Set: {}".format(
    recall_score(y_true=y_train, y_pred=y_train_preds, average='weighted')
))
print("Recall Score for Test Set: {}".format(
    recall_score(y_true=y_test, y_pred=y_test_preds, average='weighted')
))

Recall Score for Training Set: 0.975
Recall Score for Test Set: 1.0


## 12. Calculate and print the F1 score for both the training and the testing set.

In [68]:
# Your code here :
# Multiclass F1, using the same 'weighted' averaging as precision and recall.
print("F1 Score for Training Set: {}".format(
    f1_score(y_true=y_train, y_pred=y_train_preds, average='weighted')
))
print("F1 Score for Test Set: {}".format(
    f1_score(y_true=y_test, y_pred=y_test_preds, average='weighted')
))

F1 Score for Training Set: 0.9749882794186592
F1 Score for Test Set: 1.0


## 13. Generate confusion matrices for both the training and the testing set.

In [71]:
# Your code here :
# Confusion matrices: rows are true classes, columns are predicted classes.
print("Confusion Matrix for Training Set: {}".format(
    confusion_matrix(y_true=y_train, y_pred=y_train_preds)
))
print("Confusion Matrix for Test Set: {}".format(
    confusion_matrix(y_true=y_test, y_pred=y_test_preds)
))

Confusion Matrix for Training Set: [[40  0  0]
 [ 0 38  3]
 [ 0  0 39]]
Confusion Matrix for Test Set: [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
