# <center>__Assignment 5__</center>

In [2]:
"""import necessary libraries"""

import numpy as np
import pandas as pd
from sklearn import datasets as data
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from scikeras.wrappers import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

## __*Part 1: Implement a Perceptron*__

*Objectives*

- Implement code from Assignment 2 to prepare Diabetes data
- Given the diabetes dataset you used during Assignment 2, implement an
MLP Regressor.

In [3]:
"""load diabetes dataset from scikit-learn and print dataset description"""

diabetes_bunch = data.load_diabetes(as_frame=True)
diabetes_df = diabetes_bunch['data']
diabetes_df['disease_proression'] = diabetes_bunch.target
print(diabetes_bunch['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [4]:
"""Check data types for features in the diabetes data, check for missing data"""

diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 442 non-null    float64
 1   sex                 442 non-null    float64
 2   bmi                 442 non-null    float64
 3   bp                  442 non-null    float64
 4   s1                  442 non-null    float64
 5   s2                  442 non-null    float64
 6   s3                  442 non-null    float64
 7   s4                  442 non-null    float64
 8   s5                  442 non-null    float64
 9   s6                  442 non-null    float64
 10  disease_proression  442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [None]:
"""
print descriptive statistics for numeric features
note all features have been scaled by default
"""

pd.set_option('display.float_format', lambda x : '%.5f' % x)
diabetes_df.agg(func=['min', 'median', 'max', 'mean', 'var', 'std',
                      'skew', 'kurt'])

In [6]:
"split the data into training and test sets"

X = diabetes_df.iloc[:, :-1]
y = diabetes_df.iloc[:, -1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,
                                                    shuffle=True, random_state=14)

In [7]:
"""implement the multilayer perceptron regressor using sklearn"""

# instantiate the mltilayer perceptron regressor
mlpr = MLPRegressor(activation='relu', solver='sgd', max_iter=2000, random_state=42)
# train the network
mlpr.fit(X_train, y_train.to_numpy().flatten())
# make predictions on the training and test sets
y_train_pred = mlpr.predict(X_train)
y_test_pred = mlpr.predict(X_test)

In [8]:
"""Evaluate performance against the training set"""

rmse = mean_squared_error(y_train, y_train_pred, squared=False)
r2 = r2_score(y_train.to_numpy().flatten(), y_train_pred)
print("Root Mean Squared Error: {:.4f}".format(rmse))
print("R-squared: {:.4f}".format(r2))

Root Mean Squared Error: 53.3617
R-squared: 0.5168


In [9]:
"""Evaluate performance against the test set"""

rmse = mean_squared_error(y_test, y_test_pred, squared=False)
r2 = r2_score(y_test.to_numpy().flatten(), y_test_pred)
print("Root Mean Squared Error: {:.4f}".format(rmse))
print("R-squared: {:.4f}".format(r2))

Root Mean Squared Error: 55.1027
R-squared: 0.4967


## __*Part 2: Implement a Keras Classifier*__

*Objectives*

- Implement code from Assignment 3 to prepare Titanic data.
- Given the prepared Titanic dataset from Assignment 3, implement a Keras
sequential classifier with relu activation functions.

Implement code from assignment to prepare Titanic data.

In [10]:
"""load the data from Github"""
! curl https://raw.githubusercontent.com/arjayit/cs4432_data/master/train.csv --output titanic.csv
titanic = pd.read_csv('titanic.csv')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 61194  100 61194    0     0   136k      0 --:--:-- --:--:-- --:--:--  138k


In [11]:
"""drop the cabin feature from the titanic dataframe"""

titanic.drop(columns=['Cabin'], inplace=True)

In [12]:
"""
Impute missing values in the age feature using median age value of a passengers
respective boarding class. Boolean index used instead of pd.apply, faster and easier syntax
"""

median_vals = titanic[['Age', 'Pclass']].groupby(by=['Pclass']).median()
titanic.loc[(titanic['Pclass'] == 1) & (titanic['Age'].isna()), 'Age'] = median_vals['Age'][1]
titanic.loc[(titanic['Pclass'] == 2) & (titanic['Age'].isna()), 'Age'] = median_vals['Age'][2]
titanic.loc[(titanic['Pclass'] == 3) & (titanic['Age'].isna()), 'Age'] = median_vals['Age'][3]
titanic.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [13]:
"""drop remaining rows with missing values"""

titanic.dropna(axis=0,  inplace=True)
titanic.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [14]:
"""
Drop features name, ticket, and passengerid
get dummies for remaining categorical features then drop original columns
"""

titanic.drop(columns=['Name', 'Ticket', 'PassengerId'], inplace=True)
categorical_idx = titanic.select_dtypes(exclude=['int64', 'float64']).columns
numeric_idx = titanic[['Age', 'Fare']].columns
dummy_vars = pd.get_dummies(titanic[categorical_idx])
titanic = pd.concat([titanic, dummy_vars], axis=1)
titanic.drop(columns=categorical_idx, inplace=True)
sc = StandardScaler()
titanic[numeric_idx] = sc.fit_transform(titanic[numeric_idx])

In [15]:
"""
create the feature set by copying target label to new dataframe and dropping from features
split data into traning and test sets
"""
y = titanic['Survived'].copy()
titanic.drop(columns=['Survived'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(titanic, y,test_size=.30,
                                                    random_state=14)
# convert training and test sets to numpy array
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy().flatten()
y_test = y_test.to_numpy().flatten()

- Given the prepared Titanic dataset from Assignment 3, implement a Keras
sequential classifier with relu activation functions.

In [16]:
"""instantiate the keras model"""

sequential_classifier = Sequential()
# input layer
sequential_classifier.add(Dense(300, activation='relu', input_dim=10))
# two hidden layers
sequential_classifier.add(Dense(100, activation='relu'))
sequential_classifier.add(Dense(10, activation='relu'))
# output layer
sequential_classifier.add(Dense(1, activation='sigmoid'))
# compile the network model
sequential_classifier.compile(optimizer='adam', loss='binary_crossentropy',
                              metrics=['accuracy'])
# fit the network model
sequential_classifier.fit(X_train, y_train, epochs=25)



Epoch 1/25


2022-08-14 16:19:38.016898: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x140a28820>

In [17]:
"""assess model performance on the training set"""

scores = sequential_classifier.evaluate(X_train, y_train, verbose=0)

print('Accuracy on training data: {}%'.format(scores[1]))

Accuracy on training data: 0.8376205563545227%


In [18]:
"""assess model performance on the test set"""

scores = sequential_classifier.evaluate(X_test, y_test, verbose=0)

print('Accuracy on test data: {}%'.format(scores[1]))

Accuracy on test data: 0.8164793848991394%


## __*Part 3: Implement a Keras Regresssor*__

*Objectives*

- Implement code from assignment 4 to load and prepare the bike-share data.
- Given the prepared bike-share dataset from Assignment 4, implement a
Keras sequential regressor with relu activation functions.

Implement code from assignment 4 to load and prepare the bike-share data

In [39]:
"""load the data and data descritption from Github"""

! curl https://raw.githubusercontent.com/arjayit/cs4432_data/master/bike_share_hour.csv --output bike_share_hour.csv
! curl https://raw.githubusercontent.com/arjayit/cs4432_data/master/bike_share_Readme.txt --output bike_share_Readme.txt
bike_share = pd.read_csv('bike_share_hour.csv')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1129k  100 1129k    0     0  3132k      0 --:--:-- --:--:-- --:--:-- 3209k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5607  100  5607    0     0  29943      0 --:--:-- --:--:-- --:--:-- 31150


In [42]:
"""print the data description"""

bike_share.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [44]:
"""convert categorical features to dtype category"""

# specify index (colnames) of categorical features
cat_features = ['season', 'holiday', 'weekday', 'workingday', 'weathersit', 'mnth', 'yr', 'hr']
# convert features
bike_share[cat_features] = bike_share[cat_features].astype('category')
bike_share.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   instant     17379 non-null  int64   
 1   dteday      17379 non-null  object  
 2   season      17379 non-null  category
 3   yr          17379 non-null  category
 4   mnth        17379 non-null  category
 5   hr          17379 non-null  category
 6   holiday     17379 non-null  category
 7   weekday     17379 non-null  category
 8   workingday  17379 non-null  category
 9   weathersit  17379 non-null  category
 10  temp        17379 non-null  float64 
 11  atemp       17379 non-null  float64 
 12  hum         17379 non-null  float64 
 13  windspeed   17379 non-null  float64 
 14  casual      17379 non-null  int64   
 15  registered  17379 non-null  int64   
 16  cnt         17379 non-null  int64   
dtypes: category(8), float64(4), int64(4), object(1)
memory usage: 1.3+ MB


In [45]:
"""scale numeric features"""

#instantiate StandardScaler
scaler = StandardScaler()
# get numeric features index
numeric_idx = bike_share[['instant', 'temp', 'atemp', 'hum', 'windspeed','casual', 'registered']].columns
# scale the data
bike_share[numeric_idx] = scaler.fit_transform(bike_share[numeric_idx])

In [46]:
"""drop select features from the dataset"""

bike_share.drop(columns=['casual', 'registered', 'dteday', 'instant'], inplace=True)

In [47]:
"""get dummy vectors for categorical features"""

# get dummy vectors, original columns are automatically transformed
bike_share = pd.get_dummies(bike_share)
# reset index of target variable "cnt"
bike_share = bike_share.reindex(columns=[col for col in bike_share.columns if col != 'cnt'] + ['cnt'])

In [48]:
"""split the data into training and test sets"""

X_train, X_test, y_train, y_test = train_test_split(bike_share.iloc[:,:-1], bike_share.iloc[:, -1:],
                                                    test_size=0.33, random_state=14)
# convert data to numpy array for use with Keras
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy().flatten()
y_test.to_numpy().flatten()

array([128, 174, 275, ..., 188, 446,   8])

Given the prepared bike-share dataset from Assignment 4, implement a
Keras sequential regressor with relu activation functions

In [49]:
"""instantiate baseline Keras regressor"""

#instatniate the model
sequential_regressor = Sequential()
# input layer
sequential_regressor.add(Dense(300, activation='relu', input_dim=61))
# two hidden layers
sequential_regressor.add(Dense(100, activation='relu'))
sequential_regressor.add(Dense(10, activation='relu'))
# output layer
sequential_regressor.add(Dense(1, activation='relu'))
# compile the network model
sequential_regressor.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
# fit the network model
sequential_regressor.fit(X_train, y_train, verbose=0)

<keras.callbacks.History at 0x1447fdfd0>

In [50]:
"""Evaluate performance against the training set"""

# make predictins on the training set
y_train_pred = sequential_regressor.predict(X_train)
rmse = mean_squared_error(y_train, y_train_pred, squared=False)
r2 = r2_score(y_train, y_train_pred)
print("Root Mean Squared Error: {:.4f}".format(rmse))
print("R-squared: {:.4f}".format(r2))

Root Mean Squared Error: 96.1205
R-squared: 0.7209


In [51]:
"""Evaluate performance against the test set"""

# make predictins on the training set
y_test_pred = sequential_regressor.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
r2 = r2_score(y_test, y_test_pred)
print("Root Mean Squared Error: {:.4f}".format(rmse))
print("R-squared: {:.4f}".format(r2))

Root Mean Squared Error: 97.7285
R-squared: 0.7060


## __*Part 4: Tune Your Keras Regressor*__

*Objectives*

- Tune your Keras regressor from Part 3 by implementing a grid search with
different optimizers.

In [62]:
"""perform grid search cross validation with sklearn"""

network = KerasRegressor(model=sequential_regressor, random_state=42, verbose=False)
# create search grid and instantiate GridSearchCV object
params = {'epochs': [10, 15, 20], 'optimizer': ['SGD', 'Adam', 'Adadelta']}
network_search_grid = GridSearchCV(network, param_grid=params, cv=3)
# fit the grid
network_search_grid.fit(X_train, y_train)

INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpq58_bf7a/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpycco04ra/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpky8k7bs3/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmphhlalhee/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmppqgaoba5/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpebgtta3i/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpy8hxi36w/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpgc4iqf7f/assets
INFO:tensorflow:Assets written to: ram:///var/folders/s4/yckpqxmx6cnf837zqjqn_hxc0000gn/T/tmpvqwi1t2i/assets
INFO:tensorflow:Ass

In [63]:
"""print the best parameters"""

network_search_grid.best_params_

{'epochs': 20, 'optimizer': 'SGD'}

In [64]:
"""Evaluate performance against the training set"""

# make predictins on the training set
y_train_pred = network_search_grid.predict(X_train)
rmse = mean_squared_error(y_train, y_train_pred, squared=False)
r2 = r2_score(y_train, y_train_pred)
print("Root Mean Squared Error: {:.4f}".format(rmse))
print("R-squared: {:.4f}".format(r2))

Root Mean Squared Error: 30.9682
R-squared: 0.9710


In [65]:
"""Evaluate performance against the test set"""

# make predictins on the training set
y_test_pred = network_search_grid.predict(X_test)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)
r2 = r2_score(y_test, y_test_pred)
print("Root Mean Squared Error: {:.4f}".format(rmse))
print("R-squared: {:.4f}".format(r2))

Root Mean Squared Error: 40.8688
R-squared: 0.9486
