# Neural Network Exercise (Core)
- Zach Hanson

## Importing Libraries and Data

### Libraries 

In [24]:
#Pandas, numpy
import pandas as pd
import numpy as np

#Graphing
import matplotlib.pyplot as plt
import seaborn as sns

#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer

#Metrics
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, \
                            precision_score, recall_score, accuracy_score, \
                            f1_score, ConfusionMatrixDisplay, \
                            classification_report

#TensorFlow and Keras
import tensorflow as tf
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense

#Reset any existing Keras state, then seed NumPy and TensorFlow with the
#same value so repeated runs are comparable
SEED = 42
keras.backend.clear_session()
np.random.seed(SEED)
tf.random.set_seed(SEED)

#Global scikit-learn configuration:
#show estimators and pipelines as diagrams in notebook output
from sklearn import set_config
set_config(display='diagram')

### Data

In [25]:
#Loading data: read the training CSV (the path is relative to the working directory)
filename = 'train.csv'
df = pd.read_csv(filename)
#Preview the first five rows
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [26]:
#Dimensions of the raw data as (rows, columns)
df.shape

(1460, 81)

- 1460 rows and 81 columns in this data (including the `Id` column and the `SalePrice` target)

## Functions

In [27]:
def plot_history(history):
    """Plot each training metric from a Keras history, with its validation curve.

    One figure is shown per training metric. When the history also contains
    a matching ``val_<metric>`` series, that series is drawn on the same
    figure so the two curves can be compared.

    Parameters
    ----------
    history : object with a ``history`` attribute
        ``history.history`` must be a dict mapping each metric name to its
        sequence of per-epoch values.

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    metrics = history.history

    for metric, values in metrics.items():
        # Validation series are drawn together with their training metric
        # below. Only the 'val_' prefix marks a validation series; a plain
        # substring test would also skip training metrics whose names
        # merely contain 'val'.
        if metric.startswith('val_'):
            continue

        plt.plot(values, label=f'{metric}')
        val_metric = f'val_{metric}'
        if val_metric in metrics:
            plt.plot(metrics[val_metric], label=val_metric)
        plt.legend()
        plt.title(f'{metric}')
        plt.show()
        
def eval_regression(true, pred, name='Model'):
    """Score regression predictions against the true target values.

    Returns a one-row dataframe whose index (labelled 'Model Name') holds
    ``name`` and whose columns are RMSE, MAE and R2.
    """
    rmse = np.sqrt(mean_squared_error(true, pred))
    mae = mean_absolute_error(true, pred)
    r2 = r2_score(true, pred)

    results = pd.DataFrame({'Model Name': [name],
                            'RMSE': [rmse],
                            'MAE': [mae],
                            'R2': [r2]})
    return results.set_index('Model Name')

def eval_classification(true, pred, name='Model', labels=None, average='binary'):
    """Report and score classification predictions.

    Prints ``name`` followed by the classification report, shows the
    confusion matrix, and returns a one-row dataframe of summary metrics.

    Parameters
    ----------
    true : array-like
        True class labels.
    pred : array-like
        Predicted class labels.
    name : str, default 'Model'
        Printed as a heading and used as the row label of the result.
    labels : list of str, optional
        Display names forwarded to the classification report
        (``target_names``) and the confusion matrix (``display_labels``).
    average : str, default 'binary'
        Averaging mode forwarded to precision, recall and F1. The default
        keeps the previous behaviour. Pass e.g. 'macro' or 'weighted' for
        multiclass targets, which scikit-learn rejects under 'binary'.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` (index label 'Model Name') with the
        columns Precision, Recall, F1 Score and Accuracy.
    """
    print(name, '\n')
    print(classification_report(true, pred, target_names=labels))
    ConfusionMatrixDisplay.from_predictions(true, pred, display_labels=labels)
    plt.show()

    scores = pd.DataFrame({
        'Model Name': [name],
        'Precision': [precision_score(true, pred, average=average)],
        'Recall': [recall_score(true, pred, average=average)],
        'F1 Score': [f1_score(true, pred, average=average)],
        # Accuracy is computed the same way for every averaging mode.
        'Accuracy': [accuracy_score(true, pred)],
    })

    return scores.set_index('Model Name')

## Cleaning


### Unnecessary Rows


In [28]:
#Checking for duplicates: count rows that exactly repeat an earlier row
df.duplicated().sum()

0

- No duplicated rows


### Checking for missing values


In [29]:
#Count the missing values in every column
df.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

- Too many columns to view all at once

In [30]:
#List only the columns that contain at least one missing value
missing_counts = df.isna().sum()
for column, n_missing in missing_counts[missing_counts >= 1].items():
    print(f"Column: {column}")
    print(f"Number of missing values: {n_missing}")
    print('\n')


Column: LotFrontage
Number of missing values: 259


Column: Alley
Number of missing values: 1369


Column: MasVnrType
Number of missing values: 8


Column: MasVnrArea
Number of missing values: 8


Column: BsmtQual
Number of missing values: 37


Column: BsmtCond
Number of missing values: 37


Column: BsmtExposure
Number of missing values: 38


Column: BsmtFinType1
Number of missing values: 37


Column: BsmtFinType2
Number of missing values: 38


Column: Electrical
Number of missing values: 1


Column: FireplaceQu
Number of missing values: 690


Column: GarageType
Number of missing values: 81


Column: GarageYrBlt
Number of missing values: 81


Column: GarageFinish
Number of missing values: 81


Column: GarageQual
Number of missing values: 81


Column: GarageCond
Number of missing values: 81


Column: PoolQC
Number of missing values: 1453


Column: Fence
Number of missing values: 1179


Column: MiscFeature
Number of missing values: 1406




- We can see that the columns "Alley", "PoolQC", "Fence" and "MiscFeature" are missing values in almost every row, so they are not going to be useful in our predictions
- The column "FireplaceQu" is missing almost half of its values (690 of 1460), so it is most likely not useful in our predictions either

### Unnecessary Columns

In [31]:
#Dropping unnecessary columns: Id plus the sparsely populated columns noted above
cols_to_drop = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']
df = df.drop(columns=cols_to_drop)
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [32]:
#Double checking our missing values
remaining_missing = df.isna().sum()
for column in remaining_missing.index[remaining_missing >= 1]:
    print(f"Column: {column}")
    print(f"Number of missing values: {remaining_missing[column]}")
    print('\n')

Column: LotFrontage
Number of missing values: 259


Column: MasVnrType
Number of missing values: 8


Column: MasVnrArea
Number of missing values: 8


Column: BsmtQual
Number of missing values: 37


Column: BsmtCond
Number of missing values: 37


Column: BsmtExposure
Number of missing values: 38


Column: BsmtFinType1
Number of missing values: 37


Column: BsmtFinType2
Number of missing values: 38


Column: Electrical
Number of missing values: 1


Column: GarageType
Number of missing values: 81


Column: GarageYrBlt
Number of missing values: 81


Column: GarageFinish
Number of missing values: 81


Column: GarageQual
Number of missing values: 81


Column: GarageCond
Number of missing values: 81




- The columns we wanted to drop were successfully dropped; we will deal with the remaining missing values later, during preprocessing

## Preprocessing

### Defining Target and Features

In [33]:
#The target is the sale price; every other remaining column is a feature
target = 'SalePrice'
X = df.drop(columns=target).copy()
y = df[target].copy()

### Splitting Data

In [34]:
#Hold out a test set; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Column Selectors

In [35]:
#Numerical Selector: picks the columns with a numeric dtype
num_selector = make_column_selector(dtype_include='number')

#Categorical Selector: picks the columns with the object dtype
cat_selector = make_column_selector(dtype_include='object')

### Imputers

In [36]:
#Imputer for missing numerical data: fill with the column median
med_imputer = SimpleImputer(strategy='median')

#Imputer for missing categorical data: fill with the literal string 'missing'
miss_imputer = SimpleImputer(strategy='constant', fill_value='missing')

### Transformers


In [37]:
#Standardize the numeric features
scaler = StandardScaler()

#One-hot encode the categorical features as a dense array; categories not
#seen while fitting are ignored instead of raising an error.
#scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
#name, so try the new keyword first and fall back for older installs.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

### Pipelines

In [38]:
#Numerical Pipeline: apply med_imputer first, then scaler
num_pipe = make_pipeline(med_imputer, scaler)
#Display the pipeline
num_pipe

In [39]:
#Categorical Pipeline: apply miss_imputer first, then ohe
cat_pipe = make_pipeline(miss_imputer, ohe)
#Display the pipeline
cat_pipe

### Tuples


In [40]:
#Numerical Tuple: (pipeline, selector for the columns it applies to)
num_tuple = (num_pipe, num_selector)

#Categorical Tuple: (pipeline, selector for the columns it applies to)
cat_tuple = (cat_pipe, cat_selector)

### Column Transformer

In [41]:
#Route each column group through its pipeline; remainder='drop' discards
#any column that neither selector picks up
preprocessor = make_column_transformer(num_tuple,
                                       cat_tuple,
                                       remainder='drop')
#Display the combined transformer
preprocessor

### Fitting Preprocessor

In [42]:
#Fit on the training split only, then apply the same fitted transform to both splits.
#NOTE: X_train and X_test are overwritten with the transformed output, so this
#cell should not be re-run without first re-running the train/test split.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)