In [60]:
# Data is from Kaggle.com
import pandas as pd

In [76]:
# Preprocessing and Exploration
#load the dataset
dataset = pd.read_csv('insurance.csv') 
#first 7 columns as features
features = dataset.iloc[:,0:6] 
#choose the final column for prediction
labels = dataset.iloc[:,-1] 
print(features.head())
#print the number of features in the dataset
print("Number of features:", features.shape[1]) 
#print the number of samples in the dataset
print("Number of samples: ", features.shape[0]) 
#summary statistics for numeric features
print(features.describe().round(2)) 
print(features.info())
print(labels.shape)
print(labels.describe().round(2))
#print(dataset.isna().sum())

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest
Number of features: 6
Number of samples:  1338
           age      bmi  children
count  1338.00  1338.00   1338.00
mean     39.21    30.66      1.09
std      14.05     6.10      1.21
min      18.00    15.96      0.00
25%      27.00    26.30      0.00
50%      39.00    30.40      1.00
75%      51.00    34.69      2.00
max      64.00    53.13      5.00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
#from sklearn.preprocessing import LabelEncoder OHE accepts strings since v 0.2
np.random.seed(42)


In [78]:
#load the dataset
dataset = pd.read_csv('insurance.csv')
#choose first 7 columns as features
features = dataset.iloc[:,0:6]
features2 = dataset.iloc[:,0:6]
#choose the final column for prediction
labels = dataset.iloc[:,-1]
#Many models other than decision trees only work with numeric features.
#One-hot encoding creates a binary column for each category.
#Two common ways of doing this is pandas.get_dummies method on categorical columns or LabelEncoder + OnehotEncoder
features = pd.get_dummies(features) 
# Next cell covers a Pipeline for ColumTransformer over get_dummies. ,
# Get dummies is more visual to work with as it also returns a dataframe with column names.

#le = LabelEncoder()

#split the data into training and test data,
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.33, random_state=42) # 42 because Galaxies are fun,


In [64]:
# By having features with differing scales, the optimizer might update some weights faster than the others.
# To remedy this we can use for example Normalization or Standardization
# Case 1: Normalize the numeric columns Normalization is scales the numerical features to a fixed range - usually between 0 and 1,
ct = ColumnTransformer([('normalize', Normalizer(), ['age', 'bmi', 'children'])], remainder='passthrough')
#fit the normalizer to the training data and convert from numpy arrays to pandas frame
features_train_norm = ct.fit_transform(features_train) 
#applied the trained normalizer on the test data and convert from n umpy arrays to pandas frame,
features_test_norm = ct.transform(features_test) 
# Note: By applying the transformer after, we get punished for not having large enough dataset.
# If we do not have enough data to really account for the broad specturum of variability. 
# The split will have many new types of outliers in our test set and really punished a model in evaluation.

#ColumnTransformer returns numpy arrays. Convert the features to dataframes
features_train_norm = pd.DataFrame(features_train_norm, columns = features_train.columns)
features_test_norm = pd.DataFrame(features_test_norm, columns = features_test.columns)

In [65]:
# Case 2: StandardScaler() standardization that rescales features to zero mean and unit variance. 
my_ct = ColumnTransformer([('scale', StandardScaler(), ['age', 'bmi', 'children'])], remainder='passthrough')
# The column transformer object has the columns to work on in it.

features_train_scale = my_ct.fit_transform(features_train)
features_test_scale = my_ct.transform(features_test)

# Transformes the Scaled objects back to DataFrames from Numpy arrays.
features_train_scale = pd.DataFrame(features_train_scale, columns=features_train.columns)
features_test_scale = pd.DataFrame(features_test_scale, columns=features_test.columns)

print(features_train_scale.describe().round(2))
print(features_test_scale.describe().round(2)) # Summary statistics.

          age     bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \
count  896.00  896.00    896.00      896.00    896.00     896.00      896.00   
mean    -0.00    0.00     -0.00        0.49      0.51       0.79        0.21   
std      1.00    1.00      1.00        0.50      0.50       0.41        0.41   
min     -1.49   -2.44     -0.91        0.00      0.00       0.00        0.00   
25%     -0.86   -0.71     -0.91        0.00      0.00       1.00        0.00   
50%     -0.02   -0.05     -0.08        0.00      1.00       1.00        0.00   
75%      0.90    0.66      0.75        1.00      1.00       1.00        0.00   
max      1.74    3.78      3.24        1.00      1.00       1.00        1.00   

       region_northeast  region_northwest  region_southeast  region_southwest  
count            896.00            896.00            896.00            896.00  
mean               0.26              0.25              0.26              0.24  
std                0.44              0.

In [66]:
# Pipeline Version of the above. The below object would be even better Object oriented programming wise
ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'), ['age', 'bmi', 'children']),
('cat', Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
('onehot',OneHotEncoder(handle_unknown='ignore'))]),['sex', 'smoker', 'region'])])

ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 ['age', 'bmi', 'children']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['sex', 'smoker', 'region'])])

In [67]:
#load the dataset
dataset2 = pd.read_csv('insurance.csv') 
#choose first 7 columns as features
features2 = dataset.iloc[:,0:6]
#choose the final column for prediction
target = dataset.iloc[:,-1] # popping is not a good idea here since we are doing parallell 
# Transformer Pipeline
X_train, X_val, y_train, y_val = train_test_split(features2, target, train_size=0.8, test_size=0.2, random_state=0)

numerical_cols = [col for col in features2.columns if features2[col].dtype in ['int64', 'float64']]
# or pandas .select_dtypes(['int64', 'float64'])
#categorical_cols = [col for col in inputs.columns if inputs[col].dtype==object]
categorical_cols = [col for col in features2.columns if features2[col].dtype==object and features2[col].nunique() <=10]
# Categorical colum does a check for Cardinality. If too many columns are created the dataset becomes very large.

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_val[my_cols].copy()

# PIPELINE OBJECTs are useful for reproducibility and Object oriented code.
# TAKES PREPROCESSOR/TRANSFORMER STEPs AND ONE ESTIMATOR STEP. Here it just takes a 

categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])

numerical_transformer = SimpleImputer(strategy='constant')

preprocessor = ColumnTransformer(transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)])

train_piped = preprocessor.fit(X_train)
test_piped = preprocessor.fit(X_val)
print(train_piped)

ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 ['age', 'bmi', 'children']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['sex', 'smoker', 'region'])])


In [68]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer,Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping



def linear_model(features): 
  model = Sequential(name = "linear model")
  num_features = features.shape[1]
  input = InputLayer(input_shape=(num_features,)) # local scope Built-in overwrite 
  model.add(input) #add the input layer
  model.add(Dense(1)) # The One output makes it a Regression model. A continous prediction per observation.
  return model

def linear_model2(features):
  model = Sequential(name= 'linear model 2')
  num_features = features.shape[1] # rows, columns
  input = InputLayer(input_shape=(num_features,))
  model.add(input)
  model.add(Dense(128, activation='relu')) # Binary number bases are easier to compute like 2^7 = 128
  # recitified linear unit allows for non-linear. It takes the input array and returns max(input, 0) array.
  model.add(Dense(1))
  opt = Adam(learning_rate=0.01) # General purpose learning optimizer 
  model.compile(loss='mse',  metrics=['mae'], optimizer=opt) 
  # Linear Regression Optimizer mimizing Mean Square Error. 
  # MSE punishes mistakes on outliers more than \normal\ data so far in the iterations
  # displaying Mean Absolute Error during Training. Easier To understand for Validation
  # To model on evaluation will return both MSE and MAE. How far we are on Each Metric
  # MSE, MAE = model.evaluate(params) Tuple
  return model

import warnings
warnings.filterwarnings('ignore')

In [69]:
# 896 split samples, 11 features as in our dataset
input = tf.ones((896, 11)) 
# a fully-connected layer with 8 neurons
layer = Dense(8)
# calculate the outputs
output = layer(input)
# print the weights
print(layer.weights) 
warnings.filterwarnings('ignore')

[<tf.Variable 'dense_10/kernel:0' shape=(11, 8) dtype=float32>, <tf.Variable 'dense_10/bias:0' shape=(8,) dtype=float32>]


In [70]:
model = linear_model(features_train_scale) # Using the Potentially Smaller DataSet adjusted for say Cardinality.,
print(model.summary())
# 11 inputs, to 9 connected + 3 biases => 12 params

Model: "linear model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 1)                 12        
Total params: 12
Trainable params: 12
Non-trainable params: 0
_________________________________________________________________
None


In [71]:
# Stops Training if No more improvments are made. 
# This is usually done on a validation set
# here I didn't use such a split so it rather acts as a stop for large number of epoch
# The utility is lower this way but the functionality is helpful with computational time.
# See my tensorflow_3_computational_reducibility for how I interpret and use this functionallity.
callbacks = [EarlyStopping(monitor='mean_absolute_error',min_delta=1e-2,patience=2, verbose=1)]

# Normally stop training when `val_loss` is no longer improving
# no longer improving being defined as \no better than 1e-2 less
# no longer improving being further defined as for at least 2 epochs


In [72]:
model2 = linear_model2(features_train_scale)
model2.fit(features_train_scale, labels_train, callbacks= callbacks, epochs=40, batch_size=1, verbose=0)
# The callback is also using MSE here
# Batch of 1 is maximum updates. With more data it should be increased. Vebrose = True for output #9600-> 2800
# Callbacks come in list for here defined on assignment

val_mse, val_mae = model2.evaluate(features_test_scale, labels_test, verbose=0) # Like predict + score in scikit learn,
print(val_mae)

Epoch 00031: early stopping
2798.1626


In [73]:
prediction = model2.predict(np.array([features_test_scale.iloc[44],])) # .values.reshape goes to array anyway,
print(prediction)
print(dataset.iloc[44])

[[6316.572]]
age                38
sex              male
bmi             37.05
children            1
smoker             no
region      northeast
charges       6079.67
Name: 44, dtype: object


# Conclusion,
Decent performance on this task. We had a Skewed set of data with ,
a mean of 13270.42,
std  12110.01,
One of my metrics is the MAE / std ratio. And here it is about 24% which is way better than chancing it.,
Another alternative would be to use a simple linear regression. And use that as a baseline for the model. ,
The mean is used with in regression line for determining R2, the impact of the relationships. ,
If we have random data the mean and the regression line are the same but on many datasets a regression baseline style could be helpful for determining the usefulness of a Neural Network.,
Just like the mean another baseline to evaluate is the median.