In [None]:
# Import libraries. You may or may not use all of these.
!pip install -q git+https://github.com/tensorflow/docs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

from sklearn.linear_model import LinearRegression

In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# `df` is bound to the same DataFrame object as `dataset` (no copy is made).
df = dataset
df.head()

In [None]:
# Report the number of missing values per column. The counts are printed
# explicitly because a notebook only auto-displays a cell's last expression.
print(df.isna().sum())
# Keep only the rows that have a value in every column.
df = df.dropna()



In [None]:
# @title region

from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of the number of rows per region.
region_counts = df.groupby('region').size()
region_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Hide the top and right borders of the plot frame.
region_axes = plt.gca()
region_axes.spines[['top', 'right',]].set_visible(False)

In [None]:
# convert categorical data to numbers
numeric_var = {"sex": {"male":0, "female":1}, "smoker":{"yes":1,"no":0}}
# Map every label explicitly and cast to int64 instead of relying on
# DataFrame.replace to downcast the object columns: that silent downcast is
# deprecated in recent pandas, and without it the columns stay object-typed,
# which breaks the numeric steps further down. Values that are not in the
# mapping pass through unchanged, so re-running this cell is harmless, while
# an unexpected label makes astype() fail loudly.
df = df.assign(**{
    column: df[column].map(lambda value: codes.get(value, value)).astype('int64')
    for column, codes in numeric_var.items()
})
#print(df)

# One 0/1 indicator column per distinct region value.
dummies = pd.get_dummies(df.region, dtype=int)

# Attach the indicators to the frame, then drop the original text column and
# one indicator ('southwest'): it acts as the reference category, since its
# value is implied by the remaining region indicators.
merged = pd.concat([df, dummies], axis='columns').drop(columns=['region', 'southwest'])

print(merged)

In [None]:
# Split into train and test sets: a seeded 80% sample of the rows is used for
# training, and every row that was not sampled becomes the test set.
train_dataset = merged.sample(random_state=0, frac=0.8)
test_dataset = merged.drop(index=train_dataset.index)


In [None]:
# Pairwise relationships between the label and a few features, with KDE
# curves on the diagonal.
pairplot_columns = ['expenses', 'age', 'sex', 'northeast']
sns.pairplot(train_dataset[pairplot_columns], diag_kind='kde')


In [None]:
# Summary statistics of the training set, one row per column.
train_dataset.describe().T


In [None]:
# Work on copies so that popping the label leaves train_dataset/test_dataset intact.
train_features, test_features = train_dataset.copy(), test_dataset.copy()

# pop() removes 'expenses' from the feature frames and returns it as the label.
train_labels, test_labels = train_features.pop('expenses'), test_features.pop('expenses')


In [None]:
# Inspect the 'expenses' labels that were popped from the training features.
print(train_labels)

In [None]:
# Mean and standard deviation of every training column. Printed explicitly:
# as a bare expression in the middle of the cell the result was discarded.
print(train_dataset.describe().transpose()[['mean', 'std']])

# Normalization layer whose statistics are computed from the training features.
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))
print(normalizer.mean.numpy())

# Compare the first training row before and after normalization.
first = np.array(train_features[:1])

with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalizer(first).numpy())



In [None]:
# List the feature columns left after the 'expenses' label was removed.
print(train_features.columns)

In [None]:
# Univariate linear regression: predict the label from 'age' alone.
age_lin_reg = np.array(train_features['age'])

# Normalization layer adapted to the age values only.
age_lin_reg_normalizer = layers.Normalization(input_shape=[1,], axis=None)
age_lin_reg_normalizer.adapt(age_lin_reg)

# Normalization followed by a single-unit Dense layer.
age_model = tf.keras.Sequential()
age_model.add(age_lin_reg_normalizer)
age_model.add(layers.Dense(units=1))

age_model.summary()

# Run the (still untrained) model on the first ten ages.
age_model.predict(age_lin_reg[:10])


In [None]:
# Mean absolute error as the loss, optimised with Adam.
age_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)

In [None]:
%%time
history = age_model.fit(
    train_features['age'],
    train_labels,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split = 0.2)


In [None]:
# Training history as a table, with the epoch number as an extra column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()


In [None]:
def plot_loss(history):
  """Draw the training and validation loss curves of one training run.

  Args:
    history: Object whose `.history` mapping holds the per-epoch 'loss' and
      'val_loss' sequences (as returned by the `fit` calls in this notebook).
  """
  for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name], label=curve_name)

  plt.xlabel('Epoch')
  plt.ylabel('Error [expenses]')
  plt.legend()
  plt.grid(True)


In [None]:
# Loss curves of the most recent fit (the single-feature age model).
plot_loss(history)


In [None]:
# Collects the test-set evaluation result of every model, keyed by model name.
test_results = {
    'age_model': age_model.evaluate(test_features['age'], test_labels, verbose=0),
}


In [None]:
# 101 evenly spaced ages from 0 to 100, and the age model's prediction at each.
x = tf.linspace(start=0.0, stop=100, num=101)
y = age_model.predict(x)


In [None]:
def plot_age(x, y):
  """Scatter the training ages against their labels and overlay a prediction line.

  Args:
    x: x-coordinates of the prediction line (ages).
    y: Predicted values corresponding to `x`.

  The scatter data comes from the notebook-level `train_features` and
  `train_labels`.
  """
  observed_ages = train_features['age']
  plt.scatter(observed_ages, train_labels, label='Data')
  plt.plot(x, y, label='Predictions', color='k')
  plt.xlabel('age')
  plt.ylabel('expenses')
  plt.legend()


In [None]:
# Overlay the Keras age-model predictions (y) on the training data.
plot_age(x, y)


In [None]:
# Inspect the 'age' column of the training features.
print(train_features['age'])

In [None]:
# scikit-learn univariate linear regression on 'age'.
# Double brackets keep the input as a one-column DataFrame.
reg = LinearRegression()
reg.fit(train_features[['age']], train_labels)

In [None]:
# Predict from a DataFrame that carries the same 'age' column name the model
# was fitted with; passing a bare NumPy array makes scikit-learn warn that X
# does not have valid feature names.
Y = reg.predict(pd.DataFrame({'age': x.numpy()}))

In [None]:
# Same plot, this time with the scikit-learn predictions (capital Y).
plot_age(x,Y)

In [None]:
# scikit-learn multivariate linear regression on all feature columns.
multi_reg = LinearRegression()
multi_reg.fit(train_features, train_labels)

In [None]:
# Keras multivariate linear regression: the all-feature normalizer followed by
# a single-unit Dense layer.
linear_model = tf.keras.Sequential()
linear_model.add(normalizer)
linear_model.add(layers.Dense(units=1))

# Run the (still untrained) model on the first ten training rows.
linear_model.predict(train_features[:10])


In [None]:
# Inspect the kernel of the Dense layer (second layer of the model); the model
# has not been compiled or fitted yet at this point.
linear_model.layers[1].kernel

In [None]:
# Mean absolute error as the loss, optimised with Adam.
linear_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
)


In [None]:
%%time
history = linear_model.fit(
    train_features,
    train_labels,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split = 0.2)


In [None]:
# Loss curves of the most recent fit (the multivariate linear model).
plot_loss(history)


In [None]:
# Test-set evaluation of the multivariate linear model.
linear_model_result = linear_model.evaluate(x=test_features, y=test_labels, verbose=0)
test_results['linear_model'] = linear_model_result


In [None]:
# NN
def build_and_compile_model(norm):
  """Build and compile a small fully connected regression network.

  Args:
    norm: Layer placed first in the model (the callers in this notebook pass
      an adapted Normalization layer).

  Returns:
    A compiled Sequential model: `norm`, two 64-unit ReLU Dense layers and a
    single-unit output layer, using mean absolute error as the loss and
    reporting 'mae' and 'mse' as metrics.
  """
  hidden_layers = [layers.Dense(64, activation='relu') for _ in range(2)]
  model = keras.Sequential([norm, *hidden_layers, layers.Dense(1)])

  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
      loss='mean_absolute_error',
      metrics=['mae', 'mse'])

  return model

In [None]:
# DNN on 'age' only; it reuses the normalization layer that was already
# adapted to the age values for the linear age model.
dnn_age_model = build_and_compile_model(age_lin_reg_normalizer)


In [None]:
# Print the layer-by-layer summary of the age-only DNN.
dnn_age_model.summary()


In [None]:
%%time
history = dnn_age_model.fit(
    train_features['age'],
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)


In [None]:
# Loss curves of the most recent fit (the age-only DNN).
plot_loss(history)

# Overwrites `y` with the DNN's predictions on the same age grid `x`.
y = dnn_age_model.predict(x)



In [None]:
# Overlay the age-only DNN predictions (the current `y`) on the training data.
plot_age(x, y)

In [None]:
# Test-set evaluation of the age-only DNN.
# NOTE: this model is compiled with metrics=['mae', 'mse'], so the stored
# result is whatever evaluate() returns for loss plus those two metrics,
# not necessarily a single number.
dnn_age_result = dnn_age_model.evaluate(
    x=test_features['age'],
    y=test_labels,
    verbose=0)
test_results['dnn_age_model'] = dnn_age_result


In [None]:
# DNN on all features, using the normalizer adapted to the full training
# feature matrix.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()


In [None]:
%%time
history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)


In [None]:
# Loss curves of the most recent fit (the all-feature DNN).
plot_loss(history)


In [None]:
# Test-set evaluation of the all-feature DNN.
# NOTE: this model is compiled with metrics=['mae', 'mse'], so the stored
# result is whatever evaluate() returns for loss plus those two metrics,
# not necessarily a single number.
dnn_result = dnn_model.evaluate(x=test_features, y=test_labels, verbose=0)
test_results['dnn_model'] = dnn_result


In [None]:
# evaluate() returns a bare loss for the models compiled without metrics, but
# a [loss, mae, mse] list for the models compiled with metrics=['mae', 'mse'].
# Passing that mix to the DataFrame constructor with a one-label index fails
# on mismatched lengths, so keep only the first entry of every result: the
# loss, which is the mean absolute error for all of these models.
mae_by_model = {
    name: float(np.atleast_1d(result)[0])
    for name, result in test_results.items()
}
pd.DataFrame(mae_by_model, index=['Mean absolute error [expenses]']).T

In [None]:
# Predicted vs. true values on the test set, flattened to one dimension.
test_predictions = dnn_model.predict(test_features).flatten()

# Equal aspect ratio plus identical limits on both axes, so the diagonal
# drawn last marks where prediction equals the true value.
a = plt.axes(aspect='equal')
a.scatter(test_labels, test_predictions)
a.set_xlabel('True Values [expenses]')
a.set_ylabel('Predictions [expenses]')
lims = [0, 70000]
a.set_xlim(lims)
a.set_ylim(lims)
_ = a.plot(lims, lims)


In [None]:
# Distribution of the signed prediction error (prediction minus true value).
error = test_predictions - test_labels
error_axes = plt.gca()
error_axes.hist(error, bins=25)
error_axes.set_xlabel('Prediction Error [expenses]')
_ = error_axes.set_ylabel('Count')


In [None]:
# Save the trained all-feature DNN to 'dnn_model.keras'.
dnn_model.save('dnn_model.keras')
# The reload-and-re-evaluate check below is currently disabled.
#reloaded = tf.keras.models.load_model('dnn_model.keras')

#test_results['reloaded'] = reloaded.evaluate(
#    test_features, test_labels, verbose=0)
#pd.DataFrame(test_results, index=['Mean absolute error [expenses]']).T

In [None]:
# The final test cell evaluates a variable named `model`; point it at the
# all-feature DNN.
model = dnn_model

In [None]:
# First rows of the test features (label column already removed).
test_features.head()

In [None]:
# First rows of the full test set, which still includes the 'expenses' column.
test_dataset.head()

In [None]:
# Evaluate on the training data. Three values are unpacked because the model
# is compiled with a loss plus the 'mae' and 'mse' metrics.
loss, mae, mse = dnn_model.evaluate(train_features, train_labels)

In [None]:
# Mean absolute error from the most recent evaluate() call.
print(mae)

In [None]:
# Evaluate on the held-out test data; overwrites loss/mae/mse from the
# training-set evaluation.
loss, mae, mse = dnn_model.evaluate(test_features, test_labels)

In [None]:
# Repeats the test-set evaluation from the previous cell.
loss, mae, mse = dnn_model.evaluate(test_features, test_labels)

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# Unpacking three values assumes `model` was compiled with exactly two metrics
# (MAE then MSE) in addition to its loss.
loss, mae, mse = model.evaluate(test_features, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
test_predictions = model.predict(test_features).flatten()

a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Diagonal reference line: a point on it has prediction equal to the true value.
_ = plt.plot(lims,lims)
