In [None]:
# Import libraries. You may or may not use all of these.

#!pip install -q git+https://github.com/tensorflow/docs

!pip install tensorflow==2.19
!pip install scikit-learn
!pip install -q seaborn

# Restart the runtime after this

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from six.moves import urllib
import seaborn as sns


np.set_printoptions(precision=3, suppress=True)



try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
from sklearn.model_selection import train_test_split


print(tf.__version__)
print(dir(tf))  # Optional: to check if 'estimator' is listed


In [None]:
# Import data
!wget https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
dataset = pd.read_csv('insurance.csv')
dataset.tail()

In [None]:
# Using 80% as the training dataset & 20% as the testing.
# A fixed random_state makes the split (and every number derived from it)
# reproducible across kernel restarts.
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(train_dataset.shape)
print(test_dataset.shape)

# Separate the regression target from the features; pop() removes the
# "expenses" column from each frame and returns it.
train_labels = train_dataset.pop("expenses")
test_labels = test_dataset.pop("expenses")
print(train_labels.shape)
print(test_labels.shape)


In [None]:
# Understanding the data
# Left as the cell's last expression so the notebook renders the summary as a
# table instead of the plain-text dump print() produces.
train_dataset.describe()

In [None]:
# Histogram of the `age` column in the training split.
# No print(): it only adds the Axes repr as text; the trailing `;` hides it too.
ax = train_dataset.age.hist()
ax.set(title="age (training split)", xlabel="age", ylabel="rows");

In [None]:
# Histogram of the `bmi` column in the training split.
# No print(): it only adds the Axes repr as text; the trailing `;` hides it too.
ax = train_dataset.bmi.hist()
ax.set(title="bmi (training split)", xlabel="bmi", ylabel="rows");

In [None]:
# Histogram (20 bins) of the `children` column in the training split.
# No print(): it only adds the Axes repr as text; the trailing `;` hides it too.
ax = train_dataset.children.hist(bins=20)
ax.set(title="children (training split)", xlabel="children", ylabel="rows");

In [None]:
# Row counts per `sex` value in the training split.
# No print(): it only adds the Axes repr as text; the trailing `;` hides it too.
ax = train_dataset.sex.value_counts().plot(kind="barh")
ax.set(title="sex (training split)", xlabel="rows");

In [None]:
# Row counts per `smoker` value in the training split.
# No print(): it only adds the Axes repr as text; the trailing `;` hides it too.
ax = train_dataset.smoker.value_counts().plot(kind="barh")
ax.set(title="smoker (training split)", xlabel="rows");

In [None]:
# Row counts per `region` value in the training split.
# No print(): it only adds the Axes repr as text; the trailing `;` hides it too.
ax = train_dataset.region.value_counts().plot(kind="barh")
ax.set(title="region (training split)", xlabel="rows");

In [None]:
# Describe every input column through the feature-column API.
# NOTE(review): tf.feature_column is a deprecated API — confirm it is still
# available in the TensorFlow version this notebook installs.
categorial_columns = ["sex", "smoker", "region"]
numeric_columns = ["age", "bmi", "children"]

# Categorical inputs: the vocabulary is whatever distinct values occur in the
# training split for that column.
feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        name, train_dataset[name].unique()
    )
    for name in categorial_columns
]

# Numeric inputs are declared as float32, appended after the categorical ones.
feature_columns += [
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in numeric_columns
]

print(feature_columns)



In [None]:
#input function
def make_if(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
  """Build a zero-argument input function yielding batched (features, labels).

  Args:
    data_df: feature table; ``dict(data_df)`` must map each column name to
      that column's values.
    label_df: labels aligned row-for-row with ``data_df``.
    epochs: how many times the returned dataset repeats the batched data.
    shuffle: whether rows are shuffled before batching.
    batch_size: number of rows per batch.

  Returns:
    A callable taking no arguments that builds and returns the
    ``tf.data.Dataset``; nothing touches TensorFlow until it is called.
  """
  def input_f():
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      # The shuffle buffer has to hold every row for a uniform shuffle; a
      # fixed size would only partially shuffle inputs larger than it.
      # max(..., 1) keeps the buffer size valid for empty input.
      ds = ds.shuffle(max(len(label_df), 1))
    ds = ds.batch(batch_size).repeat(epochs)
    return ds
  return input_f

In [None]:
# Training stream: relies on make_if's defaults (10 epochs, shuffled, batches of 32).
train_input_fn = make_if(data_df=train_dataset, label_df=train_labels)
# Evaluation stream: one unshuffled pass over the test split.
eval_input_fn = make_if(
    data_df=test_dataset,
    label_df=test_labels,
    epochs=1,
    shuffle=False,
)

In [None]:
# Pull a single batch of 10 rows to sanity-check what the input function yields.
ds = make_if(train_dataset, train_labels, batch_size=10)()
for features, labels in ds.take(1):
    # end='\n\n' reproduces the blank separator line between the sections.
    print('Some feature keys:', list(features.keys()), end='\n\n')
    print('A batch of features for "age":', features['age'].numpy(), end='\n\n')
    print('A batch of Labels (class):', labels.numpy())

In [None]:
lin_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)

lin_est.train(train_input_fn)

result = lin_est.evaluate(eval_input_fn)

clear_output()  # clears consoke output
print(result)

In [None]:
pred_dicts = list(lin_est.predict(eval_input_fn))
print(pred_dicts)

In [None]:
probs = pd.Series([pred['predictions'][0] for pred in pred_dicts])

probs.plot(kind='hist', bins=20, title='Prediction Values')

In [None]:
model

In [None]:
# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
#
# Preconditions: `model`, `test_dataset` and `test_labels` must already exist
# in the kernel when this cell runs.
# evaluate() is unpacked into exactly three values; presumably the model is
# expected to be compiled with MAE and MSE as its two metrics, in that order,
# since the values are bound to loss, mae, mse — confirm against the model cell.
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass/fail rule: the mean absolute error must be strictly below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
# flatten() collapses the prediction output to 1-D so it can be scattered
# against test_labels.
test_predictions = model.predict(test_dataset).flatten()

# Equal aspect ratio and identical x/y limits, with a line drawn from
# (0, 0) to (50000, 50000): points on that line are exact predictions.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims,lims)
