# Model to predict Expenses Using Demographic and Health Data"

In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

## Reading in the Dataset
Using the pandas library to read in the dataset.

In [3]:
df = pd.read_csv("./insurance.csv")

In [4]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Picking X and Y values

In [6]:
x = df.drop(columns=["expenses"])
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest
1334,18,female,31.9,0,no,northeast
1335,18,female,36.9,0,no,southeast
1336,21,female,25.8,0,no,southwest


### Changing String Values to Numerical values

In [7]:
x['sex'] = df['sex'].map({'male': 0, 'female': 1})
x['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})
x['region'] = df['region'].map({'northeast': 0, 'northwest': 1, 'southeast': 3, 'southwest':4})
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,1,4
1,18,0,33.8,1,0,3
2,28,0,33.0,3,0,3
3,33,0,22.7,0,0,1
4,32,0,28.9,0,0,1
...,...,...,...,...,...,...
1333,50,0,31.0,3,0,1
1334,18,1,31.9,0,0,0
1335,18,1,36.9,0,0,3
1336,21,1,25.8,0,0,4


In [8]:
y = df["expenses"]
y

0       16884.92
1        1725.55
2        4449.46
3       21984.47
4        3866.86
          ...   
1333    10600.55
1334     2205.98
1335     1629.83
1336     2007.95
1337    29141.36
Name: expenses, Length: 1338, dtype: float64

### Normalizing the Y values


### Function for Normalization

In [9]:
def normalize(values, min_value, max_value):
    values = np.array(values)
    normalized_values = (values - min_value) / (max_value - min_value)
    return normalized_values

### Function for Un-Normalization

In [10]:
def unnormalize(normalized_values, min_value, max_value):
    normalized_values = np.array(normalized_values)
    unnormalized_values = normalized_values * (max_value - min_value) + min_value
    return unnormalized_values

In [11]:
y_normalized = normalize(y, y.min(), y.max())

In [12]:
y_normalized

array([0.25161073, 0.00963598, 0.05311519, ..., 0.00810809, 0.01414366,
       0.44724875])

In [13]:
x_train = np.array(x)
x_train

array([[19. ,  1. , 27.9,  0. ,  1. ,  4. ],
       [18. ,  0. , 33.8,  1. ,  0. ,  3. ],
       [28. ,  0. , 33. ,  3. ,  0. ,  3. ],
       ...,
       [18. ,  1. , 36.9,  0. ,  0. ,  3. ],
       [21. ,  1. , 25.8,  0. ,  0. ,  4. ],
       [61. ,  1. , 29.1,  0. ,  1. ,  1. ]])

In [14]:
x_train.shape

(1338, 6)