In [1]:
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [52]:
# put the dateset in a dataframe called df
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [53]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# use lambda function to assign numbers to data
df['smoker_num'] = df.smoker.apply(lambda x: 1 if x=='yes' else 0)

In [55]:
dummies = pd.get_dummies(df.smoker)
dummies.head()

Unnamed: 0,no,yes
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0


In [56]:
# concatenate the new dummy column to the dataframe
df = pd.concat([df, dummies], axis='columns')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,no,yes
0,19,female,27.9,0,yes,southwest,16884.924,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0


In [57]:
# convert region data to numeric data one-hot-encode
dummies = pd.get_dummies(df.sex)
dummies.head()

Unnamed: 0,female,male
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1


In [58]:
# concatenate the new dummy column to the dataframe
df = pd.concat([df, dummies], axis='columns')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,no,yes,female,male
0,19,female,27.9,0,yes,southwest,16884.924,0,1,1,0
1,18,male,33.77,1,no,southeast,1725.5523,1,0,0,1
2,28,male,33.0,3,no,southeast,4449.462,1,0,0,1
3,33,male,22.705,0,no,northwest,21984.47061,1,0,0,1
4,32,male,28.88,0,no,northwest,3866.8552,1,0,0,1


In [59]:
# convert region data to numeric data one-hot-encode
dummies = pd.get_dummies(df.region)
dummies.head()

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [60]:
# concatenate the new dummy column to the dataframe
df = pd.concat([df, dummies], axis='columns')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,no,yes,female,male,northeast,northwest,southeast,southwest
0,19,female,27.9,0,yes,southwest,16884.924,0,1,1,0,0,0,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0,0,1,0,0,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0,0,1,0,0,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,0,1,0,1,0,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0,0,1,0,1,0,0


# using the map method instead of get_dummies to reduce data
# southwest=0, southeast=1, northwest=2, northeast=3
df['region_num'] = df.region.map({
   'southwest':0,
    'southeast':1,
    'northwest':2,
    'northeast':3,
})

In [61]:
# remove duplicate data
df[df.duplicated(keep=False)]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,no,yes,female,male,northeast,northwest,southeast,southwest
195,19,male,30.59,0,no,northwest,1639.5631,1,0,0,1,0,1,0,0
581,19,male,30.59,0,no,northwest,1639.5631,1,0,0,1,0,1,0,0


In [62]:
df.drop_duplicates()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,no,yes,female,male,northeast,northwest,southeast,southwest
0,19,female,27.900,0,yes,southwest,16884.92400,0,1,1,0,0,0,0,1
1,18,male,33.770,1,no,southeast,1725.55230,1,0,0,1,0,0,1,0
2,28,male,33.000,3,no,southeast,4449.46200,1,0,0,1,0,0,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,0,1,0,1,0,0
4,32,male,28.880,0,no,northwest,3866.85520,1,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,1,0,0,1,0,1,0,0
1334,18,female,31.920,0,no,northeast,2205.98080,1,0,1,0,1,0,0,0
1335,18,female,36.850,0,no,southeast,1629.83350,1,0,1,0,0,0,1,0
1336,21,female,25.800,0,no,southwest,2007.94500,1,0,1,0,0,0,0,1


In [68]:
# assign the dependent and independent data
X = df.drop(['smoker', 'sex', 'charges','region'], axis='columns')
y = df['charges']
X.head()

Unnamed: 0,age,bmi,children,no,yes,female,male,northeast,northwest,southeast,southwest
0,19,27.9,0,0,1,1,0,0,0,0,1
1,18,33.77,1,1,0,0,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0,0,1,0
3,33,22.705,0,1,0,0,1,0,1,0,0
4,32,28.88,0,1,0,0,1,0,1,0,0


In [69]:
# split the dataset for trainint
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

In [70]:
# Feature Engineering
from sklearn.preprocessing import StandardScaler
# Feature scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [71]:
# training using random forest regressor
from sklearn.ensemble import RandomForestRegressor
cls = RandomForestRegressor()

In [72]:
cls.fit(X_train, y_train)

In [73]:
# check accuracy of model
from sklearn.metrics import classification_report
cls.score(X_test, y_test)

0.8605559729227603

In [74]:
X.columns

Index(['age', 'bmi', 'children', 'no', 'yes', 'female', 'male', 'northeast',
       'northwest', 'southeast', 'southwest'],
      dtype='object')

In [75]:
import pickle

In [76]:
# Make pickle file of our model
pickle.dump(cls, open('./model.pkl', 'wb'))

In [77]:
import json
columns = {
    'data_columns': [col.lower() for col in X.columns]
}
with open("columns.json", "w") as f:
    f.write(json.dumps(columns))

In [78]:
X.columns

Index(['age', 'bmi', 'children', 'no', 'yes', 'female', 'male', 'northeast',
       'northwest', 'southeast', 'southwest'],
      dtype='object')

In [81]:
np.where(X.columns=='male')[0][0]

6

In [82]:
def insurance_premium(region,age,bmi,children,sex,smoker):
    region_index = np.where(X.columns==region)[0][0]
    sex_index = np.where(X.columns==region)[0][0]
    smoker_index = np.where(X.columns==region)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = age
    x[1] = bmi
    x[2] = children
    if region_index >= 0:
        x[region_index] = 1
    if sex_index >= 0:
        x[sex_index] = 1
    if smoker_index >= 0:
        x[smoker_index] >= 1

    return cls.predict([x])[0]


In [86]:
insurance_premium('northwest',19,28,30,'male','yes')

17790.306528999987

In [89]:
X.columns[3:5]

Index(['no', 'yes'], dtype='object')

In [90]:
X.columns[5:7]

Index(['female', 'male'], dtype='object')

In [91]:
X.columns[7:]

Index(['northeast', 'northwest', 'southeast', 'southwest'], dtype='object')