# Loading

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from pickle import dump

In [2]:
# Load the raw insurance records from a CSV file located relative to the working directory.
df = pd.read_csv('insurance.csv')

In [3]:
# Preview the first five rows to check the columns and value formats that were loaded.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Processing

In [4]:
# Rename `sex` to `female` so the column name matches the binary encoding applied
# to it afterwards (1 = female, 0 = male).
# Rebinding `df` to the renamed frame (instead of `inplace=True`) keeps the cell free of
# hidden in-place mutation; re-running it is harmless because a missing `sex` column is ignored.
df = df.rename(columns={'sex': 'female'})
df.head()

Unnamed: 0,age,female,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Encode the sex labels as a binary indicator: female -> 1, male -> 0.
# The result is assigned back to the frame rather than calling an inplace method on
# `df.female`: that chained form operates on an intermediate Series, triggers a
# chained-assignment warning in recent pandas, and no longer updates `df` once
# Copy-on-Write is enabled.
# Values that are not in the mapping are left unchanged, so re-running the cell is a no-op.
df['female'] = df['female'].replace({'female': 1, 'male': 0})
df.head()

Unnamed: 0,age,female,bmi,children,smoker,region,charges
0,19,1,27.9,0,yes,southwest,16884.924
1,18,0,33.77,1,no,southeast,1725.5523
2,28,0,33.0,3,no,southeast,4449.462
3,33,0,22.705,0,no,northwest,21984.47061
4,32,0,28.88,0,no,northwest,3866.8552


In [6]:
# Encode the smoker flag as a binary indicator: yes -> 1, no -> 0.
# The result is assigned back to the frame rather than calling an inplace method on
# `df.smoker`: that chained form operates on an intermediate Series, triggers a
# chained-assignment warning in recent pandas, and no longer updates `df` once
# Copy-on-Write is enabled.
# Values that are not in the mapping are left unchanged, so re-running the cell is a no-op.
df['smoker'] = df['smoker'].replace({'yes': 1, 'no': 0})
df.head()

Unnamed: 0,age,female,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,0,33.77,1,0,southeast,1725.5523
2,28,0,33.0,3,0,southeast,4449.462
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.88,0,0,northwest,3866.8552


In [7]:
# List the distinct region labels (in order of first appearance) before encoding them.
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [8]:
# Integer-encode the region labels. Codes follow the order in which each label first
# appears in the data, so they depend on the row order of the input file; anything that
# consumes the trained models must use the same label -> code mapping.
regions = df['region'].unique().tolist()
region_codes = {label: code for code, label in enumerate(regions)}

# Assign the encoded column back to the frame instead of calling an inplace method on
# `df.region`; the chained inplace form does not update `df` under pandas Copy-on-Write.
df['region'] = df['region'].replace(region_codes)
df.head()

Unnamed: 0,age,female,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,0,16884.924
1,18,0,33.77,1,0,1,1725.5523
2,28,0,33.0,3,0,1,4449.462
3,33,0,22.705,0,0,2,21984.47061
4,32,0,28.88,0,0,2,3866.8552


# Training

We need two models:
1. A classification model that predicts `region` from age, sex (encoded as `female`), bmi, children, and smoker.
2. A regression model that predicts `charges` from age, sex (encoded as `female`), bmi, children, smoker, and region.

### Model 1 (Region)

In [9]:
# Feature matrix for the region classifier: every column except the target (`region`)
# and `charges`.
region_excluded = ['region', 'charges']
x_region = df.drop(columns=region_excluded)
x_region.head()

Unnamed: 0,age,female,bmi,children,smoker
0,19,1,27.9,0,1
1,18,0,33.77,1,0
2,28,0,33.0,3,0
3,33,0,22.705,0,0
4,32,0,28.88,0,0


In [10]:
# Target vector for the region classifier: the encoded `region` column.
y_region = df['region']
y_region

0       0
1       1
2       1
3       2
4       2
       ..
1333    2
1334    3
1335    1
1336    0
1337    2
Name: region, Length: 1338, dtype: int64

In [11]:
# Multiclass logistic regression for predicting the region code.
# `multi_class='multinomial'` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and the default solver already fits a multinomial model when the
# target has more than two classes, so the fitted model is unchanged.
# `max_iter` is raised above the default to give the solver more iterations to converge.
model_region = LogisticRegression(max_iter=1000)

In [12]:
# Train the region classifier on the underlying arrays of the feature frame and target series.
model_region.fit(X=x_region.values, y=y_region.values)

LogisticRegression(max_iter=1000, multi_class='multinomial')

In [13]:
# Spot-check the classifier on a few hand-written rows.
# Column order matches `x_region`: age, female, bmi, children, smoker.
region_samples = [
    [19, 1, 27.9, 0, 1],
    [18, 0, 33.770, 1, 0],
    [32, 0, 28.880, 0, 0],
]
model_region.predict(region_samples)

array([1, 1, 3], dtype=int64)

In [14]:
# Persist the trained region classifier with pickle.
# The context manager guarantees the file is flushed and closed even if pickling fails;
# passing a bare `open(...)` to `dump` leaves the handle open.
with open('model_region.pkl', 'wb') as model_file:
    dump(model_region, model_file)

### Model 2 (Charges)

In [15]:
# Feature matrix for the charges regressor: every column except the target (`charges`).
charges_excluded = ['charges']
x_charges = df.drop(columns=charges_excluded)
x_charges.head()

Unnamed: 0,age,female,bmi,children,smoker,region
0,19,1,27.9,0,1,0
1,18,0,33.77,1,0,1
2,28,0,33.0,3,0,1
3,33,0,22.705,0,0,2
4,32,0,28.88,0,0,2


In [16]:
# Target vector for the charges regressor: the `charges` column.
y_charges = df['charges']
y_charges

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [17]:
# Linear regression model (default settings) used to predict `charges`.
model_charges = LinearRegression()

In [18]:
# Train the charges regressor on the underlying arrays of the feature frame and target series.
model_charges.fit(X=x_charges.values, y=y_charges.values)

LinearRegression()

In [19]:
# Spot-check the regressor on a few hand-written rows.
# Column order matches `x_charges`: age, female, bmi, children, smoker, region.
charges_samples = [
    [20, 0, 28.0, 0, 0, 2],
    [20, 0, 20.0, 0, 0, 2],
    [19, 1, 27.9, 0, 1, 0],
    [20, 0, 20.9, 0, 0, 2],
]
model_charges.predict(charges_samples)

array([ 2157.52228253,  -503.03877541, 25111.24186363,  -203.72565639])

In [20]:
# Persist the trained charges regressor with pickle.
# The context manager guarantees the file is flushed and closed even if pickling fails;
# passing a bare `open(...)` to `dump` leaves the handle open.
with open('model_charges.pkl', 'wb') as model_file:
    dump(model_charges, model_file)