In [72]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

In [73]:
dt = pd.read_csv('../md_insure/insurance.csv')

In [74]:
dt.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [75]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [76]:
dt['bmi'].max()
dt['region'].head()

0    southwest
1    southeast
2    southeast
3    northwest
4    northwest
Name: region, dtype: object

In [77]:
dt.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [78]:
encode = LabelEncoder()
dt['sex'] = encode.fit_transform(dt['sex'])
dt['smoker'] = encode.fit_transform(dt['smoker'])
dt['region'] = encode.fit_transform(dt['region'])
dt

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [79]:
dt.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

#### short code

#### X = dt.drop(columns='charges', axis=1, inplace=True)
#### Y = dt['charges']

In [80]:
dt.columns.to_list()

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [81]:
X = dt[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
Y = dt['charges']

In [82]:
print(X)

      age  sex     bmi  children  smoker  region
0      19    0  27.900         0       1       3
1      18    1  33.770         1       0       2
2      28    1  33.000         3       0       2
3      33    1  22.705         0       0       1
4      32    1  28.880         0       0       1
...   ...  ...     ...       ...     ...     ...
1333   50    1  30.970         3       0       1
1334   18    0  31.920         0       0       0
1335   18    0  36.850         0       0       2
1336   21    0  25.800         0       0       3
1337   61    0  29.070         0       1       1

[1338 rows x 6 columns]


In [83]:
print(Y)

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


In [84]:
xtrain, xtest, ytrain, ytest = train_test_split(X,Y, test_size=0.2, random_state=2)

In [85]:
random = RandomForestRegressor(n_estimators= 500, n_jobs= 4)
random.fit(xtrain, ytrain)
random.score(X,Y) * 100

94.86686623992018

In [86]:
predict = random.predict(X)

In [87]:
input_data = (31,1,25.74,0,1,0)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = random.predict(input_data_reshaped)
print(prediction)
print('The insurance cost is USD ', prediction[0])

[18709.43115764]
The insurance cost is USD  18709.43115763997


In [88]:
def save_model(obj,path_to_save):
    with open(path_to_save, 'wb') as f:
        pickle.dump(obj, f)
    return True

In [89]:
save_model(random,'../md_insure/random.pk')

True

In [90]:
def load_model(path_to_model):
    with open(path_to_model, 'rb') as f:
        return pickle.load(f)