In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
# Read the insurance dataset from a CSV file given by a relative path.
csv_path = 'insurance.csv'
data = pd.read_csv(csv_path)

In [41]:
# Preview the first five rows to confirm the CSV parsed into the expected columns.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [42]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(1338, 7)

In [43]:
# Count missing values per column.
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [44]:
# Count rows that exactly repeat an earlier row (all columns equal).
data.duplicated().sum()

1

In [45]:
# Remove repeated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (the index is not reset).
data = data.drop_duplicates(keep='first')

In [46]:
# Preview the frame again after dropping duplicated rows.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [47]:
# Confirm that no duplicated rows remain after drop_duplicates().
data.duplicated().sum()

0

In [48]:
# Column dtypes and non-null counts; sex, smoker and region are still object (string) columns here.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   expenses  1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [49]:
# Summary statistics for the numeric columns (age, bmi, children, expenses).
data.describe()

Unnamed: 0,age,bmi,children,expenses
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.66552,1.095737,13279.121638
std,14.044333,6.100664,1.205571,12110.359657
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4746.34
50%,39.0,30.4,1.0,9386.16
75%,51.0,34.7,2.0,16657.72
max,64.0,53.1,5.0,63770.43


In [50]:
# Show the first three rows to see the raw categorical labels.
data.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46


In [51]:
# Row count per region; lists every distinct region label present in the data.
data['region'].value_counts()

region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64

In [52]:
# Encode the categorical columns as integer codes so scikit-learn estimators can consume them.
# NOTE(review): the region codes are arbitrary labels, not a ranking; a linear model will
# still treat them as ordered magnitudes -- consider one-hot encoding (e.g. pd.get_dummies).
category_codes = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'southeast': 1, 'southwest': 2, 'northwest': 3, 'northeast': 4},
}
for column, codes in category_codes.items():
    # Skip columns that are already numeric: pushing the integer codes through the label
    # dict a second time would turn every value into NaN, so re-running this cell is safe.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    encoded = data[column].map(codes)
    # Series.map returns NaN for labels missing from the dict; fail loudly here instead of
    # letting NaNs reach model training.
    if encoded.isna().any():
        unexpected = sorted(data.loc[encoded.isna(), column].astype(str).unique())
        raise ValueError(f"Unmapped values in column {column!r}: {unexpected}")
    data[column] = encoded.astype('int64')

In [53]:
# Check the encoded frame: sex, smoker and region should now hold integer codes.
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,2,16884.92
1,18,1,33.8,1,0,1,1725.55
2,28,1,33.0,3,0,1,4449.46
3,33,1,22.7,0,0,3,21984.47
4,32,1,28.9,0,0,3,3866.86


In [70]:
# Re-check dtypes after encoding; every column should now be numeric.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   int64  
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   int64  
 5   region    1337 non-null   int64  
 6   expenses  1337 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 83.6 KB


In [72]:
# Series.map yields NaN for any label missing from its mapping dict,
# so re-check for nulls after encoding.
data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

In [74]:
# Re-check for duplicated rows after encoding.
data.duplicated().sum()

0

# Divide data into features (x) and target (y)

In [77]:
# Separate the prediction target from the feature columns.
target_column = 'expenses'
y = data[target_column]
x = data.drop(columns=[target_column])

In [79]:
# Preview the feature matrix (every column except expenses).
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,2
1,18,1,33.8,1,0,1
2,28,1,33.0,3,0,1
3,33,1,22.7,0,0,3
4,32,1,28.9,0,0,3


In [81]:
# Preview the target series (expenses).
y.head()

0    16884.92
1     1725.55
2     4449.46
3    21984.47
4     3866.86
Name: expenses, dtype: float64

# Train-test split

In [84]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [86]:
# Preview the training features; index labels are the original row labels, in shuffled order.
x_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
1114,23,1,24.5,0,0,4
968,21,1,25.7,2,0,4
599,52,0,37.5,2,0,3
170,63,1,41.5,0,0,1
275,47,0,26.6,2,0,4


In [88]:
# Preview the held-out test features.
x_test.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
900,49,1,22.5,0,0,4
1064,29,0,25.6,4,0,2
1256,51,0,36.4,3,0,3
298,31,1,34.4,3,1,3
237,31,1,38.4,2,0,1


# Model Training

In [93]:
from sklearn.linear_model import LinearRegression

# Baseline model: fit a linear regression on the training split (fit() returns the
# estimator itself), then predict expenses for the held-out rows.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)

In [99]:
from sklearn.metrics import r2_score

# R^2 of the current y_pred against the held-out test targets.
score = r2_score(y_true=y_test, y_pred=y_pred)
score

0.8068735177088385

In [108]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model, its test score and the saved artifact are
# reproducible across kernel restarts; 42 matches the seed used for the train/test split.
m2 = RandomForestRegressor(random_state=42)

In [110]:
# Train the random forest on the training split.
m2.fit(X=x_train, y=y_train)

In [112]:
# Predict on the held-out rows; this rebinds y_pred, replacing the linear-regression predictions.
y_pred = m2.predict(X=x_test)

In [114]:
from sklearn.metrics import r2_score

# R^2 of the current y_pred against the held-out test targets; rebinds `score`.
score = r2_score(y_true=y_test, y_pred=y_pred)
score

0.8824374226149037

# Save the Model

In [117]:
import joblib

# Persist the fitted random forest with joblib. The target is a relative path with
# no file extension; the name is kept as-is in case other code loads it by this name.
model_path = 'Random Forest'
joblib.dump(m2, model_path)

['Random Forest']