In [1]:
#importing the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#loading the dataset
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
df.count()

age         1338
sex         1338
bmi         1338
children    1338
smoker      1338
region      1338
charges     1338
dtype: int64

In [8]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [11]:
df.nunique().values

array([  47,    2,  548,    6,    2,    4, 1337], dtype=int64)

In [13]:
#Data preprocessing
df['sex'] = df['sex'].map({'male':0, 'female':1})

In [15]:
df['smoker'] = df['smoker'].map({'no' : 0, 'yes':1})

In [16]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,0,33.77,1,0,southeast,1725.5523
2,28,0,33.0,3,0,southeast,4449.462
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.88,0,0,northwest,3866.8552


In [18]:
df.dtypes

age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region       object
charges     float64
dtype: object

In [19]:
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [20]:
#using label encoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['region'] = le.fit_transform(df['region'])

In [21]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,3,16884.924
1,18,0,33.77,1,0,2,1725.5523
2,28,0,33.0,3,0,2,4449.462
3,33,0,22.705,0,0,1,21984.47061
4,32,0,28.88,0,0,1,3866.8552


In [22]:
df['region'].value_counts()

region
2    364
3    325
1    325
0    324
Name: count, dtype: int64

In [23]:
df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
age,1.0,0.020856,0.109272,0.042469,-0.025019,0.002127,0.299008
sex,0.020856,1.0,-0.046371,-0.017163,-0.076185,-0.004588,-0.057292
bmi,0.109272,-0.046371,1.0,0.012759,0.00375,0.157566,0.198341
children,0.042469,-0.017163,0.012759,1.0,0.007673,0.016569,0.067998
smoker,-0.025019,-0.076185,0.00375,0.007673,1.0,-0.002181,0.787251
region,0.002127,-0.004588,0.157566,0.016569,-0.002181,1.0,-0.006208
charges,0.299008,-0.057292,0.198341,0.067998,0.787251,-0.006208,1.0


In [24]:
#determining X and y
X= df[['age', 'sex', 'bmi', 'smoker']]
y = df['charges']

In [26]:
#splitting the data into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [27]:
X_train

Unnamed: 0,age,sex,bmi,smoker
216,53,1,26.600,0
731,53,0,21.400,0
866,18,0,37.290,0
202,60,1,24.035,0
820,45,0,33.700,0
...,...,...,...,...
715,60,0,28.900,0
905,26,1,29.355,0
1096,51,1,34.960,1
235,40,1,22.220,1


In [28]:
X_test

Unnamed: 0,age,sex,bmi,smoker
559,19,0,35.530,0
1087,57,0,31.540,0
1020,51,0,37.000,0
460,49,1,36.630,0
802,21,0,22.300,0
...,...,...,...,...
682,39,0,35.300,1
629,44,1,38.950,1
893,47,0,38.940,1
807,19,1,36.575,0


In [30]:
#building the model - using linear regression
from sklearn.linear_model import LinearRegression


In [34]:
model = LinearRegression()
model.fit(X_train, y_train)

In [35]:
#make predictions
y_predicted = model.predict(X_test)

In [37]:
#evaluations
from sklearn.metrics import r2_score
r2_score(y_test, y_predicted)

0.7565120021066973

In [38]:
y_predicted

array([ 4543.00133231, 13169.66650114, 13275.94567294, 12855.44839   ,
        1038.78328448, 31046.3467424 , 12913.11465154, 12225.27389237,
        3726.38626254, 30380.77937791, 12025.21546224, 17211.55673678,
        8751.97693332,  8723.25467461,  3949.64781148, 10431.83858983,
        4269.17546832,  6515.46509509, 15411.39592854, 15178.88850059,
       12394.83472948, 32413.9752163 ,  9375.95352743,  9855.26854017,
        2847.69589328,  8073.56243097,  8273.74634136, 11593.26562764,
        7578.45641132,  4329.91538651, 14415.68965839,  5833.85145583,
       33371.5914037 , 27188.16711139, 33061.8999682 ,  9938.9349355 ,
       31064.0599377 , 25815.08223773, 15921.61065624, 33697.22500753,
        6314.16324018, 14295.04328228, 10439.07032392, 15377.85820259,
        4233.25499926, 13051.16405754,  4735.05730822, 29445.65402196,
        7724.75923208, 12541.97531784, 14214.54250013, 12302.26967953,
        2227.5379948 ,  8496.8275125 , 25542.92760161, 10760.94797224,
      

In [39]:
model.score(X_train, y_train)

0.7450135987062217

In [40]:
#co-efficients
model.coef_

array([  258.93890006,   209.86543507,   304.01329161, 23742.65376178])