In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score , mean_squared_error

In [2]:
# Colab-specific step: mount Google Drive at /content/drive so the dataset
# can be read through an ordinary file path.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Read the insurance CSV from the mounted Drive folder into a DataFrame.
path = "/content/drive/MyDrive/Data/insurance.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Display the loaded frame; the notebook repr elides the middle rows.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
# Count the missing values in each column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Boolean form of the same check: True for any column that contains a missing value.
df.isna().any()

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool

In [7]:
# Remove the 'region' column so it is excluded from the features built below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['region'], errors='ignore')

In [8]:
# Summarise column dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB


In [9]:
# List the column labels currently present in the frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [10]:
# Partition the column labels by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
cat_cols = [col for col in df.columns if df[col].dtype == "object"]
num_cols = [col for col in df.columns if not (df[col].dtype == "object")]
cat_cols, num_cols

(['sex', 'smoker'], ['age', 'bmi', 'children', 'charges'])

In [12]:
# Print the distinct labels of each categorical column.
for col in cat_cols:
  labels = df[col].unique()
  print(col, labels)

sex ['female' 'male']
smoker ['yes' 'no']


In [14]:
# Min-max scale every column listed in `num_cols`, one column at a time.
scaler = MinMaxScaler()

# NOTE(review): this binds a second name to the same DataFrame object, not a
# copy, so the column assignments below also overwrite the values held by `df`.
scaled_df = df
for col in num_cols:
  # The scaler is re-fitted on each column in turn, so after the loop it only
  # holds the parameters of the last column in `num_cols`.
  scaled_df[col] = scaler.fit_transform(df[[col]])
# NOTE(review): `num_cols` is every non-object column, so the 'charges' target
# is scaled as well and any later error metric is in scaled units — confirm
# this is intended.
scaled_df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,0.021739,female,0.321227,0.0,yes,0.251611
1,0.000000,male,0.479150,0.2,no,0.009636
2,0.217391,male,0.458434,0.6,no,0.053115
3,0.326087,male,0.181464,0.0,no,0.333010
4,0.304348,male,0.347592,0.0,no,0.043816
...,...,...,...,...,...,...
1333,0.695652,male,0.403820,0.6,no,0.151299
1334,0.000000,female,0.429379,0.0,no,0.017305
1335,0.000000,female,0.562012,0.0,no,0.008108
1336,0.065217,female,0.264730,0.0,no,0.014144


In [15]:
# Row and column count of the scaled frame.
scaled_df.shape

(1338, 6)

In [16]:
# Encode the two binary categorical columns as integers for the regression.
# The write goes through `scaled_df` because that is the frame the modelling
# cells read from, so the encoding does not depend on `df` and `scaled_df`
# happening to be the same object.
binary_codes = {
  'sex': {'male':0,'female':1},
  'smoker': {'yes':1,'no':0},
}
for col, codes in binary_codes.items():
  # Re-running the cell must not push 0/1 back through the mapping (that would
  # turn every value into NaN), so skip columns that are already encoded.
  if scaled_df[col].isin(list(codes.values())).all():
    continue
  encoded = scaled_df[col].map(codes)
  # Fail loudly on an unexpected label instead of handing NaN to the model.
  if encoded.isna().any():
    unknown = scaled_df.loc[encoded.isna(), col].unique().tolist()
    raise ValueError(f"Unmapped values in column {col!r}: {unknown}")
  scaled_df[col] = encoded.astype('int64')

In [17]:
# Re-display the frame to confirm 'sex' and 'smoker' now hold numeric codes.
scaled_df

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,0.021739,1,0.321227,0.0,1,0.251611
1,0.000000,0,0.479150,0.2,0,0.009636
2,0.217391,0,0.458434,0.6,0,0.053115
3,0.326087,0,0.181464,0.0,0,0.333010
4,0.304348,0,0.347592,0.0,0,0.043816
...,...,...,...,...,...,...
1333,0.695652,0,0.403820,0.6,0,0.151299
1334,0.000000,1,0.429379,0.0,0,0.017305
1335,0.000000,1,0.562012,0.0,0,0.008108
1336,0.065217,1,0.264730,0.0,0,0.014144


In [None]:
# Separate the 'charges' target from the predictor columns.
y = scaled_df['charges']
x = scaled_df.drop(columns="charges")

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.25,
  random_state=42,
)
# Show the shape of each of the four pieces as one tuple.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((1003, 5), (335, 5), (1003,), (335,))

In [None]:
# Fit a linear regression model on the training split.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

In [None]:
# Model score on the held-out split, displayed as a percentage.
score = lr.score(X=x_test, y=y_test)
100 * score

76.52077247609824

In [None]:
# Predict the target for the held-out rows and check the output length.
y_pred = lr.predict(X=x_test)
y_pred.shape

(335,)

In [None]:
# Mean squared error and R² of the predictions against the held-out targets.
# Both are computed on the frame's current 'charges' values.
mse, r2 = (
  mean_squared_error(y_true=y_test, y_pred=y_pred),
  r2_score(y_true=y_test, y_pred=y_pred),
)
mse, r2

(0.009026625793793704, 0.7652077247609824)