In [56]:
import pandas as pd 
import numpy as np


In [57]:
df = pd.read_csv("insurance.csv")

In [58]:
# checking for null values 

In [59]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [60]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [61]:
# no missing values 
# no need for Simple Imputer

In [62]:
x = df.drop(columns=["sex"])
y = df["sex"]

In [63]:
# Label Encoding:
from sklearn.preprocessing import LabelEncoder

In [64]:
le = LabelEncoder()
df["sex"] = le.fit_transform(df["sex"])
df["smoker"] = le.fit_transform(df["smoker"])
df["region"] = le.fit_transform(df["region"])

In [65]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [66]:
# ordinal encoding

In [67]:
df = pd.read_csv("insurance.csv")

In [68]:
# there are no missing values in the data

In [69]:
from sklearn.preprocessing import OrdinalEncoder

In [70]:
oe = OrdinalEncoder(categories = [["male" , "female"] , ["no" , "yes"] , ["northeast" ,"southwest" , "southeast" , "northwest"]])

In [71]:
categorical_columns = ["sex", "smoker", "region"]
df[categorical_columns] = oe.fit_transform(df[categorical_columns])

In [72]:
df[categorical_columns]    # ordinal encoding done

Unnamed: 0,sex,smoker,region
0,1.0,1.0,1.0
1,0.0,0.0,2.0
2,0.0,0.0,2.0
3,0.0,0.0,3.0
4,0.0,0.0,3.0
...,...,...,...
1333,0.0,0.0,3.0
1334,1.0,0.0,0.0
1335,1.0,0.0,2.0
1336,1.0,0.0,1.0


In [73]:
# One hot Encoder 

In [74]:
df = df = pd.read_csv("insurance.csv")

In [75]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [76]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [77]:
# no missing values

In [78]:
from sklearn.preprocessing import OneHotEncoder

In [79]:
ohe = OneHotEncoder(drop = "first" , sparse_output = False , dtype = np.int32)

In [80]:
df_new = ohe.fit_transform(df[["sex" , "smoker" , "region"]])

In [81]:
df_new_new = pd.DataFrame(df_new)

In [82]:
df_new_new  # One Hot Encoding Done

Unnamed: 0,0,1,2,3,4
0,0,1,0,0,1
1,1,0,0,1,0
2,1,0,0,1,0
3,1,0,1,0,0
4,1,0,1,0,0
...,...,...,...,...,...
1333,1,0,1,0,0
1334,0,0,0,0,0
1335,0,0,0,1,0
1336,0,0,0,0,1


In [83]:
# get_dummies

In [84]:
df = pd.read_csv("insurance.csv")

In [85]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [86]:
# no missing values

In [87]:
df_get_dummies = pd.get_dummies(df, columns=['sex', 'smoker' , 'region'] , drop_first = True)


In [88]:
df_get_dummies.astype(int)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,0,1,0,0,1
1,18,33,1,1725,1,0,0,1,0
2,28,33,3,4449,1,0,0,1,0
3,33,22,0,21984,1,0,1,0,0
4,32,28,0,3866,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30,3,10600,1,0,1,0,0
1334,18,31,0,2205,0,0,0,0,0
1335,18,36,0,1629,0,0,0,1,0
1336,21,25,0,2007,0,0,0,0,1


In [92]:
x = df_get_dummies.drop(columns = ['sex_male'])
y = df_get_dummies['sex_male']

In [94]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2 , random_state=42)


In [99]:
x_train

Unnamed: 0,age,bmi,children,charges,smoker_yes,region_northwest,region_southeast,region_southwest
560,46,19.950,2,9193.83850,False,True,False,False
1285,47,24.320,0,8534.67180,False,False,False,False
1142,52,24.860,0,27117.99378,False,False,True,False
969,39,34.320,5,8596.82780,False,False,True,False
486,54,21.470,3,12475.35130,False,True,False,False
...,...,...,...,...,...,...,...,...
1095,18,31.350,4,4561.18850,False,False,False,False
1130,39,23.870,5,8582.30230,False,False,True,False
1294,58,25.175,0,11931.12525,False,False,False,False
860,37,47.600,2,46113.51100,True,False,False,True


In [96]:
# Standardisation
from sklearn.preprocessing import StandardScaler

In [97]:
si = StandardScaler()

In [100]:
x_train_sc = si.fit_transform(x_train)

In [101]:
x_train_new = pd.DataFrame(x_train_sc)

In [None]:
np.round(x_train_new.describe() , 2)         # standardization done 

Unnamed: 0,0,1,2,3,4,5,6,7
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.52,-2.42,-0.91,-1.02,-0.51,-0.56,-0.6,-0.57
25%,-0.88,-0.72,-0.91,-0.7,-0.51,-0.56,-0.6,-0.57
50%,0.01,-0.06,-0.09,-0.31,-0.51,-0.56,-0.6,-0.57
75%,0.83,0.65,0.73,0.28,-0.51,-0.56,1.67,-0.57
max,1.75,3.74,3.2,4.1,1.97,1.78,1.67,1.75


In [103]:
from sklearn.preprocessing import MinMaxScaler

In [104]:
mn = MinMaxScaler()

In [105]:
x_train_mn = mn.fit_transform(x_train)

In [108]:
x_train_mn_df = pd.DataFrame(x_train_mn)
np.round(x_train_mn_df.describe(), 2)                       #normalisation done

Unnamed: 0,0,1,2,3,4,5,6,7
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,0.46,0.39,0.22,0.2,0.21,0.24,0.26,0.25
std,0.31,0.16,0.24,0.2,0.4,0.43,0.44,0.43
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.2,0.28,0.0,0.06,0.0,0.0,0.0,0.0
50%,0.47,0.38,0.2,0.14,0.0,0.0,0.0,0.0
75%,0.72,0.5,0.4,0.25,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Hello, World!
