In [1]:
import pandas as pd
from pandas.core.dtypes.common import is_numeric_dtype

In [2]:
# Load the insurance dataset; the relative path is resolved against the current working directory.
df = pd.read_csv("insurance.csv")

In [3]:
# Show the raw data exactly as loaded, before any encoding.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# Count the missing values in each column.
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# LABEL ENCODER

In [5]:
from sklearn.preprocessing import LabelEncoder
# One shared LabelEncoder instance; the encoding loop below calls fit_transform on it once per column.
LE = LabelEncoder()

In [6]:
# Work on a copy so the label-encoded result does not overwrite the raw frame `df`.
dfL = df.copy()

In [7]:
# Integer-encode every non-numeric column of dfL.
# LabelEncoder numbers the classes in sorted order, so the resulting integers are
# arbitrary identifiers and carry no ordinal meaning.
# LE is refit on each column, so after the loop it only remembers the last column
# it saw; label_classes keeps the classes of every encoded column so that
# code i of column c can be mapped back to label_classes[c][i].
label_classes = {}
for col in dfL.columns:
    if not is_numeric_dtype(dfL[col]):
        # Encode the values of the frame being transformed (dfL), i.e. the same
        # column whose dtype was just checked, rather than reaching back to df.
        dfL[col] = LE.fit_transform(dfL[col])
        label_classes[col] = LE.classes_.copy()

In [8]:
# Show the label-encoded frame.
dfL

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


# ONE-HOT ENCODER

In [9]:
# Work on a copy so the one-hot-encoded result does not overwrite the raw frame `df`.
dfOH = df.copy()

In [10]:
# One-hot encode every non-numeric column of dfOH in a single get_dummies call.
# The columns to encode are collected up front, so the frame is never dropped
# from and re-concatenated while it is being iterated. get_dummies keeps the
# remaining columns first (in their original order) and appends the indicator
# columns, named "<column>_<level>", in the order the columns are listed.
# drop_first=True omits the first level of each column, leaving k-1 indicators
# for a column with k levels.
categorical_cols = [col for col in dfOH.columns if not is_numeric_dtype(dfOH[col])]
if categorical_cols:  # nothing left to encode when every column is already numeric
    dfOH = pd.get_dummies(dfOH, columns=categorical_cols, drop_first=True)

In [11]:
# Show the one-hot-encoded frame.
dfOH

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


# ORDINAL ENCODER

In [12]:
from sklearn.preprocessing import OrdinalEncoder


In [13]:
# Work on a copy so the ordinal-encoded result does not overwrite the raw frame `df`.
dfOE = df.copy()

In [14]:
# Ordinal-encode each non-numeric column of dfOE with its own OrdinalEncoder.
# The category order given to the encoder is the order in which the values first
# appear in the column (Series.unique), so code 0.0 is whatever value occurs in
# the first row, not the alphabetically first label.
for col in dfOE.columns:
    if is_numeric_dtype(dfOE[col]):
        continue  # numeric columns are left untouched
    first_seen = dfOE[col].unique()
    encoder = OrdinalEncoder(categories=[first_seen])
    # fit_transform expects 2-D input, hence the double brackets.
    dfOE[col] = encoder.fit_transform(dfOE[[col]])

In [15]:
# Show the ordinal-encoded frame.
dfOE

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.900,0,0.0,0.0,16884.92400
1,18,1.0,33.770,1,1.0,1.0,1725.55230
2,28,1.0,33.000,3,1.0,1.0,4449.46200
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.880,0,1.0,2.0,3866.85520
...,...,...,...,...,...,...,...
1333,50,1.0,30.970,3,1.0,2.0,10600.54830
1334,18,0.0,31.920,0,1.0,3.0,2205.98080
1335,18,0.0,36.850,0,1.0,1.0,1629.83350
1336,21,0.0,25.800,0,1.0,0.0,2007.94500
