In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('insurance.csv') # to read out the csv file

In [3]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# measure of central tendencey
df.age.mean()

39.20702541106129

In [5]:
df.isnull().sum() # to check whether there is any missing value

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
#multiple  copy the dataframe to use in different encoding process
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()
df4 = df.copy()
df5 = df.copy()
df6 = df.copy()
df7 = df.copy()
df8 = df.copy()
df9 = df.copy()
df10 = df.copy()

# Encoding

In [7]:
# label Encoder
df1.region.value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder # install/load the labelencoder library
le = LabelEncoder() # make an object

In [9]:
df.region = le.fit_transform(df[['region']]) # fit and transform in a single line

In [10]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,3,16884.924
1,18,male,33.77,1,no,2,1725.5523
2,28,male,33.0,3,no,2,4449.462
3,33,male,22.705,0,no,1,21984.47061
4,32,male,28.88,0,no,1,3866.8552


In [11]:
# Loop label encoder
for col in df3.columns:
    if df3[col].dtype==np.number:
        continue
    else:
        df3[col] = le.fit_transform(df3[col])

In [12]:
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


In [13]:
from pandas.core.dtypes.common import is_numeric_dtype # load is_numeric library

In [14]:
df4.head() 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
# loop encoding using is_numeric
for col in df4.columns:
    if is_numeric_dtype(df4[col]):
        continue
    df4[col] = le.fit_transform(df4[col])

In [16]:
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [17]:
# One hot encoding

In [18]:
df5.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
dummy = pd.get_dummies(df5.region) # create dummy variable

In [20]:
dummy.head()

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0


In [21]:
df5.drop('region', axis =1, inplace = True) # drop the region column

In [22]:
df5.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [23]:
dummy = pd.get_dummies(df6['region'], drop_first = True) #  drop the 1st column of dummy variable

In [24]:
dummy.head()

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0


In [25]:
new_df6 = pd.concat([df5, dummy], axis = 1) # concate the dummy variable with df

In [26]:
new_df6.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,female,27.9,0,yes,16884.924,0,0,1
1,18,male,33.77,1,no,1725.5523,0,1,0
2,28,male,33.0,3,no,4449.462,0,1,0
3,33,male,22.705,0,no,21984.47061,1,0,0
4,32,male,28.88,0,no,3866.8552,1,0,0


In [27]:
# Loop for one hot decoding

for col in df8.columns:
    if is_numeric_dtype(df8[col]):
        continue
    one = pd.get_dummies(df8[col], drop_first = True)
    df8.drop(df8[[col]], axis =1, inplace = True)
    df8 = pd.concat([df8, one], axis =1)

In [28]:
df8.head()

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [29]:
# Ordinal decoder
df9.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [30]:
from sklearn.preprocessing import OrdinalEncoder # load ordinal decoder

In [31]:
order = df9.region.values # assign region into order variable

In [32]:
order

array(['southwest', 'southeast', 'southeast', ..., 'southeast',
       'southwest', 'northwest'], dtype=object)

In [33]:
order = df9.region.unique() 

In [34]:
order

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [35]:
ordinal = OrdinalEncoder(categories = [order]) # make region in a order

In [36]:
ordinal

OrdinalEncoder(categories=[array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)])

In [37]:
encoded = ordinal.fit_transform(df9[['region']]) # encoding with fit and transform in a sinle line

In [38]:
encoded

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [2.]])

In [39]:
df9.region = encoded # assigh encoded output into df9.region

In [40]:
df9.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16884.924
1,18,male,33.77,1,no,1.0,1725.5523
2,28,male,33.0,3,no,1.0,4449.462
3,33,male,22.705,0,no,2.0,21984.47061
4,32,male,28.88,0,no,2.0,3866.8552


In [41]:
df10.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [42]:
df10.region = df10.region.replace(['southwest', 'southeast', 'northwest','northeast'], [1,2,3,4]) # encoding using replace function

In [43]:
df10.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,1,16884.924
1,18,male,33.77,1,no,2,1725.5523
2,28,male,33.0,3,no,2,4449.462
3,33,male,22.705,0,no,3,21984.47061
4,32,male,28.88,0,no,3,3866.8552
