 # Import Libraries and Dataset

In [1]:
import pandas as pd
df1 = pd.read_csv('insurance.csv')
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.884.924
1,18,male,33.77,1,no,southeast,17.255.523
2,28,male,33.0,3,no,southeast,4.449.462
3,33,male,22.705,0,no,northwest,2.198.447.061
4,32,male,28.88,0,no,northwest,38.668.552


# Copying dataset

In [2]:
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.copy()
df5 = df1.copy()
df6 = df1.copy()
df7 = df1.copy()
df8 = df1.copy()
df9 = df1.copy()
df10 = df1.copy()

# Data Info

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 73.3+ KB


# Checking the NULL value

In [4]:
df1.isnull().sum() # there's no NULL value!

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
df1.shape # 1338 rows and 7 columns

(1338, 7)

In [6]:
df1.isnull() # no true value

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1333,False,False,False,False,False,False,False
1334,False,False,False,False,False,False,False
1335,False,False,False,False,False,False,False
1336,False,False,False,False,False,False,False


# If it any NULL value exists then we may apply following techniques

In [7]:
# df = df.dropna()
# df = df.dropna(axis=0) -> for dropping row
# df = df.dropna(axis=1) -> for dropping column
# df = df.dropna(inplace=True) -> for updating 
# df = df.dropna(inplace=False) -> by default 
# df.region = df.region.fillna(df.region.mean()) -> for filling the value with mean 
# df.region = df.region.fillna(df.region.median()) -< using median

# Data Encoding

# 1) Without Encoding Technique

In [8]:
df1.region = df1.region.replace(['northeast', 'northwest', 'southwest', 'southeast'],[2,4,6,8])
df1.head() # this technique is NOT suggested

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,6,16.884.924
1,18,male,33.77,1,no,8,17.255.523
2,28,male,33.0,3,no,8,4.449.462
3,33,male,22.705,0,no,4,2.198.447.061
4,32,male,28.88,0,no,4,38.668.552


# 2) Label Encoder Technique

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [10]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.884.924
1,18,male,33.77,1,no,southeast,17.255.523
2,28,male,33.0,3,no,southeast,4.449.462
3,33,male,22.705,0,no,northwest,2.198.447.061
4,32,male,28.88,0,no,northwest,38.668.552


In [11]:
df2.region = le.fit_transform(df2.region)
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,3,16.884.924
1,18,male,33.77,1,no,2,17.255.523
2,28,male,33.0,3,no,2,4.449.462
3,33,male,22.705,0,no,1,2.198.447.061
4,32,male,28.88,0,no,1,38.668.552


# 2) Label Encoder by using loop

In [12]:
import numpy as np
df2.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [13]:
import numpy as np
from pandas.core.dtypes.common import is_numeric_dtype

# import warnings as wr
# wr.filterwarnings('ignore') -> for removing warnings

for column in df2.columns:
    if is_numeric_dtype(df2[column]):
        continue
    df2[column] = le.fit_transform(df2[column])

In [14]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,368
1,18,1,33.77,1,0,2,387
2,28,1,33.0,3,0,2,829
3,33,1,22.705,0,0,1,468
4,32,1,28.88,0,0,1,784


# 3) One Hot Encoder

In [15]:
dummy = pd.get_dummies(df3['region'], prefix='area')
dummy

Unnamed: 0,area_northeast,area_northwest,area_southeast,area_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [16]:
dummy = pd.get_dummies(df3['region'], prefix='area', drop_first=True) 

In [17]:
dummy.head() # dummy var trap

Unnamed: 0,area_northwest,area_southeast,area_southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0


In [18]:
df3.head() # here should have seen without region column!

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.884.924
1,18,male,33.77,1,no,southeast,17.255.523
2,28,male,33.0,3,no,southeast,4.449.462
3,33,male,22.705,0,no,northwest,2.198.447.061
4,32,male,28.88,0,no,northwest,38.668.552


In [19]:
# df4 = df4.drop(['region'], axis=1) # the dropped column [area_northeast] exists yet ! why?

In [20]:
# df4 # why should I take this step for dropping 'region'. it should automatically be removed! 

# Adding two columns by using concat mehtod

In [21]:
new_df4 = pd.concat([df4,dummy],axis=1)
new_df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,area_northwest,area_southeast,area_southwest
0,19,female,27.9,0,yes,southwest,16.884.924,0,0,1
1,18,male,33.77,1,no,southeast,17.255.523,0,1,0
2,28,male,33.0,3,no,southeast,4.449.462,0,1,0
3,33,male,22.705,0,no,northwest,2.198.447.061,1,0,0
4,32,male,28.88,0,no,northwest,38.668.552,1,0,0


# Dropping the 'region' column

In [22]:
y = new_df4.region
x = new_df4.drop('region',axis=1)
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,area_northwest,area_southeast,area_southwest
0,19,female,27.9,0,yes,16.884.924,0,0,1
1,18,male,33.77,1,no,17.255.523,0,1,0
2,28,male,33.0,3,no,4.449.462,0,1,0
3,33,male,22.705,0,no,2.198.447.061,1,0,0
4,32,male,28.88,0,no,38.668.552,1,0,0


# Loop in One-Hot-Encoder

In [23]:
cols = ['region']
for column in cols:
    one = pd.get_dummies(df4[column])
    df4 = pd.concat([df4,one], axis=1).drop(column, axis=1)

df4

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast,southwest
0,19,female,27.900,0,yes,16.884.924,0,0,0,1
1,18,male,33.770,1,no,17.255.523,0,0,1,0
2,28,male,33.000,3,no,4.449.462,0,0,1,0
3,33,male,22.705,0,no,2.198.447.061,0,1,0,0
4,32,male,28.880,0,no,38.668.552,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,106.005.483,0,1,0,0
1334,18,female,31.920,0,no,22.059.808,1,0,0,0
1335,18,female,36.850,0,no,16.298.335,0,0,1,0
1336,21,female,25.800,0,no,2.007.945,0,0,0,1


In [24]:
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast,southwest
0,19,female,27.9,0,yes,16.884.924,0,0,0,1
1,18,male,33.77,1,no,17.255.523,0,0,1,0
2,28,male,33.0,3,no,4.449.462,0,0,1,0
3,33,male,22.705,0,no,2.198.447.061,0,1,0,0
4,32,male,28.88,0,no,38.668.552,0,1,0,0


# Dummy Varibale Trap

In [25]:
df4 = df4.drop(['northeast'], axis=1)

In [26]:
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,female,27.9,0,yes,16.884.924,0,0,1
1,18,male,33.77,1,no,17.255.523,0,1,0
2,28,male,33.0,3,no,4.449.462,0,1,0
3,33,male,22.705,0,no,2.198.447.061,1,0,0
4,32,male,28.88,0,no,38.668.552,1,0,0


# 4) Ordinal 

In [27]:
df5.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [28]:
directions = ['southwest', 'southeast', 'northwest', 'northeast']

from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder(categories=[directions])

ordinal

OrdinalEncoder(categories=[['southwest', 'southeast', 'northwest',
                            'northeast']])

In [29]:
encoded = ordinal.fit_transform(df5[['region']])
encoded

array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [2.]])

In [30]:
en = pd.DataFrame(encoded,columns=['region1'])
en.head()

Unnamed: 0,region1
0,0.0
1,1.0
2,1.0
3,2.0
4,2.0


# Using loop in Ordinal 

In [31]:
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.884.924
1,18,male,33.77,1,no,southeast,17.255.523
2,28,male,33.0,3,no,southeast,4.449.462
3,33,male,22.705,0,no,northwest,2.198.447.061
4,32,male,28.88,0,no,northwest,38.668.552


In [32]:
cols = ['region']

for column in cols:
    unique = df7[column].unique()
    df7[column] = OrdinalEncoder(categories=[directions]).fit_transform(df7[[column]])
    

In [33]:
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,0.0,16.884.924
1,18,male,33.77,1,no,1.0,17.255.523
2,28,male,33.0,3,no,1.0,4.449.462
3,33,male,22.705,0,no,2.0,2.198.447.061
4,32,male,28.88,0,no,2.0,38.668.552


# Concatenating

In [34]:
cols = ['region']
for column in cols:
    ordinal = pd.get_dummies(df7[column])
    df7 = pd.concat([df7, ordinal], axis=1).drop(column, axis=1)



In [35]:
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,0.0,1.0,2.0,3.0
0,19,female,27.9,0,yes,16.884.924,1,0,0,0
1,18,male,33.77,1,no,17.255.523,0,1,0,0
2,28,male,33.0,3,no,4.449.462,0,1,0,0
3,33,male,22.705,0,no,2.198.447.061,0,0,1,0
4,32,male,28.88,0,no,38.668.552,0,0,1,0


In [36]:
new_dummy = pd.get_dummies(df7[0.0], prefix='area', drop_first=True) 

In [37]:
new_dummy.head()

Unnamed: 0,area_1
0,1
1,0
2,0
3,0
4,0


In [38]:
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,0.0,1.0,2.0,3.0
0,19,female,27.9,0,yes,16.884.924,1,0,0,0
1,18,male,33.77,1,no,17.255.523,0,1,0,0
2,28,male,33.0,3,no,4.449.462,0,1,0,0
3,33,male,22.705,0,no,2.198.447.061,0,0,1,0
4,32,male,28.88,0,no,38.668.552,0,0,1,0


# Dummy Variable Trap

In [39]:
df7 = df7.drop([0.0], axis=1)
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,1.0,2.0,3.0
0,19,female,27.9,0,yes,16.884.924,0,0,0
1,18,male,33.77,1,no,17.255.523,1,0,0
2,28,male,33.0,3,no,4.449.462,1,0,0
3,33,male,22.705,0,no,2.198.447.061,0,1,0
4,32,male,28.88,0,no,38.668.552,0,1,0


# Corrected Assignment 👇 

In [40]:
df8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16.884.924
1,18,male,33.77,1,no,southeast,17.255.523
2,28,male,33.0,3,no,southeast,4.449.462
3,33,male,22.705,0,no,northwest,2.198.447.061
4,32,male,28.88,0,no,northwest,38.668.552


In [43]:
cols = ['sex', 'smoker', 'region']

for column in cols:
    one = pd.get_dummies(df8[column], drop_first=True, prefix='dummy')
    df8 = pd.concat([df8,one], axis=1,).drop(column, axis=1)


In [44]:
df8.head()

Unnamed: 0,age,bmi,children,charges,dummy_male,dummy_yes,dummy_northwest,dummy_southeast,dummy_southwest
0,19,27.9,0,16.884.924,0,1,0,0,1
1,18,33.77,1,17.255.523,1,0,0,1,0
2,28,33.0,3,4.449.462,1,0,0,1,0
3,33,22.705,0,2.198.447.061,1,0,1,0,0
4,32,28.88,0,38.668.552,1,0,1,0,0
