# Practice

In [66]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
df = pd.read_excel("insurance.xlsx")
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [67]:
#Checking for the null values
df.columns[df.isna().any()]

Index([], dtype='object')

In [68]:
#Checking the sum of null
df.isnull().sum()

age         0
gender      0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Without Encoding (manually using replace method)

In [69]:
df1 = df.copy()
df1.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [71]:
a = df1.gender.unique() #Taking unique value of Area column
a

array(['female', 'male'], dtype=object)

In [73]:
df1.gender = df1.gender.replace(['male', 'female'],[1,2]) #manually replace the string value with integer value

In [74]:
df1.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,2,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


# Label Encoding

In [75]:
df2 = df.copy()
df2.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [76]:
from sklearn.preprocessing import LabelEncoder
g = LabelEncoder()
df2['gender'] = g.fit_transform(df2['gender']) #using label encoder
df2.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [79]:
#We can also do this using loop if there are more column
for i in df2.columns:
    if df2[i].dtype == np.number:
        continue
    else:
        df2[i] = area.fit_transform(df2[i])

In [78]:
df2.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


In [185]:
#Another way of loop
from pandas.core.dtypes.common import is_numeric_dtype
for i in df2.columns:
    if is_numeric_dtype(df2[i]):
        continue
    else:
        df2[i] = g.fit_transform(df2[i])
df2.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


#  One Hot Encoding

In [160]:
df3 = df.copy()
df3.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [161]:
#making smoker column values into integer corresponding to yes and no. when yes in active tehn it is 1 and no is 0
dummies = pd.get_dummies(df3['smoker']).astype(int)
dummies.head(5) 

Unnamed: 0,no,yes
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0


In [162]:
df3 = pd.concat([df3,dummies],axis='columns')#adding column of dummies
df3.head() 

Unnamed: 0,age,gender,bmi,children,smoker,region,charges,no,yes
0,19,female,27.9,0,yes,southwest,16884.924,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0


In [163]:
df3.drop(['smoker'],axis='columns',inplace=True)
df3.head()

Unnamed: 0,age,gender,bmi,children,region,charges,no,yes
0,19,female,27.9,0,southwest,16884.924,0,1
1,18,male,33.77,1,southeast,1725.5523,1,0
2,28,male,33.0,3,southeast,4449.462,1,0
3,33,male,22.705,0,northwest,21984.47061,1,0
4,32,male,28.88,0,northwest,3866.8552,1,0


In [164]:
#We can also do this using loop if there are more columns
cols = ['gender','region']
if df3 is not None:
    for i in cols:
        dummies = pd.get_dummies(df3[i]).astype(int)
        df3 = pd.concat([df3, dummies], axis='columns')
        df3.drop([i], axis='columns', inplace=True)

df3.head()

Unnamed: 0,age,bmi,children,charges,no,yes,female,male,northeast,northwest,southeast,southwest
0,19,27.9,0,16884.924,0,1,1,0,0,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,0,1,0,1,0,0


# Ordinal Encoding

লেভেল এনকোডিং এ যেমন অ্যাসেন্ডিং অর্ডার অথবা ডিসএন্ডিং অর্ডারে মান গুলোকে  একটা করে ইউনিক ভ্যালু দেওয়া হয় কিন্তু অর্ডিনাল এনকোডিংয়ে প্রথমে যে মানটা থাকে সেটার করসপন্ডিং একটা ইউনিক ভ্যালু দেওয়া হয়,এখানে কোন এসেন্ডিং বা ডিসেন্ডিং অর্ডার মানা হয় না

In [171]:
df4 = df.copy()
df4.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [180]:
from sklearn.preprocessing import OrdinalEncoder
a = OrdinalEncoder()
df4['gender'] = a.fit_transform(df4[['gender']])
df4.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,yes,southwest,16884.924
1,18,1.0,33.77,1,no,southeast,1725.5523
2,28,1.0,33.0,3,no,southeast,4449.462
3,33,1.0,22.705,0,no,northwest,21984.47061
4,32,1.0,28.88,0,no,northwest,3866.8552


In [189]:
#using loop 
a = OrdinalEncoder()
for i in df4.columns:
    if is_numeric_dtype(df4[i]):
        continue
    else:
        df4[i] = a.fit_transform(df4[[i]])
df4.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,1.0,3.0,16884.924
1,18,1.0,33.77,1,0.0,2.0,1725.5523
2,28,1.0,33.0,3,0.0,2.0,4449.462
3,33,1.0,22.705,0,0.0,1.0,21984.47061
4,32,1.0,28.88,0,0.0,1.0,3866.8552
