In [1]:
import pandas as pd
df = pd.read_csv('insurance.csv')

In [2]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Measures of central tendency

In [3]:
df.charges.mean()

13270.422265141257

In [4]:
df.charges.median()

9382.033

In [5]:
# Pairwise correlation of the numeric columns only. The frame still contains
# string columns (sex, smoker, region) at this point; newer pandas releases raise
# on non-numeric data in corr() instead of silently dropping it, so the numeric
# subset is selected explicitly to keep this cell working on any pandas version.
df.select_dtypes(include='number').corr()


Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [6]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Label Encoder

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
le = LabelEncoder()

In [10]:
df.sex = le.fit_transform(df['sex'])

In [11]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [12]:
df.region = le.fit_transform(df['region'])

In [13]:
df.smoker = le.fit_transform(df['smoker'])

In [14]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# Label encoding using a loop

In [15]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [20]:
from pandas.core.dtypes.common import is_numeric_dtype

In [21]:
# Label-encode every column whose dtype is not numeric; numeric columns are
# left exactly as loaded.
for column in df.columns:
    if not is_numeric_dtype(df[column]):
        df[column] = le.fit_transform(df[column])

In [22]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


# One Hot Encoding

In [23]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [24]:
# Categorical columns to expand into indicator columns.
x = ['region', 'sex', 'smoker']

# One indicator column per category level of each selected column.
dummy = pd.get_dummies(df.loc[:, x])

In [26]:
dummy.head()

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest,sex_female,sex_male,smoker_no,smoker_yes
0,0,0,0,1,1,0,0,1
1,0,0,1,0,0,1,1,0
2,0,0,1,0,0,1,1,0
3,0,1,0,0,0,1,1,0
4,0,1,0,0,0,1,1,0


In [27]:
dummy = pd.get_dummies(df[x], drop_first=True)

In [28]:
dummy.head()

Unnamed: 0,region_northwest,region_southeast,region_southwest,sex_male,smoker_yes
0,0,0,1,0,1
1,0,1,0,1,0
2,0,1,0,1,0
3,1,0,0,1,0
4,1,0,0,1,0


In [29]:
df=df.drop(x, axis=1)

In [30]:
df.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [32]:
df2=pd.concat([df,dummy], axis=1)


In [33]:
df2.head()

Unnamed: 0,age,bmi,children,charges,region_northwest,region_southeast,region_southwest,sex_male,smoker_yes
0,19,27.9,0,16884.924,0,0,1,0,1
1,18,33.77,1,1725.5523,0,1,0,1,0
2,28,33.0,3,4449.462,0,1,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0
4,32,28.88,0,3866.8552,1,0,0,1,0


# One-hot encoding using a loop

In [34]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [35]:
columns = ['sex', 'smoker', 'region']

# Collect the original frame plus one block of indicator columns per categorical
# column (first level dropped), then join everything side by side in one pass.
frames = [df]
for column in columns:
    frames.append(pd.get_dummies(df[column], drop_first=True))
df = pd.concat(frames, axis=1)

In [36]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,male,yes,northwest,southeast,southwest
0,19,female,27.9,0,yes,southwest,16884.924,0,1,0,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0,0,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0,0,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1,0,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1,0,0


In [37]:
df1=df.drop(columns, axis=1)

In [38]:
df1.head()

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# Replace Function

In [39]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [40]:
df.sex.unique()

array(['female', 'male'], dtype=object)

In [41]:
# Swap each sex label for its integer code using a single label -> code mapping.
df.sex = df.sex.replace({'female': 1, 'male': 2})
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,yes,southwest,16884.924
1,18,2,33.77,1,no,southeast,1725.5523
2,28,2,33.0,3,no,southeast,4449.462
3,33,2,22.705,0,no,northwest,21984.47061
4,32,2,28.88,0,no,northwest,3866.8552


In [44]:
df.smoker.unique()

array(['yes', 'no'], dtype=object)

In [45]:
# Swap each smoker label for its integer code using a single label -> code mapping.
df.smoker = df.smoker.replace({'yes': 1, 'no': 0})
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,2,33.77,1,0,southeast,1725.5523
2,28,2,33.0,3,0,southeast,4449.462
3,33,2,22.705,0,0,northwest,21984.47061
4,32,2,28.88,0,0,northwest,3866.8552


In [46]:
df.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [47]:
# Swap each region label for its integer code using a single label -> code mapping.
df.region = df.region.replace(
    {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4}
)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,1,16884.924
1,18,2,33.77,1,0,2,1725.5523
2,28,2,33.0,3,0,2,4449.462
3,33,2,22.705,0,0,3,21984.47061
4,32,2,28.88,0,0,3,3866.8552


# Ordinal Encoder

In [48]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [49]:
df.sex.unique()

array(['female', 'male'], dtype=object)

In [50]:
gender=['female', 'male']

In [51]:
from sklearn.preprocessing import OrdinalEncoder

In [52]:
ordinal = OrdinalEncoder(categories=[gender])

In [53]:
encoded = ordinal.fit_transform(df[['sex']])

In [54]:
newdata_frame = pd.DataFrame(encoded, columns=['Sex'])

In [55]:
newdata_frame.head()

Unnamed: 0,Sex
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0


In [65]:
df.sex = newdata_frame.Sex

In [66]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,yes,southwest,16884.924
1,18,1.0,33.77,1,no,southeast,1725.5523
2,28,1.0,33.0,3,no,southeast,4449.462
3,33,1.0,22.705,0,no,northwest,21984.47061
4,32,1.0,28.88,0,no,northwest,3866.8552


In [58]:
df.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [59]:
city = ['southwest', 'southeast', 'northwest', 'northeast']

In [60]:
ordinal01 = OrdinalEncoder(categories=[city])

In [62]:
encoded01 = ordinal01.fit_transform(df[['region']])

In [63]:
newdata_frame01 = pd.DataFrame(encoded01, columns=['Region'])

In [64]:
newdata_frame01.head()

Unnamed: 0,Region
0,0.0
1,1.0
2,1.0
3,2.0
4,2.0


In [67]:
df.region = newdata_frame01.Region

In [68]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,yes,0.0,16884.924
1,18,1.0,33.77,1,no,1.0,1725.5523
2,28,1.0,33.0,3,no,1.0,4449.462
3,33,1.0,22.705,0,no,2.0,21984.47061
4,32,1.0,28.88,0,no,2.0,3866.8552


In [69]:
df.smoker.unique()

array(['yes', 'no'], dtype=object)

In [70]:
smoke = ['yes', 'no']

In [71]:
ordinal02 = OrdinalEncoder(categories=[smoke])

In [73]:
encoded02 = ordinal02.fit_transform(df[['smoker']])

In [74]:
newdata_frame02 = pd.DataFrame(encoded02, columns=['Smoker'])

In [75]:
newdata_frame02.head()

Unnamed: 0,Smoker
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0


In [77]:
df.smoker = newdata_frame02.Smoker

In [78]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552


# Ordinal encoding using a loop

In [79]:
import pandas as pd
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
columns = ['sex', 'smoker', 'region']

# Ordinal-encode each categorical column in place.
for column in columns:
    # Category order is the order of first appearance in the column, so the
    # first distinct value maps to 0.0, the next new value to 1.0, and so on.
    unique = df[column].unique()
    # The encoder is fitted on a one-column frame and returns a 2-D result;
    # flatten it so it can be assigned back as a single column.
    df[column] = OrdinalEncoder(categories=[unique]).fit_transform(df[[column]]).ravel()