# Importing Libraries

In [86]:
import numpy as np
import pandas as pd

# Importing the Data for ML 

In [87]:
data = pd.read_csv('C:/Users/aksin52/Documents/Machine Learning Express/Introduction to Machine Learning/Marketing Analytics Specialization/Part 2 - Data Preprocessing/Data/Insurance.csv')

In [88]:
data_dict = pd.read_excel('C:/Users/aksin52/Documents/Machine Learning Express/Introduction to Machine Learning/Marketing Analytics Specialization/Part 2 - Data Preprocessing/Data/Data_Dictionary_Insurance.xlsx')

In [89]:
# Checking Imported data size 

In [90]:
data.shape

(1338, 8)

# Printing a few rows of the data

In [91]:
data.head()

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,C140,19,female,27.9,0,yes,southwest,16884.924
1,C117,18,male,33.77,1,no,southeast,1725.5523
2,C128,28,male,33.0,3,no,southeast,4449.462
3,C147,33,male,22.705,0,no,northwest,21984.47061
4,C116,32,male,28.88,0,no,northwest,3866.8552


# Print the Data Dictionary

In [92]:
data_dict

Unnamed: 0,age,age of primary beneficiary
0,sex,"insurance contractor gender, female, male"
1,bmi,Body mass index
2,children,Number of children covered by health insuranc...
3,smoker,Smoking
4,region,"the beneficiary's residential area in the US,..."
5,charges,Individual medical costs billed by health ins...


# Check for data types in the data

In [93]:
data.dtypes

id           object
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

# Dropping any given ID feature

In [94]:
data=data.drop(['id'],axis=1)

# Defining Independent and Target Feature

In [95]:
# Create a indepedent feature dataframe
X=data.drop(['charges'],axis=1)

# Create a dependent feature dataframe
Y=data[['charges']]

# Check for Missing Value Percentage across all features

In [96]:
X.isnull().mean()

age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
dtype: float64

In [97]:
Y.isnull().mean()

charges    0.0
dtype: float64

# Dropping variables that have more than 25% missing values

In [98]:
X = X.loc[:, X.isnull().mean() <= .25]

# Imputation of missing values

In [99]:
# Create a dataframe for numeric features
num=X.select_dtypes(include='number')

# Create a dataframe for categorical features
char=X.select_dtypes(include='object')

In [100]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_1=pd.DataFrame(imputer.fit_transform(num),index=num.index,columns=num.columns)

In [101]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan, strategy='most_frequent')
char_1=pd.DataFrame(imputer.fit_transform(char),index=char.index,columns=char.columns)

In [102]:
num.describe(percentiles=[0.01,0.1,0.15,0.2,0.25,0.5,0.75,0.9,0.95,0.99])

Unnamed: 0,age,bmi,children
count,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918
std,14.04996,6.098187,1.205493
min,18.0,15.96,0.0
1%,18.0,17.89515,0.0
10%,19.0,22.99,0.0
15%,22.0,24.26625,0.0
20%,24.0,25.326,0.0
25%,27.0,26.29625,0.0
50%,39.0,30.4,1.0


# Removal of Extreme Values and Outliers from the numerical features

In [103]:
def outlier_cap(x):
    x=x.clip_upper(x.quantile(0.99))
    x=x.clip_lower(x.quantile(0.01))
    return(x)

In [104]:
num=num.apply(lambda x : outlier_cap(x))
num.describe(percentiles=[0.01,0.1,0.15,0.2,0.25,0.5,0.75,0.9,0.95,0.99])

Unnamed: 0,age,bmi,children
count,1338.0,1338.0,1338.0
mean,39.207025,30.649718,1.094918
std,14.04996,6.025065,1.205493
min,18.0,17.89515,0.0
1%,18.0,17.917295,0.0
10%,19.0,22.99,0.0
15%,22.0,24.26625,0.0
20%,24.0,25.326,0.0
25%,27.0,26.29625,0.0
50%,39.0,30.4,1.0


# Removing features with 0 Variance

In [105]:
from sklearn.feature_selection import VarianceThreshold
varcutoff=VarianceThreshold()
num_1=pd.DataFrame(varcutoff.fit_transform(num_1),index=num_1.index,columns=num_1.columns)
num_1.describe()

Unnamed: 0,age,bmi,children
count,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918
std,14.04996,6.098187,1.205493
min,18.0,15.96,0.0
25%,27.0,26.29625,0.0
50%,39.0,30.4,1.0
75%,51.0,34.69375,2.0
max,64.0,53.13,5.0


# Encoding Categorical Independent Features

In [106]:
char_1.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [107]:
char_encode=pd.get_dummies(char_1)

In [108]:
char_encode.head()

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0


# Feature Scaling

In [109]:
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
X_all_std=pd.DataFrame(std.fit_transform(X_all),index=X_all.index,columns=X_all.columns)

# Feature Discretization

In [110]:
from sklearn.preprocessing import KBinsDiscretizer
Bins=KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')
df_new=pd.DataFrame(Bins.fit_transform(num_1),index=num_1.index, columns=num_1.columns).add_suffix('_Bin')
df_new.head()

  'decreasing the number of bins.' % jj)


Unnamed: 0,age_Bin,bmi_Bin,children_Bin
0,1.0,3.0,0.0
1,0.0,7.0,1.0
2,2.0,6.0,3.0
3,3.0,0.0,0.0
4,3.0,4.0,0.0


# Joining dataframes to create a complete feature set

In [111]:
X_all=pd.concat([num_1,char_encode],axis=1,join='inner')
X_all.shape

(1338, 11)

# Splitting the data into Train and Test Sets

In [112]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_all,Y, test_size=0.3,random_state=42)