<a href="https://colab.research.google.com/github/lisabroadhead/data_science_machine-learning/blob/main/pre_processing_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-Processing Exercise (Practice)
- Lisa Broadhead
- June 22, 2022

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Path of the insurance CSV that the next cell reads.
# NOTE(review): the path lives under /content/drive/MyDrive, so this presumably
# needs Google Drive mounted first; no mount call is visible in this notebook.
# Confirm before a Restart & Run All.
file = '/content/drive/MyDrive/Colab Notebooks/coding_dojo/Machine Learning/files/insurance.csv'

In [4]:
# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(file)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Define features (X) and target (y)

In [5]:
# Features are every column except the target; the target is 'charges'.
X = df.drop(columns=['charges'])
y = df['charges']

In [6]:
# Preview the target values.
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [7]:
# Preview the feature columns ('charges' has been dropped).
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


### Train test split the data to prepare for machine learning

In [9]:
# Split before any fitting so the encoder and scaler only ever learn from
# training rows. No test_size is passed; the recorded run produced 1003
# training rows and 335 test rows out of 1338. random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [10]:
# Row counts: full data, training split, test split.
len(df), len(X_train), len(X_test)

(1338, 1003, 335)

### Identify each feature as numerical, ordinal, or nominal. (Please provide this answer in a text cell in your Colab notebook)

1. age - Numeric
2. sex - Nominal 
3. bmi - Numeric 
4. children - Numeric
5. smoker - Nominal 
6. region - Nominal 

### Ordinal encode any ordinal features

- Skipped: none of the categorical features (sex, smoker, region) have a natural order, so none were treated as ordinal.

### One Hot Encode any nominal features 

In [13]:
# Callable that returns the names of the object-dtype columns of a DataFrame;
# used below to pick the columns that get one-hot encoded.
cat_selector = make_column_selector(dtype_include='object')

<sklearn.compose._column_transformer.make_column_selector at 0x7f9b02b2d690>

In [14]:
# Show which training columns the selector picks.
cat_selector(X_train)

['sex', 'smoker', 'region']

In [24]:
# Slice the categorical columns out of each split, then display the
# training subset as the cell's output.
train_cat_data = X_train.loc[:, cat_selector(X_train)]
test_cat_data = X_test.loc[:, cat_selector(X_test)]
train_cat_data

Unnamed: 0,sex,smoker,region
693,male,no,northwest
1297,female,no,southeast
634,male,no,southwest
1022,male,yes,southeast
178,female,no,southwest
...,...,...,...
1095,female,no,northeast
1130,female,no,southeast
1294,male,no,northeast
860,female,yes,southwest


In [22]:
# One-hot encode the nominal columns.
# - `sparse_output=False` returns a dense NumPy array. The older spelling
#   `sparse=False` was renamed in scikit-learn 1.2 and removed in 1.4, so it
#   raises TypeError on current runtimes.
# - `handle_unknown='ignore'` encodes a category not seen during fit as all
#   zeros instead of raising.
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split only so no information from the test split leaks
# into the learned category list.
ohe_encoder.fit(train_cat_data)

train_ohe = ohe_encoder.transform(train_cat_data)
test_ohe = ohe_encoder.transform(test_cat_data)

train_ohe

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.]])

In [29]:
# Ask the fitted encoder for the generated column labels
# (e.g. 'sex_female', 'region_southwest').
ohe_column_names = ohe_encoder.get_feature_names_out(train_cat_data.columns)

# Wrap both encoded arrays in labelled DataFrames in one pass.
train_ohe, test_ohe = (
    pd.DataFrame(encoded, columns=ohe_column_names)
    for encoded in (train_ohe, test_ohe)
)

train_ohe

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Scale any numeric features

In [75]:
# Select the numeric columns by dtype and slice them out of each split.
num_selector = make_column_selector(dtype_include='number')
train_num_data = X_train.loc[:, num_selector(X_train)]
test_num_data = X_test.loc[:, num_selector(X_test)]

# Fit the scaler on the training rows only; the test rows are transformed
# later with these same fitted statistics.
scaler = StandardScaler()
scaler.fit(train_num_data)

StandardScaler()

In [77]:
# Apply the scaler fitted on the training split to both splits.
train_scaled = scaler.transform(train_num_data)
test_scaled = scaler.transform(test_num_data)

# Preview the first five scaled training rows. Without a final expression
# the cell displays nothing, yet its recorded output is a five-row array.
train_scaled[:5]

array([[-1.08716652, -1.14087456, -0.91749963],
       [-0.80210593, -0.66584152,  0.7436053 ],
       [ 0.83699246,  1.52879447, -0.08694717],
       [ 0.55193187,  0.92647587, -0.08694717],
       [ 0.48066672, -0.26817814,  0.7436053 ]])

In [87]:
# Wrap the scaled arrays in DataFrames, reusing the numeric column names.
# The new frames get a fresh 0..n-1 index rather than the shuffled row labels
# of X_train / X_test, so they line up with other frames only by position.
X_train_scaled = pd.DataFrame(train_scaled, columns=train_num_data.columns)
X_test_scaled = pd.DataFrame(test_scaled, columns=train_num_data.columns)

### Concatenate all features back into one dataframe.

In [88]:
# Join the scaled numeric columns and the one-hot columns side by side.
# Both inputs were built from NumPy arrays and therefore share a default
# 0..n-1 index, so the axis=1 concat pairs rows by position.
X_train_processed = pd.concat([X_train_scaled, train_ohe], axis=1)
X_test_processed = pd.concat([X_test_scaled, test_ohe], axis=1)

# Guard against index misalignment: if the two inputs ever carried different
# indexes, concat would add NaN-padded rows and the row count would grow.
assert len(X_train_processed) == len(X_train)
assert len(X_test_processed) == len(X_test)

X_train_processed

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.087167,-1.140875,-0.917500,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.802106,-0.665842,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.836992,1.528794,-0.086947,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.551932,0.926476,-0.086947,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.480667,-0.268178,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
998,-1.514757,0.139468,2.404710,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
999,-0.018189,-1.105101,3.235263,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1000,1.335848,-0.887967,-0.917500,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1001,-0.160720,2.843247,0.743605,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
