# CPSC 4970 AI + ML: Module 4 -- encoding


## Basic usage of [OneHotEncoder](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features)
## [ColumnTransformer](https://scikit-learn.org/stable/modules/compose.html#column-transformer)

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from IPython.display import display
import pandas as pd

df = pd.DataFrame(data={
    'Letter': ['A', 'A', 'B', 'C', 'B', 'B', 'C', 'A'],
    'Age': [1.1, 4.2, 4.6, 16.2, 1.1, 9.3, 8.7, 3.3]})
display(df)

ct = ColumnTransformer(
    [
        ('one-hot', OneHotEncoder(), ['Letter'])
    ],
    remainder='passthrough'
)
ct.fit(df)
df_transformed = pd.DataFrame(ct.transform(df), columns=ct.get_feature_names())
display(df_transformed)

Unnamed: 0,Letter,Age
0,A,1.1
1,A,4.2
2,B,4.6
3,C,16.2
4,B,1.1
5,B,9.3
6,C,8.7
7,A,3.3


Unnamed: 0,one-hot__x0_A,one-hot__x0_B,one-hot__x0_C,Age
0,1.0,0.0,0.0,1.1
1,1.0,0.0,0.0,4.2
2,0.0,1.0,0.0,4.6
3,0.0,0.0,1.0,16.2
4,0.0,1.0,0.0,1.1
5,0.0,1.0,0.0,9.3
6,0.0,0.0,1.0,8.7
7,1.0,0.0,0.0,3.3


In [15]:
url = 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'
housing = pd.read_csv(url)
display(housing)
ct = ColumnTransformer(
    [
        ('prox', OneHotEncoder(), ['ocean_proximity'])
    ],
    remainder='passthrough'
)
ct.fit(housing)
housing_transformed = pd.DataFrame(ct.transform(housing), columns=ct.get_feature_names())
display(housing_transformed)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


Unnamed: 0,prox__x0_<1H OCEAN,prox__x0_INLAND,prox__x0_ISLAND,prox__x0_NEAR BAY,prox__x0_NEAR OCEAN,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,0.0,0.0,0.0,1.0,0.0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,0.0,0.0,0.0,1.0,0.0,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,0.0,0.0,0.0,1.0,0.0,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,0.0,0.0,0.0,1.0,0.0,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0.0,1.0,0.0,0.0,0.0,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,0.0,1.0,0.0,0.0,0.0,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,0.0,1.0,0.0,0.0,0.0,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,0.0,1.0,0.0,0.0,0.0,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0
