## Import Data

In [4]:
# Data
import pandas as pd
import numpy as np

# Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import  Pipeline

# Tools
from my_mltools.geospatial import CoordinateTransformer
from my_mltools.cat_encode import Embedder

In [5]:
# Import data
df = pd.read_csv('../tests/test_data/geospatial_test_data.csv')
df

Unnamed: 0,coord_longitude,coord_latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.28,34.18,50.0,2195.0,336.0,878.0,309.0,6.8840,365600.0,<1H OCEAN
1,-121.50,38.59,43.0,88.0,21.0,119.0,19.0,1.7250,67500.0,INLAND
2,-121.25,37.76,22.0,2430.0,417.0,1292.0,391.0,3.4009,182400.0,INLAND
3,-122.18,37.77,52.0,2744.0,547.0,1479.0,554.0,2.2768,96200.0,NEAR BAY
4,-118.95,34.17,9.0,2372.0,312.0,1039.0,321.0,7.6016,344900.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
95,-118.37,33.77,26.0,6339.0,876.0,2540.0,880.0,10.1447,500001.0,NEAR OCEAN
96,-121.93,37.76,5.0,2255.0,269.0,876.0,258.0,10.3345,461400.0,<1H OCEAN
97,-118.98,37.64,17.0,3769.0,908.0,1160.0,453.0,3.0500,188900.0,INLAND
98,-121.25,38.03,29.0,2465.0,327.0,859.0,315.0,6.6605,220700.0,INLAND


## Working With Sci-kit Learn Pipeline

In [7]:
# Columns to apply imputation and standardization
num_cols = list(set(df.columns.tolist()) - {'ocean_proximity', 'coord_latitude', 'coord_longitude'})
num_cols

['total_rooms',
 'median_income',
 'housing_median_age',
 'households',
 'median_house_value',
 'total_bedrooms',
 'population']

In [8]:
# Numerical pipeline just for the numeric columns
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True))
])

In [12]:
# Pipeline
pipeline = ColumnTransformer(
    # Select coordinate columns using regex
    [('cluster_coord', CoordinateTransformer(), make_column_selector(pattern='^coord')),
     # Embed 'ocean_proximity' into 6 numerical columns
     ('embedding', Embedder(dimension=6), ['ocean_proximity']),
     # Apply imputation and standardization to numeric columns
     ('numerical', numerical_pipeline, num_cols)]
)

In [13]:
out = pipeline.fit_transform(df)
pd.DataFrame(out)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.068841,0.090478,-0.768441,0.104365,0.212001,0.263126,-0.228314,1.240334,1.820128,-0.538789,1.262242,-0.540708,-0.545657
1,3.0,0.435026,-0.259837,-0.061047,0.308040,-0.029449,0.319795,-1.528328,-1.046032,1.241260,-1.590120,-1.230038,-1.558353,-1.552032
2,1.0,0.435026,-0.259837,-0.061047,0.308040,-0.029449,0.319795,-0.083319,-0.303306,-0.495346,-0.241516,-0.269411,-0.279028,0.003275
3,1.0,0.112723,0.366112,-0.244953,-0.606863,-0.351457,-0.120561,0.110418,-0.801485,1.985519,0.349404,-0.990091,0.140952,0.251223
4,0.0,0.068841,0.090478,-0.768441,0.104365,0.212001,0.263126,-0.119105,1.558360,-1.570388,-0.495286,1.089179,-0.618243,-0.332183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,-0.735953,-0.120182,-0.166953,-0.039377,-0.481510,0.479122,2.328525,2.685412,-0.164564,1.531245,2.385908,1.203825,1.658026
96,1.0,0.068841,0.090478,-0.768441,0.104365,0.212001,0.263126,-0.191294,2.769527,-1.901170,-0.723678,2.063183,-0.757160,-0.548309
97,2.0,0.435026,-0.259837,-0.061047,0.308040,-0.029449,0.319795,0.742841,-0.458818,-0.908824,-0.016749,-0.215067,1.307205,-0.171747
98,1.0,0.435026,-0.259837,-0.061047,0.308040,-0.029449,0.319795,-0.061724,1.141284,0.083522,-0.517037,0.050798,-0.569784,-0.570849
