In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import time
# Debug aid: switch on autograd anomaly detection for the whole session.
# NOTE(review): the PyTorch docs describe this as a debugging mode that slows
# execution - consider turning it off for real training runs.
torch.autograd.set_detect_anomaly(True)
# Seed torch's global RNG (the call returns the Generator shown in the cell output).
# NOTE(review): this call targets torch only; pandas/NumPy sampling presumably
# needs its own seed - confirm.
torch.manual_seed(128)



<torch._C.Generator at 0x7fe011fc4150>

In [4]:
# Load the raw housing table; the path is relative to the notebook's working directory.
data = pd.read_csv('housing.csv')

In [5]:
def build_key_value_map(data, categorical, target):
    """Map each category of ``categorical`` to the mean of ``target`` within it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the ``categorical`` and ``target`` columns.
    categorical : str
        Name of the column to group by.
    target : str
        Name of the numeric column to average.

    Returns
    -------
    dict
        ``{category: mean of target}`` for every category present in ``data``,
        plus a ``'default'`` entry holding the overall mean of ``target``
        (used by callers as the fallback for unseen categories).
    """
    # Aggregate once and reuse the result for both the keys and the values.
    category_means = data.groupby(categorical)[target].mean()
    key_value_map = dict(zip(category_means.index, category_means.tolist()))

    # Written last, so it overrides a category that is literally named 'default'.
    key_value_map['default'] = data[target].mean()

    return key_value_map

def apply_mean_encoding(data_to_train, data_to_apply, categorical, target):
    """Mean-encode ``data_to_apply[categorical]`` with statistics from ``data_to_train``.

    The category -> mean(target) map is built from ``data_to_train`` only.
    Each value of ``data_to_apply[categorical]`` is replaced by its mapped
    mean; values missing from the map receive the map's ``'default'`` entry.

    Returns the encoded column as a pandas Series aligned with ``data_to_apply``.
    """
    encoding = build_key_value_map(data_to_train, categorical, target)
    fallback = encoding['default']

    def encode(category):
        # Known categories get their own mean, everything else the fallback.
        return encoding.get(category, fallback)

    return data_to_apply[categorical].apply(encode)

In [6]:
# Careful: ocean_proximity == ISLAND has only 5 rows, which can be problematic
#
# Before jumping into PyTorch, I suggest doing the following:
# 1) create OHE (one-hot encoding) for ocean_proximity
# 2) create mean encoding for ocean_proximity
# 3) use sklearn to
#    a) split into train / test
#    b) normalize and standardize the data
# 4) jump into PyTorch for some experiments
#    a) vanilla MLP used in Task 1
#    b) use an AE (autoencoder) to encode the features, then an MLP on the encoded features
#    c) maybe something more exotic

In [None]:
# Deliberately simple, step-by-step preprocessing (no sklearn pipelines) to keep it readable.

target = 'median_house_value'
SPLIT_SEED = 128  # same value as the torch seed; pins the pandas train/test split

# One-hot encode ocean_proximity. Any op_* columns left over from a previous run of
# this cell are dropped first, so re-running it does not append duplicate columns.
proximity_ohe = pd.get_dummies(data['ocean_proximity'], prefix='op')
data = pd.concat(
    [data.drop(columns=proximity_ohe.columns, errors='ignore'), proximity_ohe],
    axis=1,
)

# 70/30 split. random_state makes the sample reproducible (torch.manual_seed does not
# seed pandas). .copy() yields independent frames, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning.
train_set = data.sample(int(0.7 * len(data)), random_state=SPLIT_SEED).copy()
test_set = data[~data.index.isin(train_set.index)].copy()

# Do not leak data through mean encoding: the key-value map uses the training set only.
train_set['mean_encoded'] = apply_mean_encoding(train_set, train_set, 'ocean_proximity', target)
test_set['mean_encoded'] = apply_mean_encoding(train_set, test_set, 'ocean_proximity', target)

# The raw string column is now represented by the op_* and mean_encoded columns.
train_set = train_set.drop('ocean_proximity', axis=1)
test_set = test_set.drop('ocean_proximity', axis=1)

X_train = train_set.drop(target, axis=1)
y_train = train_set[target]
X_test = test_set.drop(target, axis=1)
y_test = test_set[target]

# Fit the scaler on the training features only, then reuse those statistics for the
# test features. Refitting on the test set would scale the two sets inconsistently.
# NOTE(review): missing values, if the CSV has any, pass through StandardScaler as NaN -
# confirm there are none or impute before this step.
SC = StandardScaler()

X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

X_train = torch.Tensor(X_train)
X_test = torch.Tensor(X_test)

# Convert through NumPy so the tensor is built from positional data rather than by
# indexing a Series whose index was shuffled by the split.
y_train = torch.Tensor(y_train.to_numpy())
y_test = torch.Tensor(y_test.to_numpy())

In [6]:
# Scratch: create a new, unfitted scaler and rebind the name SC to it.
SC = StandardScaler()


In [7]:
# Scratch: fit the scaler on X_train and keep the scaled result in x.
x = SC.fit_transform(X_train)

In [8]:
# Scratch: scale X_test with the scaler as currently fitted (transform only, no refit).
xx = SC.transform(X_test)

In [9]:
# Scratch: display the scaled features x as a torch tensor (the result is not stored).
torch.Tensor(x)

tensor([[ 0.7196, -0.7418,  0.4983,  ..., -0.3530, -0.3861,  0.5918],
        [ 0.5548, -0.7559,  0.0209,  ..., -0.3530, -0.3861,  0.5918],
        [-1.9082,  1.5733, -0.0587,  ..., -0.3530, -0.3861,  0.5918],
        ...,
        [-0.4644,  1.5171, -1.4909,  ..., -0.3530, -0.3861, -1.4637],
        [ 0.9245, -0.9480, -0.2178,  ..., -0.3530, -0.3861,  0.5918],
        [ 0.3150, -0.6903, -2.0479,  ..., -0.3530, -0.3861,  0.5918]])

In [1]:
# NOTE(review): X_transformed_df is not defined in any visible cell and this cell's
# recorded output is a NameError - define the name first or remove this cell.
X_transformed_df

NameError: name 'X_transformed_df' is not defined

In [7]:
# Mean median_house_value per ocean_proximity category, shown as a plain list
# (values only; the category labels are dropped by tolist()).
data.groupby('ocean_proximity')['median_house_value'].mean().tolist()

[240084.28546409807,
 124805.39200122119,
 380440.0,
 259212.31179039303,
 249433.97742663656]

In [8]:
# Print the mean median_house_value for each ocean_proximity category.
# The aggregation is computed once and reused for both the labels and the values.
proximity_means = data.groupby('ocean_proximity')['median_house_value'].mean()
keys = proximity_means.index
values = proximity_means.tolist()

for k, v in zip(keys, values):
    print(f"{k} -- {v}")

<1H OCEAN -- 240084.28546409807
INLAND -- 124805.39200122119
ISLAND -- 380440.0
NEAR BAY -- 259212.31179039303
NEAR OCEAN -- 249433.97742663656


In [22]:
# Category -> mean house value map, built here from the full `data` frame
# (not from a training subset); it also carries the 'default' fallback entry.
kv_map = build_key_value_map(data, 'ocean_proximity', 'median_house_value')

In [24]:
# Replace every ocean_proximity value by its mean from kv_map; categories missing
# from the map fall back to the map's 'default' entry.
data['ocean_proximity'].apply(lambda category: kv_map.get(category, kv_map['default']))

0        259212.311790
1        259212.311790
2        259212.311790
3        259212.311790
4        259212.311790
             ...      
20635    124805.392001
20636    124805.392001
20637    124805.392001
20638    124805.392001
20639    124805.392001
Name: ocean_proximity, Length: 20640, dtype: float64