# Final preparation

In [1]:
# Record the OS and interpreter that produced the outputs below.
import platform
import sys

print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
class cfg:
    '''Notebook-wide settings read by the cells below.'''

    # passed as random_state to the train/valid sampling and to IsolationForest
    seed: int = 42
    # not referenced by any cell visible in this notebook
    nfolds: int = 5
    # passed as n_jobs to IsolationForest
    njobs: int = 2

In [3]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme(style='darkgrid', palette='Set1')

from sklearn.ensemble import IsolationForest, HistGradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import  KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Read data

In [4]:
# Load the three input tables. For train/test the first CSV column is read as
# the index and then discarded in favour of a fresh 0..n-1 index.
train0 = pd.read_csv('../data/raw/train.csv', index_col=0)
train0 = train0.reset_index(drop=True)

test = pd.read_csv('../data/raw/test.csv', index_col=0)
test = test.reset_index(drop=True)

train_orig = pd.read_csv('../data/extra/train_orig.csv')

train0.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5


In [5]:
# Tag every row with its origin: 1.0 for the train0/test files, 0.0 for train_orig.
for frame, flag in ((train0, 1.0), (test, 1.0), (train_orig, 0.0)):
    frame['IsSynthetic'] = flag

# 60% of train0 is sampled for training; the remaining rows form the validation set.
train = train0.sample(frac=0.6, random_state=cfg.seed)
valid = train0.drop(index=train.index).reset_index(drop=True)

# Only the training part is extended with the train_orig rows.
train = pd.concat([train, train_orig], axis=0, ignore_index=True)

In [6]:
# Target / feature bookkeeping plus a shape summary of the three frames.
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

print(f'Target: {TARGET}\nFeatures: {FEATURES}')
for label, frame in (
    ('Train set shape:', train),
    ('Validation set shape:', valid),
    ('Test set shape:', test),
):
    print(label, frame.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic']
Train set shape: (42922, 10)
Validation set shape: (14855, 10)
Test set shape: (24759, 9)


## Outliers

In [7]:
# Isolation forest used to derive an anomaly score for every row.
iso_params = dict(
    n_estimators=1000,
    max_samples=0.3,
    contamination='auto',
    verbose=0,
    n_jobs=cfg.njobs,
    random_state=cfg.seed,
)
iso = IsolationForest(**iso_params)

# The forest is fitted only on rows flagged IsSynthetic == 1.
synthetic_mask = train['IsSynthetic'] == 1
_ = iso.fit(train.loc[synthetic_mask, FEATURES])

In [8]:
# predict anomaly scores
# Every frame is sliced with FEATURES so the forest always receives exactly the
# columns (and column order) it was fitted on. Passing `test` whole only worked
# while its columns happened to equal FEATURES, and broke on a re-run of this
# cell once the AnomalyScore column had been added to it.
train['AnomalyScore'] = iso.score_samples(train[FEATURES])
valid['AnomalyScore'] = iso.score_samples(valid[FEATURES])
test['AnomalyScore'] = iso.score_samples(test[FEATURES])

## Cartesian coordinates

In [9]:
# taken from (https://www.kaggle.com/code/dmitryuarov/ps-s3e1-coordinates-key-to-victory)

def crt_crds(df):
    '''Add angle-weighted mixes of longitude and latitude to ``df``.

    For 15 and 30 degrees two columns are written, ``rot_<angle>_x`` and
    ``rot_<angle>_y``; for 45 degrees only ``rot_45_x`` is written.

    x = cos(angle) * Longitude + sin(angle) * Latitude
    y = cos(angle) * Latitude  + sin(angle) * Longitude

    Note that the y term adds the sine part (a strict rotation would
    subtract it); the formula is kept exactly as in the referenced notebook.

    The frame is modified in place and also returned.
    '''
    lon = df['Longitude']
    lat = df['Latitude']

    for angle in (15, 30):
        cos_a = np.cos(np.radians(angle))
        sin_a = np.sin(np.radians(angle))
        df[f'rot_{angle}_x'] = (cos_a * lon) + (sin_a * lat)
        df[f'rot_{angle}_y'] = (cos_a * lat) + (sin_a * lon)

    cos_45 = np.cos(np.radians(45))
    sin_45 = np.sin(np.radians(45))
    df['rot_45_x'] = (cos_45 * lon) + (sin_45 * lat)

    return df

# add the angle-weighted coordinate columns to all three frames
train, valid, test = [crt_crds(frame) for frame in (train, valid, test)]

In [10]:
from sklearn.decomposition import PCA

def pca_crds(df, pca_obj=None):
    '''Add PCA-projected coordinates as ``pca_lon`` / ``pca_lat`` columns.

    Parameters
    ----------
    df : DataFrame with ``Latitude`` and ``Longitude`` columns. It is
        modified in place and also returned.
    pca_obj : already fitted PCA (or any object with a compatible
        ``transform``), optional. When omitted, a new PCA is fitted on the
        coordinates of ``df`` itself, which is the previous behaviour.

    Returns
    -------
    The same ``df`` with the first projected component stored in
    ``pca_lon`` and the second in ``pca_lat``.
    '''
    coordinates = df[['Latitude', 'Longitude']]
    if pca_obj is None:
        pca_obj = PCA().fit(coordinates)

    # project once and reuse the result for both output columns
    projected = pca_obj.transform(coordinates)
    df['pca_lon'] = projected[:, 0]
    df['pca_lat'] = projected[:, 1]
    return df

# Fit the projection once, on the training coordinates, and reuse it for every
# split. Fitting a separate PCA per frame would centre and rotate each split
# differently, so the same location would get different feature values in
# train, valid and test.
coords_pca = PCA().fit(train[['Latitude', 'Longitude']])

train = pca_crds(train, coords_pca)
valid = pca_crds(valid, coords_pca)
test = pca_crds(test, coords_pca)

In [11]:
# display the training frame as it stands after the preceding feature cells
train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,IsSynthetic,AnomalyScore,rot_15_x,rot_15_y,rot_30_x,rot_30_y,rot_45_x,pca_lon,pca_lat
0,5.0855,17.0,6.190805,1.105747,1538.0,3.373563,38.02,-121.36,1.035,1.0,-0.376129,-107.384458,5.314221,-86.090843,-27.753714,-58.930279,2.993817,0.345997
1,3.3636,5.0,4.239899,1.020202,3278.0,2.066007,33.68,-117.53,1.969,1.0,-0.405021,-104.808237,2.113379,-84.943966,-29.597264,-59.290904,-2.791694,0.166093
2,5.0417,29.0,6.602317,1.023529,1119.0,2.705479,33.84,-117.95,2.638,1.0,-0.362130,-105.172515,2.159224,-85.227696,-29.668700,-59.474751,-2.387518,-0.030482
3,4.1364,22.0,5.448584,0.986717,1750.0,3.020478,38.47,-121.77,1.188,1.0,-0.371259,-107.664019,5.642771,-86.220913,-27.569003,-58.901995,3.602516,0.355263
4,1.6031,39.0,4.742204,1.075092,1168.0,3.250000,36.75,-119.80,0.669,1.0,-0.395629,-106.206314,4.491253,-85.374843,-28.073566,-58.725218,1.000091,0.613496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42917,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0.0,-0.400421,-106.745782,6.794353,-85.127016,-26.354317,-57.706984,3.873094,1.542397
42918,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0.0,-0.439911,-106.859105,6.772954,-85.225939,-26.405657,-57.784766,3.962542,1.461780
42919,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0.0,-0.394929,-106.884294,6.712411,-85.264599,-26.462618,-57.834264,3.925657,1.413412
42920,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0.0,-0.402928,-106.980886,6.686529,-85.351202,-26.512618,-57.904974,3.994123,1.340526


In [12]:
# get place from coordinates
import reverse_geocoder as rg

# Counties that keep their own name in place_crds; every other value is
# replaced by 'Other' there.
places = [
    'Los Angeles County',
    'Orange County',
    'Kern County',
    'Alameda County',
    'San Francisco County',
    'Ventura County',
    'Santa Clara County',
    'Fresno County',
    'Santa Barbara County',
    'Contra Costa County',
    'Yolo County',
    'Monterey County',
    'Riverside County',
    'Napa County',
]

def place_crds(df):
    '''Return ``df`` extended with one-hot county columns.

    Each (Latitude, Longitude) pair is reverse-geocoded with ``rg.search``
    and the ``admin2`` field of the result is taken as the place name. Names
    that are not listed in the module-level ``places`` are replaced by
    ``'Other'``.

    One float column (1.0 / 0.0) is appended per entry of ``places`` plus
    ``'Other'``, in sorted order. The column set is fixed by ``places`` and
    not by the values that occur in ``df``, so every frame passed through
    this function gets the same columns even if some county is absent from it.

    Parameters
    ----------
    df : DataFrame with ``Latitude`` and ``Longitude`` columns. It is not
        modified.

    Returns
    -------
    A new DataFrame: ``df`` (without any ``place`` column) followed by the
    one-hot columns, keeping the row index of ``df``.
    '''
    coordinates = list(zip(df['Latitude'], df['Longitude']))
    results = rg.search(coordinates)

    # Keep the place name on df's own index so later alignment is row-exact.
    county = pd.Series([hit['admin2'] for hit in results], index=df.index)
    county = county.where(county.isin(places), 'Other')

    # Sorted order matches the column order a per-frame fitted encoder would
    # produce when all categories are present.
    categories = sorted(set(places) | {'Other'})
    encoded = pd.DataFrame(
        {name: (county == name).astype(float) for name in categories},
        index=df.index,
    )

    # errors='ignore': the input normally has no 'place' column; if it does,
    # it is dropped from the result.
    return pd.concat([df.drop(columns='place', errors='ignore'), encoded], axis=1)

# add the one-hot county columns to all three frames
train, valid, test = [place_crds(frame) for frame in (train, valid, test)]

Loading formatted geocoded file...


## Save data

In [13]:
# Recompute the feature list now that the engineered columns exist, then
# summarise the three frames.
TARGET = 'MedHouseVal'
FEATURES = [col for col in train.columns if col != TARGET]

print(f'Target: {TARGET}\nFeatures: {FEATURES}')
for label, frame in (
    ('Train set shape:', train),
    ('Valid set shape:', valid),
    ('Test set shape:', test),
):
    print(label, frame.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic', 'AnomalyScore', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'pca_lon', 'pca_lat', 'Alameda County', 'Contra Costa County', 'Fresno County', 'Kern County', 'Los Angeles County', 'Monterey County', 'Napa County', 'Orange County', 'Other', 'Riverside County', 'San Francisco County', 'Santa Barbara County', 'Santa Clara County', 'Ventura County', 'Yolo County']
Train set shape: (42922, 33)
Valid set shape: (14855, 33)
Test set shape: (24759, 32)


In [14]:
# Write the three prepared frames as CSV files (without the index) into the
# output folder, creating it first if needed.
out_path = '../data/final/'
os.makedirs(out_path, exist_ok=True)

for file_name, frame in (('train.csv', train), ('valid.csv', valid), ('test.csv', test)):
    frame.to_csv(out_path + file_name, index=False)