# Final preparation

In [1]:
import platform
import sys

# Record the OS and interpreter this notebook was executed with.
print(platform.platform())
print("Python", sys.version)

Windows-10-10.0.19045-SP0
Python 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]


In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme(style='darkgrid', palette='Set1')

from sklearn.ensemble import IsolationForest, HistGradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import  KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
class cfg:
    '''Notebook-wide settings, read as plain class attributes (e.g. ``cfg.seed``).'''
    # passed as random_state to the IsolationForest
    seed = 42
    # fold count; not referenced by any cell visible in this notebook
    nfolds = 5
    # passed as n_jobs to the IsolationForest
    njobs = 2

## Read data

In [4]:
# load data
train = pd.read_csv('../data/extra/train_comb.csv')

# The first CSV column of the raw test file is read as the index and then
# discarded; every test row is flagged IsSynthetic = 1.0.
test = (
    pd.read_csv('../data/raw/test.csv', index_col=0)
    .reset_index(drop=True)
    .assign(IsSynthetic=1.0)
)

train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,IsSynthetic
0,2.3859,15.0,3.82716,1.1121,1280.0,2.486989,34.6,-120.12,0.98,1.0
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1.0
2,4.775,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1.0
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1.0
4,3.75,52.0,4.284404,1.069246,1793.0,1.60479,37.8,-122.41,4.5,1.0


In [5]:
# quick info
TARGET = 'MedHouseVal'

# every train column other than the target is treated as a feature
FEATURES = [col for col in train.columns if col != TARGET]

print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic']
Train set shape: (57777, 10)
Test set shape: (24759, 9)


## Outliers

In [6]:
# isolation forest for outlier detection
iso_params = dict(
    n_estimators=1000,
    max_samples=0.3,
    contamination='auto',
    verbose=0,
    n_jobs=cfg.njobs,
    random_state=cfg.seed,
)
iso = IsolationForest(**iso_params)

# fit isolation forest on synthetic data only (rows with IsSynthetic == 1)
is_synthetic = train['IsSynthetic'] == 1
_ = iso.fit(train.loc[is_synthetic, FEATURES])

In [7]:
# predict anomaly scores
# Both frames are scored on exactly the columns the forest was fitted on.
# Selecting FEATURES explicitly (instead of passing the whole test frame)
# pins the column set and order, and lets this cell be re-run after
# 'AnomalyScore' has already been added to `test`.
train['AnomalyScore'] = iso.score_samples(train[FEATURES])
test['AnomalyScore'] = iso.score_samples(test[FEATURES])

## Cartesian coordinates

In [8]:
# taken from (https://www.kaggle.com/code/dmitryuarov/ps-s3e1-coordinates-key-to-victory)

def crt_crds(df):
    '''Add "rotated" longitude/latitude features to ``df`` and return it.

    For 15 and 30 degrees two columns are written:
        rot_<a>_x = cos(a) * Longitude + sin(a) * Latitude
        rot_<a>_y = cos(a) * Latitude  + sin(a) * Longitude
    For 45 degrees only ``rot_45_x`` is written.

    ``df`` is modified in place; the same object is returned.

    NOTE(review): the ``*_y`` columns use ``+ sin`` rather than ``- sin``, so
    they are not a strict geometric rotation. The values are kept exactly as
    they were because downstream features depend on them.
    '''
    lon, lat = df['Longitude'], df['Latitude']

    # (angle in degrees, whether the companion *_y column is produced)
    for angle, with_y in ((15, True), (30, True), (45, False)):
        theta = np.radians(angle)
        cos_t, sin_t = np.cos(theta), np.sin(theta)

        df[f'rot_{angle}_x'] = (cos_t * lon) + (sin_t * lat)
        if with_y:
            df[f'rot_{angle}_y'] = (cos_t * lat) + (sin_t * lon)

    return df

# add the rotated-coordinate columns to both frames
train, test = crt_crds(train), crt_crds(test)

In [9]:
from sklearn.decomposition import PCA

def pca_crds(df, pca_obj=None):
    '''Add the PCA projection of (Latitude, Longitude) to ``df`` and return it.

    Parameters
    ----------
    df : DataFrame with 'Latitude' and 'Longitude' columns. Modified in place.
    pca_obj : already-fitted PCA, optional
        Projection to apply. When omitted, a new ``PCA()`` is fitted on
        ``df``'s own coordinates (the previous behaviour). Pass the same
        fitted object for every frame so that they share one projection.

    Returns
    -------
    The same ``df``, with 'pca_lon' (first component) and 'pca_lat'
    (second component) added.
    '''
    coordinates = df[['Latitude', 'Longitude']]
    if pca_obj is None:
        pca_obj = PCA().fit(coordinates)

    # transform once and split the two components, instead of projecting twice
    projected = pca_obj.transform(coordinates)
    df['pca_lon'] = projected[:, 0]
    df['pca_lat'] = projected[:, 1]
    return df

# Fit the projection on the training coordinates only and reuse it for test.
# Fitting a separate PCA per frame would centre and rotate train and test
# differently, so the same column would not mean the same thing in both.
coords_pca = PCA().fit(train[['Latitude', 'Longitude']])
train = pca_crds(train, coords_pca)
test = pca_crds(test, coords_pca)

In [10]:
# get place from coordinates
import reverse_geocoder as rg

def geocoder(df):
    '''Reverse-geocode every row of ``df``.

    Builds one (Latitude, Longitude) tuple per row, in row order, and returns
    whatever ``rg.search`` yields for that list.
    '''
    lat_lon_pairs = list(zip(df['Latitude'], df['Longitude']))
    return rg.search(lat_lon_pairs)

# Reverse-geocode both frames and store the 'admin2' field of each hit as
# 'place' (county-level names, judging by the `places` list below).
for frame in (train, test):
    results = geocoder(frame)
    frame['place'] = [hit['admin2'] for hit in results]

# Places that keep their own category; anything else is pooled by `replace`.
places = [
    'Los Angeles County',
    'Orange County',
    'Kern County',
    'Alameda County',
    'San Francisco County',
    'Ventura County',
    'Santa Clara County',
    'Fresno County',
    'Santa Barbara County',
    'Contra Costa County',
    'Yolo County',
    'Monterey County',
    'Riverside County',
    'Napa County',
]

def replace(x):
    '''Return ``x`` unchanged when it is listed in ``places``, else ``'Other'``.'''
    return x if x in places else 'Other'
    
# collapse every place that is not in `places` into the single 'Other' bucket
for frame in (train, test):
    frame['place'] = frame['place'].apply(replace)

# One-hot encode the bucketed 'place' column. The encoder is fitted on train
# and reused for test so both frames receive identical indicator columns.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
places_train = ohe.fit_transform(train['place'].values.reshape(-1, 1))
places_test = ohe.transform(test['place'].values.reshape(-1, 1))

# Name the indicator columns after the fitted categories themselves. Parsing
# get_feature_names_out() with split('_')[1] would truncate any category that
# contains an underscore.
names = [str(category) for category in ohe.categories_[0]]

# Build each indicator frame on its parent frame's index so the column-wise
# concat lines up row-for-row even when that index is not a default RangeIndex.
train = pd.concat(
    [train.drop(columns='place'), pd.DataFrame(places_train, columns=names, index=train.index)],
    axis=1,
)
test = pd.concat(
    [test.drop(columns='place'), pd.DataFrame(places_test, columns=names, index=test.index)],
    axis=1,
)

Loading formatted geocoded file...


## Save data

In [11]:
# quick info (recomputed so FEATURES reflects the columns train has now)
TARGET = 'MedHouseVal'
FEATURES = [name for name in train.columns if name != TARGET]

print(f'Target: {TARGET}\nFeatures: {FEATURES}')
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

Target: MedHouseVal
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'IsSynthetic', 'AnomalyScore', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y', 'rot_45_x', 'pca_lon', 'pca_lat', 'Alameda County', 'Contra Costa County', 'Fresno County', 'Kern County', 'Los Angeles County', 'Monterey County', 'Napa County', 'Orange County', 'Other', 'Riverside County', 'San Francisco County', 'Santa Barbara County', 'Santa Clara County', 'Ventura County', 'Yolo County']
Train set shape: (57777, 33)
Test set shape: (24759, 32)


In [12]:
out_path = '../data/final/'
os.makedirs(out_path, exist_ok=True)

# write both frames as CSV without the index column
for file_name, frame in (('train.csv', train), ('test.csv', test)):
    frame.to_csv(out_path + file_name, index=False)