In [1]:
# standard libs
import os
import sys
import logging

# project lib
PROJECT_SRC_PATH = os.path.join(os.path.abspath(''), '..', 'src')
sys.path.append(PROJECT_SRC_PATH)

import utils
import dataset
import visualizations
from prediction_age import AgePredictor, AgeClassifier, AgePredictorComparison
from preprocessing import *

# external libs
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely import wkt

import shap

import gpboost as gpb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
from xgboost import XGBRegressor, XGBClassifier, XGBRFClassifier

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [2]:
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
import warnings
warnings.filterwarnings('ignore', message='pandas.Int64Index is deprecated')
logging.captureWarnings(True)

In [4]:
%load_ext autoreload
%autoreload 2

## Data

In [5]:
gdf_nl = utils.load_data('netherlands', geo=True)
gdf_nl_sample = utils.sample_cities(gdf_nl, 0.1)


2022-03-30 09:50:25,288 | INFO : Loading csv...
2022-03-30 09:50:42,588 | INFO : Finished loading csv.
2022-03-30 09:50:42,794 | INFO : Finished id.
2022-03-30 09:50:54,239 | INFO : Finished TouchesIndexes.
2022-03-30 09:50:54,240 | INFO : Finished parsing csv.


In [None]:
xgb_params_optimal = {
    'max_depth': 10,
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'colsample_bytree': 0.5,
    'subsample': 1.0,
}

## Preprocessing

## Experiments

In [6]:
predictor = AgePredictor(
    model=XGBRegressor(),
    df=gdf_nl_sample,
    cross_validation_split=cross_validation,
    preprocessing_stages=[remove_outliers]
)
# predictor.evaluate_regression()
predictor.mae(across_folds=True)


2022-03-30 09:55:59,571 | INFO : Dataset length: 81652
2022-03-30 09:55:59,628 | INFO : Dataset standard deviation: 32.724917318050515
2022-03-30 09:55:59,630 | INFO : Dataset mean age: 1973.7581320726988
2022-03-30 09:55:59,630 | INFO : Training dataset length: 65321
2022-03-30 09:55:59,631 | INFO : Test dataset length: 16331
2022-03-30 09:55:59,936 | INFO : Test dataset standard deviation after preprocessing: 25.26322618556549
2022-03-30 09:55:59,937 | INFO : Test dataset mean age after preprocessing: 1976.4474854387174
2022-03-30 09:55:59,937 | INFO : Training dataset length after preprocessing: 63846
2022-03-30 09:55:59,938 | INFO : Test dataset length after preprocessing: 15967
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

2022-03-30 09:56:13,739 | INFO : Dataset sta

[11.611390822979132,
 11.767550250736871,
 11.5132972035168,
 11.652123668257058,
 11.737673575418038]

In [8]:
df = gdf_nl_sample.copy()
df['coord'] = df[['lat', 'lon']].values.tolist()
df

Unnamed: 0_level_0,id,geometry,FootprintArea,Perimeter,Phi,LongestAxisLength,Elongation,Convexity,Orientation,Corners,...,height,type_source,type,age,floors,source_file,block,sbb,country,coord
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9291596,"POLYGON ((4082537.251 3228455.629, 4082537.250...",20.299288,18.342617,0.588441,6.615381,0.678617,1.000000,26.354343,0,...,2.6130,,,1928,,NL_lod12_2d_pand_2,142305,214927,netherlands,"[6.517369378715507, 52.11466153809706]"
1,9954777,"POLYGON ((4083010.547 3229077.373, 4083010.548...",168.749001,65.314082,0.405287,22.270940,0.989896,0.799024,31.726235,5,...,5.1233,,,1920,,NL_lod12_2d_pand_2,143233,214974,netherlands,"[6.523920383695003, 52.12034230371198]"
2,7485350,"POLYGON ((4083324.068 3228482.965, 4083324.067...",9.353644,12.287960,0.625239,4.367886,0.805422,1.000000,26.754328,0,...,2.6614,,,2017,,NL_lod12_2d_pand_2,144074,214891,netherlands,"[6.528835847725001, 52.11523609902465]"
3,12341087,"POLYGON ((4083524.649 3228283.392, 4083524.050...",103.730853,42.999841,0.563622,15.069619,0.732881,0.957247,26.402138,2,...,7.3216,,,1974,,NL_lod12_2d_pand_2,144075,214922,netherlands,"[6.531983915474355, 52.11352427478226]"
4,14909890,"POLYGON ((4081636.846 3228294.064, 4081636.348...",171.926513,61.768368,0.374673,23.268603,0.417272,0.928910,26.476839,2,...,6.5803,,,2002,,NL_lod12_2d_pand_2,145613,214909,netherlands,"[6.504418161502096, 52.11282002984434]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650403,3411885,"POLYGON ((4066561.904 3213548.648, 4066561.904...",71.259330,43.494399,0.315503,16.897885,0.491395,0.740020,39.781406,3,...,3.3444,,,1918,,NL_lod12_2d_pand_2,208757,126056,netherlands,"[6.295601277841145, 51.9737154617998]"
650404,8921524,"POLYGON ((4066591.459 3213959.098, 4066591.459...",49.359334,28.984240,0.564682,10.557951,0.603676,1.000000,35.175281,0,...,7.8156,,,1976,,NL_lod12_2d_pand_2,206526,126095,netherlands,"[6.2956541354031765, 51.97751714545771]"
650405,3415153,"POLYGON ((4068099.000 3213530.828, 4068098.999...",58.248964,31.494537,0.564271,11.473475,0.603240,1.000000,0.584037,0,...,5.5899,,,1969,,NL_lod12_2d_pand_2,206716,126040,netherlands,"[6.317950671076, 51.974339364316336]"
650406,3419029,"POLYGON ((4068365.575 3213505.510, 4068365.574...",20.762207,18.582845,0.588844,6.699994,0.666965,1.000000,0.051529,0,...,2.5356,,,1971,,NL_lod12_2d_pand_2,208125,126063,netherlands,"[6.321800314442671, 51.97423136700026]"


In [15]:
df = gdf_nl_sample.sample(frac=0.01)
# coords = pd.Series(df[['lat', 'lon']].values.tolist())
# coords = df[['lat', 'lon']]
df_train, df_test = split_80_20(df)
coords_test = df_test[['lat', 'lon']]
coords = df_train[['lat', 'lon']]
gp_model = gpb.GPModel(gp_coords=coords, cov_function="exponential")
aux_cols = list(set(df_test.columns).intersection(dataset.AUX_VARS))
print(aux_cols)
aux_vars_train = df_train[aux_cols]
aux_vars_test = df_test[aux_cols]
# coords_test = pd.Series(aux_vars_test[['lat', 'lon']].values.tolist())
X_train = df_train.drop(columns=aux_cols + ['age'])
y_train = df_train[['age']]

X_test = df_test.drop(columns=aux_cols + ['age'])
y_test = df_test[['age']]

data_train = gpb.Dataset(X_train, y_train)
params = { 'objective': 'regression_l2', 'learning_rate': 0.3,
            'max_depth': 3, 'min_data_in_leaf': 10, 
            'num_leaves': 100, 'verbose': 2 }
# Training
bst = gpb.train(params=params, train_set=data_train,
                gp_model=gp_model, num_boost_round=247)
# gp_model.summary() # Estimated covariance parameters
# Prediction
pred = bst.predict(data=X_test, gp_coords_pred=coords_test,
                    predict_var=True)
# Sum the predictions of the trees and the GP
y_pred = pred['fixed_effect'] + pred['random_effect_mean']

ValueError: gp_coords: Series.dtypes must be int, float or bool

In [10]:
from sklearn import metrics

print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.r2_score(y_test, y_pred))

16.134482017971102
0.24416232775109503


In [18]:

gdf = gpd.GeoDataFrame(gdf_nl_sample, geometry=gpd.points_from_xy(gdf_nl_sample.lon, gdf_nl_sample.lat))
gdf.dissolve('sbb').centroid
# clusters_centroids = [gpd.MultiPoint(ea).centroid for ea in clusters]



sbb
15194     POINT (51.93269 6.33380)
15274     POINT (51.92622 6.35376)
15285     POINT (51.92667 6.35622)
15288     POINT (51.92728 6.34263)
15301     POINT (51.92745 6.34895)
                    ...           
230153    POINT (52.12236 6.51406)
230251    POINT (52.17377 6.50648)
233763    POINT (52.17486 6.52624)
240680    POINT (51.94599 6.26857)
254710    POINT (52.16522 5.50691)
Length: 1363, dtype: geometry