In [4]:
import geopandas as gpd

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os

In [6]:
koeln_merged = gpd.read_file("../data/interim/merged_data_amenities.gpkg",encoding="utf-8")


In [7]:
splitted_area_names=pd.Series(koeln_merged.Area_Types.unique()).str.split("_")
unique_area_cols = pd.Series(np.concatenate(splitted_area_names)).unique().tolist()
for unique_area_col in unique_area_cols:
    koeln_merged['is_{}'.format(unique_area_col)]= koeln_merged.Area_Types.str.contains(unique_area_col).astype(int)
koeln_merged = koeln_merged.drop(columns='Area_Types')

In [12]:
numeric_cols = koeln_merged.select_dtypes(include=['int32','int64']).columns
non_numeric_cols = koeln_merged.select_dtypes(exclude=['int32','int64']).columns

In [13]:
agg_operations= dict(zip(numeric_cols, ['mean']*len(numeric_cols) ))
agg_operations["Land_Value"]="first"
koeln_merged_neighborhood_level =  koeln_merged.groupby(['Neighborhood_Name']).agg(agg_operations).reset_index()
koeln_merged_neighborhood_level= koeln_merged_neighborhood_level.drop(columns='index_right')#.describe()

In [14]:
# Prepare data for regression
X = koeln_merged_neighborhood_level.drop(columns=["Neighborhood_Name"	,"Neighborhood_FID","Land_Value"])
y = koeln_merged_neighborhood_level.Land_Value

# Scale X data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

In [18]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

import mlflow
with mlflow.start_run():
    alpha=0.5
    l1_ratio=0.5
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(X_train, y_train)

    predicted_qualities = lr.predict(X_val)

    (rmse, mae, r2) = eval_metrics(y_val, predicted_qualities)

    print("Elasticnet model (alpha={:f}, l1_ratio={:f}):".format(alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_param("alpha", alpha)
    mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 565.449195869096
  MAE: 318.96068484931305
  R2: 0.6377788093341034


  model = cd_fast.enet_coordinate_descent(


In [22]:
import sys
import os
ROOT_PATH =  os.path.abspath(os.path.join(os.getcwd(),".."))
sys.path.insert(0, ROOT_PATH)
from models.metrics import calculate_model_metrics
from models.boosting import get_lgb_datasets, run_lgb_model

In [27]:
import lightgbm

In [45]:
callbacks=[lightgbm.log_evaluation(10)]
params= {
    'num_boost_round':200,
    'device_type' : 'cpu',
    'num_threads': 8,
    'verbose': 0,
    'seed': 42,
    'metric': ['l2'],
    'early_stopping_round': 250,
    'force_col_wise': 'true',
    'min_data_in_leaf':10
    }
lgb_train, lgb_val = get_lgb_datasets(X_train,y_train,X_val,y_val)
gbm, evals_result, runtime = run_lgb_model(params,lgb_train,lgb_val,callbacks=callbacks)
rmse, mape, mae, me, smape, pred_val = calculate_model_metrics(X_val,y_val,model=gbm)

# TODO: Use information gain and/or compare 
feature_importances = gbm.feature_importance()

[10]	training's l2: 86525.7	valid_1's l2: 638069
[20]	training's l2: 52593.5	valid_1's l2: 546731
[30]	training's l2: 37626.8	valid_1's l2: 492191
[40]	training's l2: 27825.4	valid_1's l2: 459524
[50]	training's l2: 21281.2	valid_1's l2: 440584
[60]	training's l2: 15794.1	valid_1's l2: 419584
[70]	training's l2: 11908.2	valid_1's l2: 399168
[80]	training's l2: 9333.26	valid_1's l2: 383325
[90]	training's l2: 7231.17	valid_1's l2: 372539
[100]	training's l2: 5529.47	valid_1's l2: 361889
[110]	training's l2: 4392.77	valid_1's l2: 356843
[120]	training's l2: 3378.18	valid_1's l2: 347785
[130]	training's l2: 2688.02	valid_1's l2: 342883
[140]	training's l2: 2139.16	valid_1's l2: 340698
[150]	training's l2: 1751.81	valid_1's l2: 337798
[160]	training's l2: 1429	valid_1's l2: 336256
[170]	training's l2: 1180.75	valid_1's l2: 332305
[180]	training's l2: 974.11	valid_1's l2: 328562
[190]	training's l2: 827.199	valid_1's l2: 325653
[200]	training's l2: 713.368	valid_1's l2: 323095




In [46]:
feature_importances

array([ 1,  0,  9, 20,  0,  0, 18,  0,  0,  0,  0,  7, 26, 35,  1,  0, 15,
        0,  0,  2,  0,  3,  1,  2,  2, 18,  4, 41,  0,  0,  0,  1,  9, 17,
       18, 30,  5, 17, 12,  2,  0,  8, 10,  2, 13,  9,  0, 10,  7,  9,  0,
       11,  4,  3, 19,  7, 12, 11,  3,  4,  5,  0, 45,  3,  3,  8, 12,  6,
        3,  9,  0,  0,  3,  0,  0,  3,  8, 28,  0,  0,  5,  2,  0,  9,  4,
        0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  2, 10,  3,
        2,  0,  1,  1,  0,  3,  2,  0,  0,  2,  1,  0,  2,  0,  0,  0,  0,
        0,  3,  0,  0,  0,  0,  0,  1,  0,  0,  2, 11,  0,  0,  4, 16,  0,
        0, 16,  3, 13,  0,  0,  1,  1,  8,  0,  0,  0,  0,  0,  0,  0, 17,
        0,  0,  0,  1,  0,  0,  0,  0,  1,  1,  0,  7, 10,  4,  4,  5,  8,
       22,  5, 13, 13,  2, 14,  7,  0,  1,  0,  1,  6,  3, 14, 18,  4,  4,
        8, 14,  5,  0,  0,  0,  0,  0,  3,  0,  2,  0])

In [47]:
X_train.shape

(68, 199)

In [48]:
rmse

568.4146755977441

In [49]:
mae

286.032468712274