In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Set a fixed random seed for reproducibility
random.seed(19)

df = pd.read_csv("../data/data_final.csv.gz")
df['date'] = pd.to_datetime(df['date'])
to_pred_df = df.loc[df['date'] >= pd.to_datetime('2017-01-01 00:00:00')].reset_index(drop=True)
df = df.loc[df['date'] < pd.to_datetime('2017-01-01 00:00:00')].reset_index(drop=True)

In [None]:
tiles = df['tile_index'].unique().tolist()
random.shuffle(tiles)

train_ratio = 0.75
split_index = int(train_ratio * len(tiles))
train_tiles = tiles[:split_index] 
test_tiles = tiles[split_index:]  

train_df = df.loc[df['tile_index'].isin(train_tiles)]
test_df = df.loc[df['tile_index'].isin(test_tiles)]

X_cols = [col for col in df.columns if col.startswith('feature')]
y_col = 'avg_urban_imperviousness'
X_train = train_df[X_cols]
y_train = train_df[y_col]
X_test = test_df[X_cols]
y_test = test_df[y_col]

In [7]:
X_train.shape

(13395, 768)

In [11]:
print(y_train.describe())

       avg_urban_imperviousness
count              13395.000000
mean                   3.210876
std                    5.788118
min                    0.000000
25%                    0.366940
50%                    1.387734
75%                    3.229994
max                   59.641462


In [14]:
# Establish a baseline model using the mean
baseline_pred = np.full_like(y_test, y_train.mean().values, dtype=np.float64)

# Calculate baseline performance
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))
baseline_r2 = r2_score(y_test, baseline_pred)
print(f"Baseline RMSE: {baseline_rmse:.4f}")
print(f"Baseline R^2: {baseline_r2:.4f}")

Baseline RMSE: 5.7381
Baseline R^2: -0.0002


In [15]:
# Simple Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Evaluate Linear Regression model
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
lr_r2 = r2_score(y_test, lr_pred)
print(f"Linear Regression RMSE: {lr_rmse:.4f}")
print(f"Linear Regression R^2: {lr_r2:.4f}")

Linear Regression RMSE: 2.5847
Linear Regression R^2: 0.7971


In [17]:
# PCA + Linear Regression
pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr_pca_model = LinearRegression()
lr_pca_model.fit(X_train_pca, y_train)
lr_pca_pred = lr_pca_model.predict(X_test_pca)

# Evaluate PCA + Linear Regression model
lr_pca_rmse = np.sqrt(mean_squared_error(y_test, lr_pca_pred))
lr_pca_r2 = r2_score(y_test, lr_pca_pred)
print(f"PCA + Linear Regression RMSE: {lr_pca_rmse:.4f}")
print(f"PCA + Linear Regression R^2: {lr_pca_r2:.4f}")

PCA + Linear Regression RMSE: 2.6655
PCA + Linear Regression R^2: 0.7842


In [18]:
# LASSO Regression with Cross-Validation
lasso_model = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5)
lasso_model.fit(X_train, y_train)
lasso_pred = lasso_model.predict(X_test)

# Evaluate LASSO model
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
lasso_r2 = r2_score(y_test, lasso_pred)
print(f"LASSO Regression RMSE: {lasso_rmse:.4f}")
print(f"LASSO Regression R^2: {lasso_r2:.4f}")
print(f"Optimal alpha for LASSO: {lasso_model.alpha_:.4e}")

  y = column_or_1d(y, warn=True)


LASSO Regression RMSE: 2.5538
LASSO Regression R^2: 0.8019
Optimal alpha for LASSO: 5.4287e-04


  model = cd_fast.enet_coordinate_descent(


In [None]:
# Random Forest with Cross-Validation for Hyperparameter Tuning
rf_model = RandomForestRegressor(random_state=19)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_cv = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error')
rf_cv.fit(X_train, y_train.values.ravel())
rf_best = rf_cv.best_estimator_
rf_pred = rf_best.predict(X_test)

# Evaluate Random Forest model
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)
print(f"Random Forest RMSE: {rf_rmse:.4f}")
print(f"Random Forest R^2: {rf_r2:.4f}")
print(f"Best params for Random Forest: {rf_cv.best_params_}")

In [None]:

# LightGBM with Cross-Validation for Hyperparameter Tuning
lgb_model = LGBMRegressor(random_state=19)
lgb_param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}
lgb_cv = GridSearchCV(lgb_model, lgb_param_grid, cv=5, scoring='neg_mean_squared_error')
lgb_cv.fit(X_train, y_train.values.ravel())
lgb_best = lgb_cv.best_estimator_
lgb_pred = lgb_best.predict(X_test)

# Evaluate LightGBM model
lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_pred))
lgb_r2 = r2_score(y_test, lgb_pred)
print(f"LightGBM RMSE: {lgb_rmse:.4f}")
print(f"LightGBM R^2: {lgb_r2:.4f}")
print(f"Best params for LightGBM: {lgb_cv.best_params_}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057517 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 10716, number of used features: 768
[LightGBM] [Info] Start training from score 2.851744
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 10716, number of used features: 768
[LightGBM] [Info] Start training from score 3.093454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 10716, number of used features: 768
[LightGBM] [Info] 

In [None]:
# Create a dictionary to store the model results
results = {
    'Model': ['Baseline', 'Linear Regression', 'PCA + Linear Regression', 'LASSO Regression', 'Random Forest', 'LightGBM'],
    'RMSE': [baseline_rmse, lr_rmse, lr_pca_rmse, lasso_rmse, rf_rmse, lgb_rmse],
    'R^2': [baseline_r2, lr_r2, lr_pca_r2, lasso_r2, rf_r2, lgb_r2]
}

# Convert the dictionary to a DataFrame
results_df = pd.DataFrame(results)

# Display the results DataFrame
print(results_df)

# Plot the scores for easy comparison
plt.figure(figsize=(12, 6))

# RMSE Plot
plt.subplot(1, 2, 1)
plt.barh(results_df['Model'], results_df['RMSE'], color='skyblue')
plt.xlabel('RMSE')
plt.title('RMSE of Models')

# R^2 Plot
plt.subplot(1, 2, 2)
plt.barh(results_df['Model'], results_df['R^2'], color='lightgreen')
plt.xlabel('R^2 Score')
plt.title('R^2 Score of Models')

plt.tight_layout()
plt.show()