In [1]:
import pandas as pd
import pickle as pkl
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import aadt_predictor as ap

### Hyperparameter Results

In [5]:
# Read the pickled hyperparameter-tuning results back into memory.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever be one produced by this project's own tuning run.
with open('../results/hyperparameter_tuning_results.pkl', 'rb') as results_file:
    results = pkl.load(results_file)

In [9]:
# Paths and modelling constants used by the hyperparameter comparison below.
OUTDIR = "../results"
DATA_DIR = "../data/hpms_aadt_subset.csv"  # path to a single CSV file, despite the name
RANDOM_STATE = 42

RESPONSE_VARS = ['AADT_MDV', 'AADT_HDV']
RF_PREDICTOR_VARS = [
    "STATEFP",
    "COUNTYFP",
    "F_SYSTEM",
    "THROUGH_LANES",
    "AADT",
]

# Constructing the predictor loads and pre-processes the CSV (see the log output).
predictor = ap.AADTPredictor(
    DATA_DIR,
    RESPONSE_VARS,
    outdir=OUTDIR,
    random_state=RANDOM_STATE,
)

Loading data from ../data/hpms_aadt_subset.csv


  self.data = pd.read_csv(self.data_path)


Pre-processing data...
Data loaded successfully: 4806937 rows and 21 columns.


In [20]:
def _score_random_forest(run, label, recorded_params, **model_kwargs):
    """Fit a Random Forest on the predictor's current split and score it.

    Args:
        run: One entry of ``results``; its 'state' and 'response_var' keys
            are copied into the returned record.
        label: Prefix for the printed summary line ("Default" or "Best").
        recorded_params: Value stored under "parameters" in the record.
        **model_kwargs: Keyword arguments forwarded to
            ``predictor.initialize_model``.

    Returns:
        dict with the state, parameters, response variable and the three
        scores returned by ``predictor.test_model()``.
    """
    predictor.initialize_model("Random Forest", **model_kwargs)
    predictor.fit_model()
    r2, mae, mse = predictor.test_model()
    print(f"{label} Params: R2: {r2}, MAE: {mae}, MSE: {mse}")
    return {
        "state": run['state'],
        "parameters": recorded_params,
        "response_var": run['response_var'],
        "r2": r2,
        "mae": mae,
        "mse": mse,
    }


# For every tuning result, score the default forest and the tuned forest on
# the same split and keep both records.
metrics = []
for r in results:
    predictor.split_data(r['response_var'], RF_PREDICTOR_VARS, state_fips = r['state'])
    print(f"State: {r['state']}, Response Variable: {r['response_var']}")

    # Default parameters: nothing is passed through to the model.
    metrics.append(_score_random_forest(r, "Default", {}))

    # Best parameters: only these four tuned settings are forwarded, while the
    # full best_params dict is what gets recorded.
    best = r['best_params']
    metrics.append(
        _score_random_forest(
            r,
            "Best",
            best,
            max_depth=best['max_depth'],
            n_estimators=best['n_estimators'],
            min_samples_leaf=best['min_samples_leaf'],
            min_samples_split=best['min_samples_split'],
        )
    )

Training and testing data split with test size 0.2 on State 56 and not stratified ...
State: 56, Response Variable: AADT_MDV
Random Forest model initialized with- {}
Default Params: R2: 0.999708097347464, MAE: 1.3999978445085037, MSE: 432.392314468537
Random Forest model initialized with- {'max_depth': 20, 'n_estimators': 150, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Params: R2: 0.999695372637411, MAE: 2.0043191201571795, MSE: 451.2412929994561
Training and testing data split with test size 0.2 on State 56 and not stratified ...
State: 56, Response Variable: AADT_HDV
Random Forest model initialized with- {}
Default Params: R2: 0.9980653619543062, MAE: 0.6116501185902417, MSE: 63.67522504393412
Random Forest model initialized with- {'max_depth': 30, 'n_estimators': 50, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Params: R2: 0.9979218431463396, MAE: 0.6319432058163537, MSE: 68.39889540472831
Training and testing data split with test size 0.2 on State 51 and not stratif

In [22]:
# Turn the list of per-run metric records into a table: one row per
# (state, response variable, parameter set) combination appended to `metrics`.
metrics_df = pd.DataFrame(metrics)
# Bare last expression so the notebook renders the table.
metrics_df

Unnamed: 0,state,parameters,response_var,r2,mae,mse
0,56,{},AADT_MDV,0.999708,1.399998,432.392314
1,56,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_...",AADT_MDV,0.999695,2.004319,451.241293
2,56,{},AADT_HDV,0.998065,0.61165,63.675225
3,56,"{'max_depth': 30, 'min_samples_leaf': 1, 'min_...",AADT_HDV,0.997922,0.631943,68.398895
4,51,{},AADT_MDV,0.996941,3.570771,735.555587
5,51,"{'max_depth': 30, 'min_samples_leaf': 1, 'min_...",AADT_MDV,0.997508,3.544174,599.226408
6,51,{},AADT_HDV,0.999456,5.304578,2545.09287
7,51,"{'max_depth': 30, 'min_samples_leaf': 1, 'min_...",AADT_HDV,0.999485,5.386608,2411.419051
8,6,{},AADT_MDV,0.986183,28.225255,71005.769964
9,6,"{'max_depth': 30, 'min_samples_leaf': 1, 'min_...",AADT_MDV,0.98625,28.27971,70658.071229


In [23]:
# NOTE(review): this plotting cell is disabled (every line is commented out).
# As written, hue = "parameters" would receive the dict values stored under
# "parameters" in `metrics`; dicts are presumably not usable as a seaborn hue
# key, so the column likely needs a string label first — confirm before
# re-enabling.
# # Visualize the results
# metrics_df = pd.DataFrame(metrics)

# fig, ax = plt.subplots(1, 2, figsize = (10, 5))
# sns.barplot(x = "state", y = "r2", hue = "parameters", data = metrics_df, ax = ax[0])
# ax[0].set_title("R2")
# ax[0].set_ylim(0, 1)
# sns.barplot(x = "state", y = "mae", hue = "parameters", data = metrics_df, ax = ax[1])
# ax[1].set_title("MAE")
# plt.tight_layout()


In [2]:
print("Imputing missing AADT values", flush=True)

# Settings for the imputation run. These rebind the notebook-level constants;
# note that this predictor list does not include "STATEFP".
RANDOM_STATE = 42
OUTDIR = "../results"
DATA_DIR = "../data/hpms_aadt_subset.csv"  # path to a single CSV file, despite the name
RESPONSE_VARS = ['AADT_MDV', 'AADT_HDV']
RF_PREDICTOR_VARS = [
    "COUNTYFP",
    "F_SYSTEM",
    "THROUGH_LANES",
    "AADT",
]

# Raw copy of the dataset, used to locate the rows whose response is missing.
full_data = pd.read_csv(DATA_DIR)
len(full_data)

Imputing missing AADT values


  full_data = pd.read_csv(DATA_DIR)


In [3]:
# Build a predictor for the first response variable only (RESPONSE_VARS[0]);
# construction loads and pre-processes the CSV, as the log output shows.
predictor = ap.AADTPredictor(DATA_DIR, RESPONSE_VARS[0], outdir = OUTDIR, random_state = RANDOM_STATE)

Loading data from ../data/hpms_aadt_subset.csv


  self.data = pd.read_csv(self.data_path)


Pre-processing data...
Data loaded successfully: 4807444 rows and 21 columns.


In [26]:
# Rows belonging to state FIPS 26 whose first response variable is missing.
in_state_26 = full_data.STATEFP == 26
response_missing = full_data[RESPONSE_VARS[0]].isna()
missing_data = full_data[in_state_26 & response_missing]

# NOTE(review): the state is compared as the integer 26 above but passed as the
# string '26' here — confirm which type split_data expects. The tiny test_size
# presumably keeps almost every row for training; confirm against split_data.
predictor.split_data(RF_PREDICTOR_VARS, state_fips='26', test_size=1e-10)

Training and testing data split with test size 1e-10 on State 26 and not stratified ...


In [18]:
# Inspect the shape of the training features produced by the split above.
predictor.X_train.shape

(120646, 4)

In [19]:
# Inspect how many rows the split set aside for testing.
predictor.y_test.shape

(1,)

In [None]:

# Impute each response variable state by state with a per-state Random Forest.
# Predictions are written into a copy, so `full_data` keeps the raw values and
# this cell can be re-run without compounding earlier imputations.
imputed_data = full_data.copy()

for response_var in RESPONSE_VARS:
    predictor = ap.AADTPredictor(DATA_DIR, response_var, outdir = OUTDIR, random_state = RANDOM_STATE)
    for state in full_data.STATEFP.unique():
        print(f"Imputing {response_var} for state {state}", flush=True)
        missing_data = full_data[(full_data.STATEFP == state) & (full_data[response_var].isna())]

        # Rows that also lack a predictor value are left un-imputed rather than
        # being handed to the model.
        features_to_predict = missing_data[RF_PREDICTOR_VARS].dropna()
        if features_to_predict.empty:
            # Nothing to impute for this state (this also covers a NaN state
            # code, which never matches the equality filter above).
            continue

        # NOTE(review): the exploratory cell above used test_size=1e-10; confirm
        # that split_data accepts an exact 0.0 before relying on this call.
        predictor.split_data(RF_PREDICTOR_VARS, state_fips = state, test_size=0.0)
        predictor.initialize_model("Random Forest")
        # The model has to be fitted before it can predict.
        predictor.fit_model()

        # NOTE(review): assumes the raw predictor columns match what the model
        # was trained on, i.e. that AADTPredictor's pre-processing does not
        # re-encode them — confirm against aadt_predictor.
        predictions = predictor.model.predict(features_to_predict)
        imputed_data.loc[features_to_predict.index, response_var] = predictions