In [85]:
import json
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, StratifiedKFold, cross_validate
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic
import os
os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np
from sklearn.feature_selection import RFE, SelectFromModel, RFECV
from sklearn.compose import ColumnTransformer
from mango import Tuner, scheduler
from xgboost import XGBRegressor
import xgboost as xgb
from hyperopt import fmin, tpe, hp, Trials
import lightgbm as lgb
from lightgbm import LGBMRegressor
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [86]:
def load_data(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)
            
            # Calculate time of go_to_work and go_to_sum
            df_act_work = df_activities[df_activities['activity_type_main']=='work'].drop(['end_time'], axis=1)
            df_act_work = df_act_work.merge(df_work, how='left', left_on=['x','y'], right_on=['work_x','work_y'])
            df_act_work.drop(['x','y'], axis=1, inplace=True)
            df_act_work_agg = df_act_work.groupby(by="link")['go_to_work'].sum().reset_index(drop=False)
            df_act_home = df_activities[df_activities['activity_type_main']=='home'].drop(['end_time'], axis=1)
            df_act_home = df_act_home.merge(df_home, how='left', left_on=['x','y'], right_on=['home_x','home_y'])
            df_act_home.drop(['x','y'], axis=1, inplace=True)
            df_act_home_agg = df_act_home.groupby(by="link")['go_to_home'].sum().reset_index(drop=False)
            df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link')
            df_act_agg.fillna(0, inplace=True)
            df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

            df_rushhr = df_activities[df_activities['end_time']!=-1]
            df_rushhr.loc[:, 'rush_hour'] = 0
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('08:00:00'), pd.to_timedelta('10:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('16:00:00'), pd.to_timedelta('19:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
            df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
            
            df_maxduragg = df_activities[df_activities['max_dur']!=-1].groupby(by='link')['max_dur'].sum().reset_index(drop=False)
            
            df_activities['cemdapStopDuration_s'] = df_activities['cemdapStopDuration_s'].astype(float)
            df_cemagg = df_activities[df_activities['cemdapStopDuration_s']!=-1].groupby(by='link')['cemdapStopDuration_s'].sum().reset_index(drop=False)
            
            df_temp = df_links.merge(df_links_network, how='left', on=['start_node_x','start_node_y','end_node_x','end_node_y'])
            df_temp = df_temp[['link_id_x','link_from','link_to','link_id_y','from', 'to', 'type']]
            df_temp = df_temp.merge(df_act_agg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_rushhragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_maxduragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_cemagg, how='left', left_on='link_id_y', right_on='link')
            df_temp.fillna({'cemdapStopDuration_s':-1, 'max_dur':-1, 'rush_hour': -1, 'go_to_sum': -1}, inplace=True)
            df_temp = df_temp[['link_id_x', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'type']]
            
            df_links = df_links.merge(df_temp, how='left', left_on='link_id', right_on='link_id_x')
            df_links.drop('link_id_x', axis=1, inplace=True)
            df_links['length_per_capacity_ratio'] = df_links['link_length'] / df_links['link_capacity']
            df_links['speed_capacity_ratio'] = df_links['link_freespeed'] / df_links['link_capacity']
            df_links['length_times_lanes'] = df_links['link_length'] * df_links['link_permlanes']
            df_links['speed_times_capacity'] = df_links['link_freespeed'] * df_links['link_capacity']
            df_links['length_times'] = df_links['link_length'] / df_links['link_freespeed']
            df_links['capacity_divided_by_lanes'] = df_links['link_capacity'] / df_links['link_permlanes']

        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


In [87]:
train_files = ['s-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json','s-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json'] 
test_files = ['s-15.json', 's-16.json', 's-17.json', 's-18.json','s-19.json']
validate_files = ['s-10.json', 's-11.json', 's-12.json', 's-13.json','s-14.json']
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + i for i in train_files]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + j for j in test_files]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + k for k in validate_files]
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")
train_data = load_data(train_files, df_activities, df_links_network)
test_data = load_data(test_files, df_activities, df_links_network)
validate_data = load_data(validate_files, df_activities, df_links_network)
Big_train_data = pd.concat([train_data, validate_data], ignore_index=True)

In [88]:
Big_train_data.replace(-1, 0, inplace=True)

In [89]:
numerical_features = ['link_id', 'link_from', 'link_to', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
                      'link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count',
                      'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'length_per_capacity_ratio', 'speed_capacity_ratio',
                      'length_times_lanes', 'speed_times_capacity', 'length_times', 'capacity_divided_by_lanes'
                     ]
category_feature = ['type']
X_t = Big_train_data.drop(columns=['link_counts'])
y_t = Big_train_data['link_counts']
# X_v = validate_data.drop(columns=['link_counts'])
# y_v = validate_data['link_counts']
X_te = test_data.drop(columns=['link_counts'])
y_te = test_data['link_counts']
scaler = StandardScaler()
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
X_t = ct.fit_transform(X_t)
# X_v = ct.fit_transform(X_v)  
X_te = ct.fit_transform(X_te)
# X_t[numerical_features] = scaler.fit_transform(X_t[numerical_features])
# X_v[numerical_features] = scaler.fit_transform(X_v[numerical_features])
# X_te[numerical_features] = scaler.fit_transform(X_te[numerical_features])
# X_t[category_feature] = le.fit_transform(X_t[category_feature])
# X_v[category_feature] = le.fit_transform(X_v[category_feature])
# X_te[category_feature] = le.fit_transform(X_te[category_feature])


In [90]:
# Initialize models
kf = KFold(n_splits=5, shuffle=True, random_state=101)
models = {
    'Linear Regression': LinearRegression(),
    # 'Lasso': LassoCV(cv=kf, random_state=42, max_iter=200000),
    'KNN': KNeighborsRegressor(algorithm='kd_tree', n_neighbors=5, weights='distance'),
    # 'XGB': XGBRegressor(max_depth=15, n_estimators=200, learning_rate=0.01, subsample=0.8),
    # 'lgbm': LGBMRegressor(),
    'Ridge': RidgeCV(cv=kf),
    # 'SVR': SVR(C=100, epsilon=0.2, gamma='scale', kernel='rbf'),
    # 'Random Forest': RandomForestRegressor(bootstrap=False, max_depth=15, max_features=0.7,
    #                   min_samples_leaf=9, n_estimators=200, random_state=101),
    # 'Gradient Boosting': GradientBoostingRegressor(learning_rate=0.01, max_depth=14, min_samples_leaf=3,
    #                       min_samples_split=4, n_estimators=950, random_state=101, subsample=0.8),
    # 'Artificial Neural Network': MLPRegressor(activation='logistic', alpha=0.01, hidden_layer_sizes=(50, 50),
    #          max_iter=4000, random_state=101),
    # 'Gaussian Process Regression': GaussianProcessRegressor(0.1 ** 2 * RBF(length_scale=0.1) + WhiteKernel(noise_level=0.1 ** 2, noise_level_bounds=(1e-5, 1e5)))
}


In [91]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
        scoring=['neg_mean_absolute_error','neg_mean_squared_error']
        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=101)

        # Perform k-fold cross-validation and calculate MSE and MAE
        scores = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring, n_jobs=-1)

        mean_mae = -scores['test_neg_mean_absolute_error'].mean()
        mean_mse = -scores['test_neg_mean_squared_error'].mean()
        std_mse = scores['test_neg_mean_squared_error'].std()

        print(name + " done")
        print(mean_mae, mean_mse, std_mse)
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Train and evaluate
results = evaluate_models(models, X_t, y_t)


Linear Regression done
12.370569158659897 760.0952053396537 99.01984078466396
KNN done
8.652426124936847 489.6239171922831 89.08742466704129
Ridge done
12.336436053849024 760.016160870372 99.65934549980219


In [74]:
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    "num_iterations": 100000
}
gbm = lgb.LGBMRegressor(**hyper_params)
gbm.fit(X_t, y_t,
        eval_set=[(X_te, y_te)],
        eval_metric='l1',
        early_stopping=1000)
y_pred = gbm.predict(X_t, num_iteration=gbm.best_iteration_)
print('The rmse of prediction is:', mean_absolute_error(y_pred, y_t))

TypeError: fit() got an unexpected keyword argument 'early_stopping'

In [55]:
space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, -2),
    'subsample': hp.uniform('subsample', 0.5, 1)
}
def objective(params):
    global X_t, y_t, X_te, y_te
    xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(X_t, y_t)
    y_pred = xgb_model.predict(X_te)
    score = accuracy_score(y_te, y_pred)
    return {'loss': -score, 'status': STATUS_OK}

# Perform the optimization
best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)
print("Best set of hyperparameters: ", best_params)

  0%|                               | 0/100 [00:00<?, ?trial/s, best loss=?]

job exception: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 2

  0%|                               | 0/100 [00:00<?, ?trial/s, best loss=?]


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232], got [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.
  14.  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.
  28.  29.  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.
  42.  43.  44.  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.
  56.  57.  58.  59.  60.  61.  62.  63.  64.  65.  66.  67.  68.  69.
  70.  71.  72.  73.  74.  75.  76.  77.  78.  79.  80.  81.  82.  83.
  84.  85.  86.  87.  88.  89.  90.  91.  92.  93.  94.  95.  96.  97.
  98.  99. 100. 101. 102. 103. 104. 105. 106. 107. 108. 109. 110. 111.
 112. 113. 114. 115. 116. 117. 118. 119. 120. 121. 122. 123. 124. 125.
 126. 127. 128. 129. 130. 131. 132. 133. 134. 135. 136. 137. 138. 139.
 140. 141. 143. 144. 145. 147. 148. 150. 151. 152. 153. 155. 156. 158.
 159. 160. 161. 162. 163. 164. 165. 166. 167. 168. 169. 171. 172. 176.
 177. 178. 179. 186. 187. 188. 189. 190. 194. 196. 200. 202. 206. 207.
 212. 214. 215. 216. 217. 221. 222. 227. 235. 236. 237. 239. 240. 250.
 255. 261. 264. 266. 278. 286. 287. 295. 304. 312. 319. 343. 380. 383.
 417. 419. 451. 452. 465. 468. 478. 486. 505. 571. 574. 608. 614. 615.
 640. 647. 686. 696. 698. 763. 772. 836. 872.]

In [25]:
import pickle
with open('result_cutout_after_featureselection(BO_RFandGB).pkl', 'wb') as file:
    pickle.dump(results_feature, file)


In [8]:
param_space = dict(
    n_neighbors=range(1, 50),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute']
)

conf_Dict = dict()
conf_Dict['num_iteration'] = 10
kf = KFold(n_splits=5, shuffle=True, random_state=101)

@scheduler.parallel(n_jobs=-1)
def objective(**hyper_par):
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']
    global X_t, y_t
    clf = KNeighborsRegressor(**hyper_par)
    result = cross_validate(clf, X_t, y_t, scoring=scoring, cv=kf, n_jobs=-1)
    mean_mae = -result['test_neg_mean_absolute_error'].mean()
    return mean_mae

tuner = Tuner(param_space, objective, conf_Dict)
results = tuner.minimize()

print('best parameters:', results['best_params'])
print('best accuracy:', results['best_objective'])

  0%|          | 0/10 [00:00<?, ?it/s]

best parameters: {'algorithm': 'kd_tree', 'n_neighbors': 5, 'weights': 'distance'}
best accuracy: 8.652426124936847


In [13]:
param_space = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],  # Extended range for the regularization parameter
    gamma=['scale', 'auto'],  # Including specific gamma values
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],  # Focusing on RBF kernel
    epsilon=[0.01, 0.1, 0.2],  # Epsilon in the epsilon-SVR model
)

conf_Dict = dict()
conf_Dict['num_iteration'] = 10
kf = KFold(n_splits=5, shuffle=True, random_state=101)

@scheduler.parallel(n_jobs=-1)
def objective(**hyper_par):
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']
    global X_t, y_t
    clf = SVR(**hyper_par)
    result = cross_validate(clf, X_t, y_t, scoring=scoring, cv=kf, n_jobs=-1)
    mean_mae = -result['test_neg_mean_absolute_error'].mean()
    mean_mse = -result['test_neg_mean_squared_error'].mean()
    return mean_mae

tuner = Tuner(param_space, objective, conf_Dict)
results = tuner.minimize()

print('best parameters:', results['best_params'])
print('best accuracy:', results['best_objective'])

  0%|          | 0/10 [00:00<?, ?it/s]

best parameters: {'C': 100, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'rbf'}
best accuracy: 8.718359330762542


In [14]:
param_space =  dict(
    max_features=['sqrt', 'log2', .1, .3, .5, .7, .9],
    n_estimators=range(50, 1001, 50), # 10 to 1000 in steps of 50
    bootstrap=[True, False],
    max_depth=range(1, 20),
    min_samples_leaf=range(1, 10)
)

conf_Dict = dict()
conf_Dict['num_iteration'] = 10
kf = KFold(n_splits=5, shuffle=True, random_state=101)

@scheduler.parallel(n_jobs=-1)
def objective(**hyper_par):
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']
    global X_t, y_t

    clf = RandomForestRegressor(**hyper_par, criterion='absolute_error', random_state=101, warm_start=True, n_jobs=-1)
    result = cross_validate(clf, X_t, y_t, scoring=scoring, cv=kf, n_jobs=-1)
    mean_mae = -result['test_neg_mean_absolute_error'].mean()

    return mean_mae

tuner = Tuner(param_space, objective, conf_Dict)
results = tuner.minimize()
print('best parameters:', results['best_params'])
print('best accuracy:', results['best_objective'])

  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
param_space = dict(
    n_estimators=range(50, 1001, 50),
    learning_rate=[0.001, 0.01, 0.1, 0.2, 0.3],  # Varied learning rates for gradient boosting
    max_depth=range(3, 20),
    min_samples_split=range(2, 11, 1),
    min_samples_leaf=range(1, 10),
    subsample= [0.8, 0.9, 1.0]  # Fraction of samples to be used for fitting individual base learners
)

conf_Dict = dict()
conf_Dict['num_iteration'] = 5


kf = KFold(n_splits=5, shuffle=True, random_state=101)

@scheduler.parallel(n_jobs=-1)
def objective(**hyper_par):
    scoring=['neg_mean_absolute_error','neg_mean_squared_error']
    global X_t, y_t

    clf = GradientBoostingRegressor(**hyper_par, loss='absolute_error', random_state=101, warm_start=True, n_iter_no_change=10, tol=0.001)
    result = cross_validate(clf, X_t, y_t, scoring=scoring, cv=kf, n_jobs=-1)
    mean_mae = -result['test_neg_mean_absolute_error'].mean()

    return mean_mae

tuner = Tuner(param_space, objective, conf_Dict)
results = tuner.minimize()
print('best parameters:', results['best_params'])
print('best accuracy:', results['best_objective'])

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [18]:
param_space = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 100), (30, 30, 30)],
    activation= ['tanh', 'relu', 'identity', 'logistic'],
    solver= ['sgd', 'adam'],
    alpha= [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    max_iter=range(1000, 5001, 1000)
)

kf = KFold(n_splits=5, shuffle=True, random_state=101)
grid_search_mlp = RandomizedSearchCV(MLPRegressor(max_iter=5000, random_state=101), param_space, cv=kf, n_jobs=-1, verbose=10, 
                                     scoring='neg_mean_absolute_error', n_iter=40)
grid_search_mlp.fit(X_t, y_t)
print(grid_search_mlp.best_params_)
print(grid_search_mlp.best_estimator_)
print(grid_search_mlp.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'solver': 'adam', 'max_iter': 4000, 'hidden_layer_sizes': (50, 50), 'alpha': 0.01, 'activation': 'logistic'}
MLPRegressor(activation='logistic', alpha=0.01, hidden_layer_sizes=(50, 50),
             max_iter=4000, random_state=101)
-8.582321472242217


In [35]:
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct
\
import numpy as np
param_grid = {
    'kernel': [ConstantKernel (1.0, (1e-1, 1e1)) * RBF(1.0, (1e-2, 1e2))],
    'alpha': [ 1e-2, 0.1, 1.0]
}

gpr = GaussianProcessRegressor(copy_X_train=False)

# Initialize GridSearchCV
grid_search_gpr = RandomizedSearchCV(gpr, param_grid, n_iter=5, cv=0, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)
grid_search_gpr.fit(X_t, y_t)

print(grid_search_gpr.best_params_)
print(grid_search_gpr.best_estimator_)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=0.

In [61]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # mape = mean_absolute_percentage_error(y_test, y_pred)
    # r2 = r2_score(y_test, y_pred)
      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [75]:
import pickle

result_final_with_test = {}
for name, model in models.items():
    # if name in ['Random Forest', 'Gradient Boosting']:
    #     X_t_reduced = X_t[results_feature[name]['selected_feature']]
    #     X_te_reduced = X_te[results_feature[name]['selected_feature']]
    #     result_final_with_test[name] = evaluate_models_with_test(model, X_t_reduced, y_t, X_te_reduced, y_te)
    # else:
    result_final_with_test[name] = evaluate_models_with_test(model, X_t, y_t, X_te, y_te)

# with open('result_cutout_final(wo gpr).pkl', 'wb') as file:
#     pickle.dump(result_final_with_test, file)



In [76]:
result_final_with_test

{'KNN': {'MAE': 18.11773454843191, 'MSE': 1335.4337540798208},
 'lgbm': {'MAE': 14.110405310275835, 'MSE': 644.7681518397106}}

In [11]:
# before hyper
result_final_with_test

{'KNN': {'MAE': 18.35581966154244, 'MSE': 1352.4510321663868},
 'Ridge': {'MAE': 19.710332186479242, 'MSE': 883.2515492869593},
 'SVR': {'MAE': 14.433396871556601, 'MSE': 558.6550652045959},
 'Random Forest': {'MAE': 14.561781423588686, 'MSE': 698.0238534685441},
 'Gradient Boosting': {'MAE': 12.788332651025968, 'MSE': 461.9928087925666},
 'Artificial Neural Network': {'MAE': 22.518542494658924,
  'MSE': 1340.6892306734721}}

In [20]:
# after hyper
result_final_with_test

{'KNN': {'MAE': 18.11773454843191, 'MSE': 1335.4337540798208},
 'SVR': {'MAE': 15.780271259700418, 'MSE': 683.5681447350628},
 'Random Forest': {'MAE': 14.173445705968929, 'MSE': 641.6423604186432},
 'Gradient Boosting': {'MAE': 13.950800886267793, 'MSE': 660.9973649428741},
 'Artificial Neural Network': {'MAE': 21.492595848539622,
  'MSE': 2161.810201175524}}

In [50]:
# Train classifiers
reg1 = GradientBoostingRegressor()
reg2 = RandomForestRegressor()

ereg = VotingRegressor([("gb", reg1), ("rf", reg2)])
ereg1 = StackingRegressor([("gb", reg1), ("rf", reg2)])
ereg.fit(X_t, y_t)
ereg1.fit(X_t, y_t)

In [51]:
y_pred = ereg.predict(X_te)
mse = mean_squared_error(y_te, y_pred)
mae = mean_absolute_error(y_te, y_pred)
print(mae, mse)
y_pred1 = ereg1.predict(X_te)
mse1 = mean_squared_error(y_te, y_pred1)
mae1 = mean_absolute_error(y_te, y_pred1)
print(mae1, mse1)

12.99393889798277 497.4354372121598
13.778640339737045 609.852207115587


In [27]:
# BO w/ feature
result_final_with_test

{'Random Forest': {'MAE': 15.408751754177185, 'MSE': 616.0415054078707},
 'Gradient Boosting': {'MAE': 17.594169844517467, 'MSE': 906.6786634713227}}

In [19]:
# BO w/o feature
result_final_with_test

{'Linear Regression': {'MAE': 20.174646411039785, 'MSE': 907.5637017883494},
 'Lasso': {'MAE': 15.94555308382198, 'MSE': 623.346131490013},
 'Ridge': {'MAE': 20.15102515583173, 'MSE': 899.627760582678},
 'SVR': {'MAE': 24.28971478267384, 'MSE': 1022.310088408284},
 'Random Forest': {'MAE': 13.544514569822395, 'MSE': 540.7677890007706},
 'Gradient Boosting': {'MAE': 14.415849761332476, 'MSE': 720.1563227840516},
 'Artificial Neural Network': {'MAE': 51.41355254068389,
  'MSE': 10314.245120300402}}

In [23]:
result_final_with_test

{'Linear Regression': {'MAE': 20.17464641103979, 'MSE': 907.5637017883504},
 'Lasso': {'MAE': 19.630499926649197, 'MSE': 827.446194872528},
 'Ridge': {'MAE': 20.15102515583173, 'MSE': 899.627760582678},
 'SVR': {'MAE': 24.289714782673844, 'MSE': 1022.3100884082841},
 'Random Forest': {'MAE': 19.925688067490935, 'MSE': 860.415125118749},
 'Gradient Boosting': {'MAE': 14.206169892700725, 'MSE': 653.6444558714534},
 'Artificial Neural Network': {'MAE': 29.089758158721573,
  'MSE': 1716.4521812762798}}

In [21]:
# SVR with feature selection
result_final_with_test

{'Linear Regression': {'MAE': 20.17464641103979, 'MSE': 907.5637017883504},
 'Lasso': {'MAE': 19.630499926649197, 'MSE': 827.446194872528},
 'Ridge': {'MAE': 20.15102515583173, 'MSE': 899.627760582678},
 'SVR': {'MAE': 622.6378771377161, 'MSE': 389044.89117324405},
 'Random Forest': {'MAE': 19.925688067490935, 'MSE': 860.415125118749},
 'Gradient Boosting': {'MAE': 14.206169892700725, 'MSE': 653.6444558714534},
 'Artificial Neural Network': {'MAE': 29.089758158721573,
  'MSE': 1716.4521812762798}}

In [71]:
ln_results

{'MAE': 20.1076673770882, 'MSE': 902.1385041302448}

In [72]:
lasso_results

{'MAE': 14.695032409248478, 'MSE': 570.2595372331552}

In [73]:
ridge_results

{'MAE': 20.91790633288539, 'MSE': 955.1395825360881}

In [74]:
svr_results

{'MAE': 23.39522142225316, 'MSE': 1003.343949315722}

In [60]:
rf_results

{'MAE': 14.532470756135266, 'MSE': 676.9930176271231}

In [64]:
gb_results

{'MAE': 17.260081814945043, 'MSE': 721.9088243448939}

In [65]:
ann_results

{'MAE': 22.55849174892375, 'MSE': 1005.3902663530097}

In [None]:
# # Initialize a list to hold trips
# trips = []
# current_trip = [df_od_pairs.iloc[0]['origin']]  # Start with the first origin
# 
# # Iterate over the DataFrame rows
# for i, row in df_od_pairs.iterrows():
#     current_trip.append(row['destination'])  # Always add the destination
#     # Check if the next origin matches the current destination
#     if i + 1 < len(df_od_pairs) and row['destination'] != df_od_pairs.iloc[i + 1]['origin']:
#         # If it doesn't, the current trip has ended
#         trips.append(current_trip)
#         current_trip = [df_od_pairs.iloc[i + 1]['origin']]  # Start a new trip
# 
# # Add the last trip if it wasn't already added
# if current_trip not in trips:
#     trips.append(current_trip)


# from collections import Counter
# # Flatten the list of trips into a single list of nodes including origins and destinations
# all_nodes = [node for trip in trips for node in trip]
# 
# # Use Counter to count the occurrences of each node
# node_trip_counts = Counter(all_nodes)
# 
# df_node_trip_counts = pd.DataFrame.from_dict(node_trip_counts, orient='index').reset_index()
# df_node_trip_counts.columns = ['node_id', 'trip_amount']