In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, StratifiedKFold
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic
import numpy as np
from sklearn.feature_selection import RFE, SelectFromModel, RFECV
from sklearn.compose import ColumnTransformer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [2]:
def load_data(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)
            
            # Calculate time of go_to_work and go_to_sum
            df_act_work = df_activities[df_activities['activity_type_main']=='work'].drop(['end_time'], axis=1)
            df_act_work = df_act_work.merge(df_work, how='left', left_on=['x','y'], right_on=['work_x','work_y'])
            df_act_work.drop(['x','y'], axis=1, inplace=True)
            df_act_work_agg = df_act_work.groupby(by="link")['go_to_work'].sum().reset_index(drop=False)
            df_act_home = df_activities[df_activities['activity_type_main']=='home'].drop(['end_time'], axis=1)
            df_act_home = df_act_home.merge(df_home, how='left', left_on=['x','y'], right_on=['home_x','home_y'])
            df_act_home.drop(['x','y'], axis=1, inplace=True)
            df_act_home_agg = df_act_home.groupby(by="link")['go_to_home'].sum().reset_index(drop=False)
            df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link')
            df_act_agg.fillna(0, inplace=True)
            df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

            df_rushhr = df_activities[df_activities['end_time']!=-1]
            df_rushhr.loc[:, 'rush_hour'] = 0
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('08:00:00'), pd.to_timedelta('10:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('16:00:00'), pd.to_timedelta('19:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
            df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
            
            df_maxduragg = df_activities[df_activities['max_dur']!=-1].groupby(by='link')['max_dur'].sum().reset_index(drop=False)
            
            df_activities['cemdapStopDuration_s'] = df_activities['cemdapStopDuration_s'].astype(float)
            df_cemagg = df_activities[df_activities['cemdapStopDuration_s']!=-1].groupby(by='link')['cemdapStopDuration_s'].sum().reset_index(drop=False)
            
            df_temp = df_links.merge(df_links_network, how='left', on=['start_node_x','start_node_y','end_node_x','end_node_y'])
            df_temp = df_temp[['link_id_x','link_from','link_to','link_id_y','from', 'to', 'type']]
            df_temp = df_temp.merge(df_act_agg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_rushhragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_maxduragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_cemagg, how='left', left_on='link_id_y', right_on='link')
            df_temp.fillna({'cemdapStopDuration_s':-1, 'max_dur':-1, 'rush_hour': -1, 'go_to_sum': -1}, inplace=True)
            df_temp = df_temp[['link_id_x', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'type']]
            
            df_links = df_links.merge(df_temp, how='left', left_on='link_id', right_on='link_id_x')
            df_links.drop('link_id_x', axis=1, inplace=True)
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


In [5]:
df_train = []
for i in range(0, 10):
    train_files = f'Data/sparseWorlds/Train/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_links_network_{i}.pkl')
    train_data = load_data([train_files], df_activities, df_links_network)
    df_train.append(train_data)
train_data_all = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    validate_files = f'Data/sparseWorlds/Validate/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_links_network_{i}.pkl')
    validate_data = load_data([validate_files], df_activities, df_links_network)
    df_validate.append(validate_data)
validate_data_all = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    test_files = f'Data/sparseWorlds/Test/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_links_network_{i}.pkl')
    test_data = load_data([test_files], df_activities, df_links_network)
    df_test.append(test_data)
test_data_all = pd.concat(df_test, ignore_index=True)

Big_train_data = pd.concat([train_data_all, validate_data_all], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
  df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
  df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
  df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value i

In [11]:
numerical_features = ['link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s']
category_feature = ['type']
X_t = Big_train_data.drop(columns=['link_counts'])
y_t = Big_train_data['link_counts']
X_te = test_data.drop(columns=['link_counts'])
y_te = test_data['link_counts']

scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
X_t = ct.fit_transform(X_t) 
X_te = ct.fit_transform(X_te)

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': LassoCV(cv=kf),
    'Ridge': RidgeCV(cv=kf),
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Artificial Neural Network': MLPRegressor(),
#     'Gaussian Process Regression': GaussianProcessRegressor(kernel=RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0), alpha=0.1, n_restarts_optimizer=3)
}


In [13]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         mse = mean_squared_error(y_test, y_pred)
#         mae = mean_absolute_error(y_test, y_pred)
        mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
        mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Perform k-fold cross-validation and calculate MSE and MAE
        mse_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
        mae_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mae_scorer)

        # Display the mean MSE and MAE across folds
        mean_mse = -mse_scores.mean()
        mean_mae = -mae_scores.mean()
        std_mse = mse_scores.std()
        # mape = mean_absolute_percentage_error(y_test, y_pred)
        # r2 = r2_score(y_test, y_pred)
        print(name + " done")
        
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Function to train and evaluate models
def feature_select_models(models, X_train, y_train):
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    kf = KFold(n_splits=5, shuffle=True, random_state=42) 
    results = {}
    for name, model in models.items():
        if name not in ['SVR', 'Artificial Neural Network', 'Gaussian Process Regression']:
            selector = RFECV(model, step=1, cv=kf, scoring=mse_scorer).fit(X_train, y_train)
            print(f'{name} selection done')

        else:
            # Fit Random Forest to get feature importances
            rf = RandomForestRegressor()
            rf.fit(X_train, y_train)
            # Select features based on importances
            selector = RFECV(estimator=rf, step=1, cv=kf, scoring=mse_scorer).fit(X_train, y_train)
            print(f'{name} selection done')
            
        selected_features = X_train.columns[selector.support_]
        X_train_reduced = X_train[selected_features] 
        mse_scores = cross_val_score(model, X_train_reduced, y_train, cv=kf, scoring=mse_scorer)
        mse = -mse_scores.mean()
        mse_std = mse_scores.std()           
        mae_scores = cross_val_score(model, X_train_reduced, y_train, cv=kf, scoring=mae_scorer)
        mean_mae = -mae_scores.mean()

        results[name] = {'MAE': mean_mae, 'MSE': mse, 'MSE_std': mse_std, 'selected_feature': selected_features}

            
    return results

# Train and evaluate
results = evaluate_models(models, X_t, y_t)

# results_feature = feature_select_models(models, X_t, y_t)


Linear Regression done
Lasso done
Ridge done
SVR done
Random Forest done
Gradient Boosting done
Artificial Neural Network done


In [15]:
X_t

Unnamed: 0,num_preprocess__link_length,num_preprocess__link_freespeed,num_preprocess__link_capacity,num_preprocess__link_permlanes,num_preprocess__start_node_x,num_preprocess__start_node_y,num_preprocess__end_node_x,num_preprocess__end_node_y,num_preprocess__start_count,num_preprocess__end_count,...,text_preprocess__type_primary_link,text_preprocess__type_residential,text_preprocess__type_secondary,text_preprocess__type_secondary_link,text_preprocess__type_tertiary,text_preprocess__type_trunk,text_preprocess__type_unclassified,remainder__link_id,remainder__link_from,remainder__link_to
0,-0.736842,-0.06896,0.409756,0.532350,-0.505438,-0.478713,-0.509033,-0.474096,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,644,369
1,-0.799751,-0.06896,-0.390171,-0.776035,-0.707842,1.846523,-0.708528,1.852005,-0.152384,-0.152953,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1529,1042
2,-0.799626,-0.06896,0.009792,-0.776035,-0.711397,1.850141,-0.710658,1.846278,-0.152384,-0.152953,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1535,1528
3,-0.882597,-0.06896,0.009792,-0.776035,-0.633512,1.937923,-0.633738,1.936990,-0.152384,-0.152953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1534,171
4,0.092928,-0.06896,0.409756,0.532350,-0.630003,1.939947,-0.621308,1.975206,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,1531,1411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29520,-0.064155,-0.06896,2.009608,3.149120,0.985277,-0.107835,1.003074,-0.122228,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1936,1127,1142
29521,1.141428,-0.06896,-0.490161,0.532350,-0.167125,-1.586140,-0.205892,-1.542719,-0.152384,-0.152953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1937,777,1252
29522,-0.820076,-0.06896,-0.390171,-0.776035,-0.341127,0.497685,-0.338478,0.498576,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1938,1405,828
29523,-0.574670,-0.06896,-0.390171,-0.776035,0.230536,-1.083409,0.222721,-1.083068,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1939,290,317


In [14]:
# before hyperparametertuning
results

{'Linear Regression': {'MAE': 4.693113446842107,
  'MSE': 55.849006153897605,
  'MSE_std': 2.432367097366999},
 'Lasso': {'MAE': 4.73523872550065,
  'MSE': 56.36714670885806,
  'MSE_std': 2.4722245661994404},
 'Ridge': {'MAE': 4.692299720318795,
  'MSE': 55.84925513445673,
  'MSE_std': 2.4336838277175006},
 'SVR': {'MAE': 3.796789836594848,
  'MSE': 66.91232244253311,
  'MSE_std': 3.0398415798701195},
 'Random Forest': {'MAE': 3.8049070279424213,
  'MSE': 45.3557022895851,
  'MSE_std': 1.9916994575650346},
 'Gradient Boosting': {'MAE': 4.435219723809254,
  'MSE': 51.70751836160465,
  'MSE_std': 2.3147557241346983},
 'Artificial Neural Network': {'MAE': 5.1972694271238336,
  'MSE': 65.93310931246707,
  'MSE_std': 16.515316185356948}}

In [45]:
import pickle
with open('result_after_featureselection(wo gpr).pkl', 'wb') as file:
    pickle.dump(results_feature, file)

In [44]:
# after hyperparametertuning
results

{'Linear Regression': {'MAE': 19.000541966353587, 'MSE': 1350.56069460534},
 'Lasso': {'MAE': 22.610703312854127, 'MSE': 1679.4760262080995},
 'Ridge': {'MAE': 18.989592790164192, 'MSE': 1349.7480663335148},
 'SVR': {'MAE': 23.793351948629688, 'MSE': 1756.1078902638878},
 'Random Forest': {'MAE': 17.635830260141653, 'MSE': 2015.2344103327919},
 'Gradient Boosting': {'MAE': 15.969274342418169, 'MSE': 1596.9989870917648},
 'Artificial Neural Network': {'MAE': 21.345942437519778,
  'MSE': 1888.5103715269568}}

In [45]:
results_feature

{'Linear Regression': {'MAE': 19.47148762371669,
  'MSE': 874.7590684153923,
  'selected_feature': Index(['link_length', 'link_freespeed', 'link_capacity', 'link_permlanes',
         'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y',
         'start_count'],
        dtype='object')},
 'Lasso': {'MAE': 21.29080224423866,
  'MSE': 995.8290100956449,
  'selected_feature': Index(['link_id', 'link_permlanes', 'start_node_x', 'start_node_y',
         'end_node_x', 'end_node_y', 'start_count', 'end_count', 'go_to_sum'],
        dtype='object')},
 'Ridge': {'MAE': 18.855381894067843,
  'MSE': 831.7957770846882,
  'selected_feature': Index(['link_length', 'link_freespeed', 'link_capacity', 'link_permlanes',
         'start_node_x', 'start_node_y', 'end_node_y', 'start_count',
         'end_count'],
        dtype='object')},
 'SVR': {'MAE': 21.113999718501606,
  'MSE': 998.1921690104024,
  'selected_feature': Index(['link_id', 'link_length', 'link_freespeed', 'link_capacity',
         '

In [8]:
param_grid_svr = {
    'C': reciprocal(1e-4, 1e3),  # Extended range for the regularization parameter
    'gamma': reciprocal(1e-4, 1e2),  # Including specific gamma values
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Focusing on RBF kernel
    'epsilon': [0.01, 0.1, 0.2],  # Epsilon in the epsilon-SVR model
}

mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
random_search_svr = RandomizedSearchCV(SVR(max_iter=2000), param_grid_svr, n_iter=80, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
random_search_svr.fit(X_t, y_t)
print(random_search_svr.best_params_)
print(random_search_svr.best_estimator_)
print(random_search_svr.best_score_)

Fitting 3 folds for each of 70 candidates, totalling 210 fits


 -2.11726699e+03 -1.78346984e+03 -1.78347029e+03 -1.78347029e+03
             nan             nan -6.24658420e+19 -6.10801167e+12
 -1.96408398e+16 -1.78347029e+03             nan -1.78347029e+03
 -1.78347029e+03 -5.64219848e+20             nan -5.87544447e+21
             nan -1.78347029e+03 -1.78347029e+03             nan
 -1.78347029e+03             nan -4.64578815e+15 -1.77140788e+03
             nan -1.78347029e+03 -1.18411241e+16 -1.92220132e+03
 -1.78347029e+03 -1.78346995e+03             nan -2.11002684e+03
 -1.07400560e+17             nan             nan -1.78347029e+03
 -2.04274595e+03 -1.78347029e+03 -1.78347028e+03 -1.78347029e+03
 -1.78347029e+03 -1.78346908e+03 -1.77144765e+03 -1.78204420e+03
 -1.78347029e+03 -1.78347029e+03 -1.78347029e+03 -1.78070475e+03
 -2.11794703e+69 -1.78347029e+03 -1.78347029e+03 -2.47674823e+13
 -2.09596395e+22 -7.27577476e+20 -1.78347029e+03 -1.78347029e+03
 -5.24770013e+17 -2.66758547e+48             nan -8.75319761e+11
 -1.78347029e+03 -8.04400

{'C': 25.383309585489613, 'epsilon': 0.01, 'gamma': 2.1969677491639246, 'kernel': 'rbf'}
SVR(C=25.383309585489613, epsilon=0.01, gamma=2.1969677491639246, max_iter=2000)




In [8]:
param_grid_rf = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'criterion':['friedman_mse']
}
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_rf.fit(X_t, y_t)

print(grid_search_rf.best_params_)
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_score_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
{'criterion': 'friedman_mse', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RandomForestRegressor(criterion='friedman_mse', max_depth=5, n_estimators=200,
                      random_state=42)


In [23]:
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],  # Varied learning rates for gradient boosting
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'subsample': [0.8, 1.0],  # Fraction of samples to be used for fitting individual base learners
}

mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_gb.fit(X_t, y_t)
print(grid_search_gb.best_params_)
print(grid_search_gb.best_estimator_)
print(grid_search_gb.best_score_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


KeyboardInterrupt: 

In [33]:
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (30, 30, 30)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
}
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_mlp = GridSearchCV(MLPRegressor(max_iter=2000, random_state=42), param_grid_mlp, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_mlp.fit(X_t, y_t)
print(grid_search_mlp.best_params_)
print(grid_search_mlp.best_estimator_)
print(grid_search_mlp.best_score_)

Fitting 2 folds for each of 60 candidates, totalling 120 fits


 -1.83398930e+03 -1.31004265e+03 -1.87286169e+03 -1.15885216e+03
 -1.84286007e+03 -1.26657990e+03 -1.85422786e+03 -1.13490388e+03
 -1.81175536e+03 -1.14374680e+03 -1.82734857e+03 -1.15570608e+03
 -1.83311349e+03 -1.14598088e+03 -1.84129863e+03 -1.23583309e+03
 -1.80666932e+03 -1.13699446e+03 -1.83771222e+03 -1.14234487e+03
 -1.84250819e+03 -1.29638924e+03 -1.83180786e+03 -1.16942667e+03
 -1.84268122e+03 -1.21379560e+03 -8.48523766e+20 -1.16028209e+03
 -5.01611135e+12 -1.19183664e+03             nan -1.23202037e+03
             nan -1.26948358e+03             nan -1.23640000e+03
 -8.48523937e+20 -1.25983731e+03 -5.01609670e+12 -1.24090603e+03
             nan -1.29318375e+03             nan -1.35751659e+03
             nan -1.20704770e+03 -8.48525641e+20 -1.11242170e+03
 -5.01595022e+12 -1.18502254e+03             nan -1.22700677e+03
             nan -1.22550436e+03             nan -1.23931689e+03]


{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'solver': 'adam'}
MLPRegressor(alpha=0.01, hidden_layer_sizes=(50,), max_iter=2000,
             random_state=42)
5739.742620293232 33.459636948944656


In [35]:
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct
import numpy as np
param_grid = {
    'kernel': [ConstantKernel (1.0, (1e-1, 1e1)) * RBF(1.0, (1e-2, 1e2))],
    'alpha': [ 1e-2, 0.1, 1.0]
}

gpr = GaussianProcessRegressor(copy_X_train=False)

# Initialize GridSearchCV
grid_search_gpr = RandomizedSearchCV(gpr, param_grid, n_iter=5, cv=0, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)
grid_search_gpr.fit(X_t, y_t)

print(grid_search_gpr.best_params_)
print(grid_search_gpr.best_estimator_)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=0.

In [59]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # mape = mean_absolute_percentage_error(y_test, y_pred)
    # r2 = r2_score(y_test, y_pred)
      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [66]:
ln_model = models['Linear Regression']
ln_X_t = X_t[results_feature['Linear Regression']['selected_feature']]
ln_X_te = X_te[results_feature['Linear Regression']['selected_feature']]

ln_results = evaluate_models_with_test(ln_model, ln_X_t, y_t, ln_X_te, y_te)

In [67]:
lasso_model = models['Lasso']
lasso_X_t = X_t[results_feature['Lasso']['selected_feature']]
lasso_X_te = X_te[results_feature['Lasso']['selected_feature']]

lasso_results = evaluate_models_with_test(lasso_model, lasso_X_t, y_t, lasso_X_te, y_te)

In [68]:
ridge_model = models['Ridge']
ridge_X_t = X_t[results_feature['Ridge']['selected_feature']]
ridge_X_te = X_te[results_feature['Ridge']['selected_feature']]

ridge_results = evaluate_models_with_test(ridge_model, ridge_X_t, y_t, ridge_X_te, y_te)

In [69]:
svr_model = models['SVR']
svr_X_t = X_t[results_feature['SVR']['selected_feature']]
svr_X_te = X_te[results_feature['SVR']['selected_feature']]

svr_results = evaluate_models_with_test(svr_model, svr_X_t, y_t, svr_X_te, y_te)



In [70]:
rf_model = models['Random Forest']
rf_X_t = X_t[results_feature['Random Forest']['selected_feature']]
rf_X_te = X_te[results_feature['Random Forest']['selected_feature']]

rf_results = evaluate_models_with_test(rf_model, rf_X_t, y_t, rf_X_te, y_te)

In [62]:
gb_model = models['Gradient Boosting']
gb_X_t = X_t[results_feature['Gradient Boosting']['selected_feature']]
gb_X_te = X_te[results_feature['Gradient Boosting']['selected_feature']]

gb_results = evaluate_models_with_test(gb_model, gb_X_t, y_t, gb_X_te, y_te)

In [63]:
ann_model = models['Artificial Neural Network']
ann_X_t = X_t[results_feature['Artificial Neural Network']['selected_feature']]
ann_X_te = X_te[results_feature['Artificial Neural Network']['selected_feature']]

ann_results = evaluate_models_with_test(ann_model, ann_X_t, y_t, ann_X_te, y_te)

In [71]:
ln_results

{'MAE': 20.1076673770882, 'MSE': 902.1385041302448}

In [72]:
lasso_results

{'MAE': 14.695032409248478, 'MSE': 570.2595372331552}

In [73]:
ridge_results

{'MAE': 20.91790633288539, 'MSE': 955.1395825360881}

In [74]:
svr_results

{'MAE': 23.39522142225316, 'MSE': 1003.343949315722}

In [60]:
rf_results

{'MAE': 14.532470756135266, 'MSE': 676.9930176271231}

In [64]:
gb_results

{'MAE': 17.260081814945043, 'MSE': 721.9088243448939}

In [65]:
ann_results

{'MAE': 22.55849174892375, 'MSE': 1005.3902663530097}

In [None]:
# # Initialize a list to hold trips
# trips = []
# current_trip = [df_od_pairs.iloc[0]['origin']]  # Start with the first origin
# 
# # Iterate over the DataFrame rows
# for i, row in df_od_pairs.iterrows():
#     current_trip.append(row['destination'])  # Always add the destination
#     # Check if the next origin matches the current destination
#     if i + 1 < len(df_od_pairs) and row['destination'] != df_od_pairs.iloc[i + 1]['origin']:
#         # If it doesn't, the current trip has ended
#         trips.append(current_trip)
#         current_trip = [df_od_pairs.iloc[i + 1]['origin']]  # Start a new trip
# 
# # Add the last trip if it wasn't already added
# if current_trip not in trips:
#     trips.append(current_trip)


# from collections import Counter
# # Flatten the list of trips into a single list of nodes including origins and destinations
# all_nodes = [node for trip in trips for node in trip]
# 
# # Use Counter to count the occurrences of each node
# node_trip_counts = Counter(all_nodes)
# 
# df_node_trip_counts = pd.DataFrame.from_dict(node_trip_counts, orient='index').reset_index()
# df_node_trip_counts.columns = ['node_id', 'trip_amount']