In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, StratifiedKFold
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic
import numpy as np
from sklearn.feature_selection import RFE, SelectFromModel, RFECV
from sklearn.compose import ColumnTransformer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [2]:
def load_data(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)
            
            # Calculate time of go_to_work and go_to_sum
            df_act_work = df_activities[df_activities['activity_type_main']=='work'].drop(['end_time'], axis=1)
            df_act_work = df_act_work.merge(df_work, how='left', left_on=['x','y'], right_on=['work_x','work_y'])
            df_act_work.drop(['x','y'], axis=1, inplace=True)
            df_act_work_agg = df_act_work.groupby(by="link")['go_to_work'].sum().reset_index(drop=False)
            df_act_home = df_activities[df_activities['activity_type_main']=='home'].drop(['end_time'], axis=1)
            df_act_home = df_act_home.merge(df_home, how='left', left_on=['x','y'], right_on=['home_x','home_y'])
            df_act_home.drop(['x','y'], axis=1, inplace=True)
            df_act_home_agg = df_act_home.groupby(by="link")['go_to_home'].sum().reset_index(drop=False)
            df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link')
            df_act_agg.fillna(0, inplace=True)
            df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

            df_rushhr = df_activities[df_activities['end_time']!=-1]
            df_rushhr.loc[:, 'rush_hour'] = 0
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('08:00:00'), pd.to_timedelta('10:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.loc[df_rushhr['end_time'].between(pd.to_timedelta('16:00:00'), pd.to_timedelta('19:00:00'), inclusive='both'), 'rush_hour'] = 1
            df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
            df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
            
            df_maxduragg = df_activities[df_activities['max_dur']!=-1].groupby(by='link')['max_dur'].sum().reset_index(drop=False)
            
            df_activities['cemdapStopDuration_s'] = df_activities['cemdapStopDuration_s'].astype(float)
            df_cemagg = df_activities[df_activities['cemdapStopDuration_s']!=-1].groupby(by='link')['cemdapStopDuration_s'].sum().reset_index(drop=False)
            
            df_temp = df_links.merge(df_links_network, how='left', on=['start_node_x','start_node_y','end_node_x','end_node_y'])
            df_temp = df_temp[['link_id_x','link_from','link_to','link_id_y','from', 'to', 'type']]
            df_temp = df_temp.merge(df_act_agg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_rushhragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_maxduragg, how='left', left_on='link_id_y', right_on='link')
            df_temp.drop('link', axis=1, inplace=True)
            df_temp = df_temp.merge(df_cemagg, how='left', left_on='link_id_y', right_on='link')
            df_temp.fillna({'cemdapStopDuration_s':-1, 'max_dur':-1, 'rush_hour': -1, 'go_to_sum': -1}, inplace=True)
            df_temp = df_temp[['link_id_x', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s', 'type']]
            
            df_links = df_links.merge(df_temp, how='left', left_on='link_id', right_on='link_id_x')
            df_links.drop('link_id_x', axis=1, inplace=True)
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)


In [3]:
df_train = []
for i in range(0, 10):
    train_files = f'Data/sparseWorlds/Train/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Train/po-1/df_links_network_{i}.pkl')
    train_data = load_data([train_files], df_activities, df_links_network)
    df_train.append(train_data)
train_data_all = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    validate_files = f'Data/sparseWorlds/Validate/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Validate/po-1/df_links_network_{i}.pkl')
    validate_data = load_data([validate_files], df_activities, df_links_network)
    df_validate.append(validate_data)
validate_data_all = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    test_files = f'Data/sparseWorlds/Test/po-1/s-{i}.json'
    df_activities = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_activities_{i}.pkl')
    df_links_network = pd.read_pickle(f'Data/sparseWorlds/Test/po-1/df_links_network_{i}.pkl')
    test_data = load_data([test_files], df_activities, df_links_network)
    df_test.append(test_data)
test_data_all = pd.concat(df_test, ignore_index=True)

Big_train_data = pd.concat([train_data_all, validate_data_all], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
  df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
  df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.drop(['end_time', 'max_dur', 'zoneId', 'cemdapStopDuration_s'], axis=1, inplace=True)
  df_rushhragg = df_rushhr.groupby(by="link").sum()['rush_hour'].reset_index(drop=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rushhr.loc[:, 'rush_hour'] = 0
A value i

In [4]:
numerical_features = ['link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'start_count', 'end_count', 'go_to_sum', 'rush_hour', 'max_dur', 'cemdapStopDuration_s']
category_feature = ['type']
X_t = Big_train_data.drop(columns=['link_counts'])
y_t = Big_train_data['link_counts']
X_te = test_data.drop(columns=['link_counts'])
y_te = test_data['link_counts']

scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False)
ct = ColumnTransformer(
     [("num_preprocess", scaler, numerical_features),
      ("text_preprocess", ohe, category_feature)], remainder='passthrough').set_output(transform="pandas")
X_t = ct.fit_transform(X_t) 
X_te = ct.fit_transform(X_te)

In [20]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': LassoCV(cv=kf, random_state=42, max_iter=100000),
    'Ridge': RidgeCV(cv=kf),
    'SVR': SVR(C=0.01, kernel='sigmoid', max_iter=2000),
    'Random Forest': RandomForestRegressor(criterion='friedman_mse', max_depth=20,
                      min_samples_leaf=2, n_estimators=200, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(max_depth=10, min_samples_leaf=4, random_state=42),
    'Artificial Neural Network': MLPRegressor(activation='tanh', alpha=0.001, hidden_layer_sizes=(100, 100),
             max_iter=2000, random_state=42),
#     'Gaussian Process Regression': GaussianProcessRegressor(kernel=RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0), alpha=0.1, n_restarts_optimizer=3)
}


In [21]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         mse = mean_squared_error(y_test, y_pred)
#         mae = mean_absolute_error(y_test, y_pred)
        mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
        mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Perform k-fold cross-validation and calculate MSE and MAE
        mse_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
        mae_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mae_scorer)

        # Display the mean MSE and MAE across folds
        mean_mse = -mse_scores.mean()
        mean_mae = -mae_scores.mean()
        std_mse = mse_scores.std()
        # mape = mean_absolute_percentage_error(y_test, y_pred)
        # r2 = r2_score(y_test, y_pred)
        print(name + " done")
        
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Function to train and evaluate models
def feature_select_models(models, X_train, y_train):
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    kf = KFold(n_splits=5, shuffle=True, random_state=42) 
    results = {}
    for name, model in models.items():
        if name not in ['SVR', 'Artificial Neural Network', 'Gaussian Process Regression']:
            selector = RFECV(model, step=1, cv=kf, scoring=mse_scorer).fit(X_train, y_train)
            print(f'{name} selection done')

        else:
            # Fit Random Forest to get feature importances
            rf = RandomForestRegressor()
            rf.fit(X_train, y_train)
            # Select features based on importances
            selector = RFECV(estimator=rf, step=1, cv=kf, scoring=mse_scorer).fit(X_train, y_train)
            print(f'{name} selection done')
            
        selected_features = X_train.columns[selector.support_]
        X_train_reduced = X_train[selected_features] 
        mse_scores = cross_val_score(model, X_train_reduced, y_train, cv=kf, scoring=mse_scorer)
        mse = -mse_scores.mean()
        mse_std = mse_scores.std()           
        mae_scores = cross_val_score(model, X_train_reduced, y_train, cv=kf, scoring=mae_scorer)
        mean_mae = -mae_scores.mean()

        results[name] = {'MAE': mean_mae, 'MSE': mse, 'MSE_std': mse_std, 'selected_feature': selected_features}

            
    return results

# Train and evaluate
# results = evaluate_models(models, X_t, y_t)

results_feature = feature_select_models(models, X_t, y_t)


Linear Regression selection done
Lasso selection done
Ridge selection done
SVR selection done




Random Forest selection done
Gradient Boosting selection done
Artificial Neural Network selection done


In [14]:
# before hyperparametertuning
results

{'Linear Regression': {'MAE': 4.693113446842107,
  'MSE': 55.849006153897605,
  'MSE_std': 2.432367097366999},
 'Lasso': {'MAE': 4.73523872550065,
  'MSE': 56.36714670885806,
  'MSE_std': 2.4722245661994404},
 'Ridge': {'MAE': 4.692299720318795,
  'MSE': 55.84925513445673,
  'MSE_std': 2.4336838277175006},
 'SVR': {'MAE': 3.796789836594848,
  'MSE': 66.91232244253311,
  'MSE_std': 3.0398415798701195},
 'Random Forest': {'MAE': 3.8049070279424213,
  'MSE': 45.3557022895851,
  'MSE_std': 1.9916994575650346},
 'Gradient Boosting': {'MAE': 4.435219723809254,
  'MSE': 51.70751836160465,
  'MSE_std': 2.3147557241346983},
 'Artificial Neural Network': {'MAE': 5.1972694271238336,
  'MSE': 65.93310931246707,
  'MSE_std': 16.515316185356948}}

In [23]:
import pickle
with open('result_sparse_after_featureselection(wo gpr).pkl', 'wb') as file:
    pickle.dump(results_feature, file)

In [18]:
# after hyperparametertuning
results

{'Linear Regression': {'MAE': 4.693113446842107,
  'MSE': 55.849006153897605,
  'MSE_std': 2.432367097366999},
 'Lasso': {'MAE': 4.73523872550065,
  'MSE': 56.36714670885806,
  'MSE_std': 2.4722245661994404},
 'Ridge': {'MAE': 4.692299720318795,
  'MSE': 55.84925513445673,
  'MSE_std': 2.4336838277175006},
 'SVR': {'MAE': 5.977155856752495,
  'MSE': 61.98269569332149,
  'MSE_std': 2.0059101781075888},
 'Random Forest': {'MAE': 3.7669767167115547,
  'MSE': 43.49061871205631,
  'MSE_std': 2.233419838225602},
 'Gradient Boosting': {'MAE': 3.8769150608008958,
  'MSE': 44.29968885637518,
  'MSE_std': 2.00551870212844},
 'Artificial Neural Network': {'MAE': 4.755552312927051,
  'MSE': 57.868662954965124,
  'MSE_std': 2.4034977982000765}}

In [22]:
results_feature

{'Linear Regression': {'MAE': 4.692953454109057,
  'MSE': 55.847603502380174,
  'MSE_std': 2.436216250739268,
  'selected_feature': Index(['num_preprocess__link_length', 'num_preprocess__link_freespeed',
         'num_preprocess__link_capacity', 'num_preprocess__link_permlanes',
         'num_preprocess__start_node_x', 'num_preprocess__start_node_y',
         'num_preprocess__end_node_x', 'num_preprocess__end_node_y',
         'num_preprocess__start_count', 'num_preprocess__end_count',
         'num_preprocess__go_to_sum', 'num_preprocess__rush_hour',
         'num_preprocess__max_dur', 'num_preprocess__cemdapStopDuration_s',
         'text_preprocess__type_motorway', 'text_preprocess__type_motorway_link',
         'text_preprocess__type_primary', 'text_preprocess__type_primary_link',
         'text_preprocess__type_residential', 'text_preprocess__type_secondary',
         'text_preprocess__type_secondary_link',
         'text_preprocess__type_tertiary', 'text_preprocess__type_trunk',


In [15]:
param_grid_svr = {
    'C': [0.01, 0.1, 1, 10],  # Extended range for the regularization parameter
    'gamma': ['scale', 'auto'],  # Including specific gamma values
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Focusing on RBF kernel
    'epsilon': [0.01, 0.1, 0.2],  # Epsilon in the epsilon-SVR model
}

mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
random_search_svr = GridSearchCV(SVR(max_iter=2000), param_grid_svr, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
random_search_svr.fit(X_t, y_t)
print(random_search_svr.best_params_)
print(random_search_svr.best_estimator_)
print(random_search_svr.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
{'C': 0.01, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
SVR(C=0.01, kernel='sigmoid', max_iter=2000)
-61.98269569332149




In [6]:
param_grid_rf = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'criterion':['friedman_mse']
}
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_rf.fit(X_t, y_t)

print(grid_search_rf.best_params_)
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'criterion': 'friedman_mse', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
RandomForestRegressor(criterion='friedman_mse', max_depth=20,
                      min_samples_leaf=2, n_estimators=200, random_state=42)
-43.49061871205631


In [7]:
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],  # Varied learning rates for gradient boosting
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'subsample': [0.8, 1.0],  # Fraction of samples to be used for fitting individual base learners
}

mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_gb.fit(X_t, y_t)
print(grid_search_gb.best_params_)
print(grid_search_gb.best_estimator_)
print(grid_search_gb.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'learning_rate': 0.1, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 1.0}
GradientBoostingRegressor(max_depth=10, min_samples_leaf=4, random_state=42)
-44.29968885637518


In [8]:
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (30, 30, 30)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
}
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_mlp = GridSearchCV(MLPRegressor(max_iter=2000, random_state=42), param_grid_mlp, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_mlp.fit(X_t, y_t)
print(grid_search_mlp.best_params_)
print(grid_search_mlp.best_estimator_)
print(grid_search_mlp.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


24 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 753, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 496, i

{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100), 'solver': 'adam'}
MLPRegressor(activation='tanh', alpha=0.001, hidden_layer_sizes=(100, 100),
             max_iter=2000, random_state=42)
-57.74607022008577


In [35]:
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct
import numpy as np
param_grid = {
    'kernel': [ConstantKernel (1.0, (1e-1, 1e1)) * RBF(1.0, (1e-2, 1e2))],
    'alpha': [ 1e-2, 0.1, 1.0]
}

gpr = GaussianProcessRegressor(copy_X_train=False)

# Initialize GridSearchCV
grid_search_gpr = RandomizedSearchCV(gpr, param_grid, n_iter=5, cv=0, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)
grid_search_gpr.fit(X_t, y_t)

print(grid_search_gpr.best_params_)
print(grid_search_gpr.best_estimator_)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=0.

In [24]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # mape = mean_absolute_percentage_error(y_test, y_pred)
    # r2 = r2_score(y_test, y_pred)
      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [29]:
X_te['text_preprocess__type_trunk']=0.0
X_te

Unnamed: 0,num_preprocess__link_length,num_preprocess__link_freespeed,num_preprocess__link_capacity,num_preprocess__link_permlanes,num_preprocess__start_node_x,num_preprocess__start_node_y,num_preprocess__end_node_x,num_preprocess__end_node_y,num_preprocess__start_count,num_preprocess__end_count,...,text_preprocess__type_primary_link,text_preprocess__type_residential,text_preprocess__type_secondary,text_preprocess__type_secondary_link,text_preprocess__type_tertiary,text_preprocess__type_unclassified,remainder__link_id,remainder__link_from,remainder__link_to,text_preprocess__type_trunk
0,-0.719399,-0.071818,0.373713,0.546057,-0.513968,-0.829322,-0.518060,-0.823897,-0.133832,-0.136994,...,0.0,0.0,1.0,0.0,0.0,0.0,0,562,372,0.0
1,-0.783520,-0.071818,-0.414415,-0.748252,-0.750814,1.583563,-0.751500,1.591151,-0.133832,-0.136994,...,0.0,0.0,0.0,1.0,0.0,0.0,1,1531,869,0.0
2,-0.783392,-0.071818,-0.020351,-0.748252,-0.754973,1.587318,-0.753992,1.585206,-0.133832,-0.136994,...,1.0,0.0,0.0,0.0,0.0,0.0,2,1537,1530,0.0
3,-0.867961,-0.071818,-0.020351,-0.748252,-0.663835,1.678409,-0.663984,1.679387,-0.133832,-0.136994,...,0.0,0.0,0.0,0.0,0.0,0.0,3,1536,181,0.0
4,0.126348,-0.071818,0.373713,0.546057,-0.659729,1.680508,-0.649440,1.719064,-0.133832,-0.136994,...,0.0,0.0,1.0,0.0,0.0,0.0,4,1533,1424,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941,0.230741,-0.071818,-1.202543,-0.748252,-0.530309,-1.625632,-0.532439,-1.582642,-0.133832,-0.136994,...,0.0,0.0,1.0,0.0,0.0,0.0,1941,1626,613,0.0
1942,0.213312,-0.071818,-0.512931,0.546057,0.773616,1.355518,0.803640,1.354697,-0.133832,-0.136994,...,0.0,0.0,0.0,0.0,0.0,0.0,1942,1183,1575,0.0
1943,-0.204226,-0.071818,1.161841,1.840366,1.261386,-0.430346,1.252985,-0.452120,-0.133832,-0.136994,...,0.0,0.0,1.0,0.0,0.0,0.0,1943,1341,1080,0.0
1944,-0.033759,-0.071818,1.949968,3.134676,1.230410,-0.444465,1.251337,-0.458574,-0.133832,8.651697,...,0.0,0.0,1.0,0.0,0.0,0.0,1944,1081,1103,0.0


In [30]:
X_t

Unnamed: 0,num_preprocess__link_length,num_preprocess__link_freespeed,num_preprocess__link_capacity,num_preprocess__link_permlanes,num_preprocess__start_node_x,num_preprocess__start_node_y,num_preprocess__end_node_x,num_preprocess__end_node_y,num_preprocess__start_count,num_preprocess__end_count,...,text_preprocess__type_primary_link,text_preprocess__type_residential,text_preprocess__type_secondary,text_preprocess__type_secondary_link,text_preprocess__type_tertiary,text_preprocess__type_trunk,text_preprocess__type_unclassified,remainder__link_id,remainder__link_from,remainder__link_to
0,-0.736842,-0.06896,0.409756,0.532350,-0.505438,-0.478713,-0.509033,-0.474096,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,644,369
1,-0.799751,-0.06896,-0.390171,-0.776035,-0.707842,1.846523,-0.708528,1.852005,-0.152384,-0.152953,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,1529,1042
2,-0.799626,-0.06896,0.009792,-0.776035,-0.711397,1.850141,-0.710658,1.846278,-0.152384,-0.152953,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1535,1528
3,-0.882597,-0.06896,0.009792,-0.776035,-0.633512,1.937923,-0.633738,1.936990,-0.152384,-0.152953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,1534,171
4,0.092928,-0.06896,0.409756,0.532350,-0.630003,1.939947,-0.621308,1.975206,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4,1531,1411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29520,-0.064155,-0.06896,2.009608,3.149120,0.985277,-0.107835,1.003074,-0.122228,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1936,1127,1142
29521,1.141428,-0.06896,-0.490161,0.532350,-0.167125,-1.586140,-0.205892,-1.542719,-0.152384,-0.152953,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1937,777,1252
29522,-0.820076,-0.06896,-0.390171,-0.776035,-0.341127,0.497685,-0.338478,0.498576,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1938,1405,828
29523,-0.574670,-0.06896,-0.390171,-0.776035,0.230536,-1.083409,0.222721,-1.083068,-0.152384,-0.152953,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1939,290,317


In [31]:
import pickle

result_final_with_test = {}
for name, model in models.items():
    X_t_reduced = X_t[results_feature[name]['selected_feature']]
    X_te_reduced = X_te[results_feature[name]['selected_feature']]
    result_final_with_test[name] = evaluate_models_with_test(model, X_t_reduced, y_t, X_te_reduced, y_te)
result_final_with_test
with open('result_sparse_final(wo gpr).pkl', 'wb') as file:
    pickle.dump(result_final_with_test, file)



In [32]:
result_final_with_test

{'Linear Regression': {'MAE': 3.080183348962724, 'MSE': 12.419092563002643},
 'Lasso': {'MAE': 3.1106409525709178, 'MSE': 12.503019186623643},
 'Ridge': {'MAE': 3.0806624598898042, 'MSE': 12.423992202135015},
 'SVR': {'MAE': 6.442003396245811, 'MSE': 45.04902062571267},
 'Random Forest': {'MAE': 3.082777998593182, 'MSE': 17.22896160836622},
 'Gradient Boosting': {'MAE': 3.0093110252525332, 'MSE': 16.819822124756406},
 'Artificial Neural Network': {'MAE': 3.263439721629703,
  'MSE': 13.186400138371253}}

In [None]:
# # Initialize a list to hold trips
# trips = []
# current_trip = [df_od_pairs.iloc[0]['origin']]  # Start with the first origin
# 
# # Iterate over the DataFrame rows
# for i, row in df_od_pairs.iterrows():
#     current_trip.append(row['destination'])  # Always add the destination
#     # Check if the next origin matches the current destination
#     if i + 1 < len(df_od_pairs) and row['destination'] != df_od_pairs.iloc[i + 1]['origin']:
#         # If it doesn't, the current trip has ended
#         trips.append(current_trip)
#         current_trip = [df_od_pairs.iloc[i + 1]['origin']]  # Start a new trip
# 
# # Add the last trip if it wasn't already added
# if current_trip not in trips:
#     trips.append(current_trip)


# from collections import Counter
# # Flatten the list of trips into a single list of nodes including origins and destinations
# all_nodes = [node for trip in trips for node in trip]
# 
# # Use Counter to count the occurrences of each node
# node_trip_counts = Counter(all_nodes)
# 
# df_node_trip_counts = pd.DataFrame.from_dict(node_trip_counts, orient='index').reset_index()
# df_node_trip_counts.columns = ['node_id', 'trip_amount']