In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, RandomizedSearchCV, StratifiedKFold
from scipy.stats import expon, reciprocal, uniform
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ExpSineSquared, RationalQuadratic
import numpy as np
from sklearn.feature_selection import RFE, SelectFromModel, RFECV
from sklearn.compose import ColumnTransformer
from mango import Tuner, scheduler

In [2]:
def load_data_small(file_list, df_activities, df_links_network):
    data_frames = []
    for file in file_list:
        with open(file, 'r') as f:
            data = json.load(f)
            if isinstance(data['link_counts'], dict):
                data['link_counts'] = data['link_counts'].values()
            df_links = pd.DataFrame({
                'link_id': data['links_id'],
                'link_from': data['link_from'],
                'link_to': data['link_to'],
                'link_length': data['link_length'],
                'link_freespeed': data['link_freespeed'],
                'link_capacity': data['link_capacity'],
                'link_permlanes': data['link_permlanes'],
                'link_counts': data['link_counts']
            })
            df_nodes = pd.DataFrame({
                'node_id': data['nodes_id'],
                'node_x': data['nodes_x'],
                'node_y': data['nodes_y']
            })
            df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
            
            df_work = pd.DataFrame({
                        'work_x': data['work_x'],
                        'work_y': data['work_y'],
                        'go_to_work': data['go_to_work']
            })
            df_home = pd.DataFrame({
                'home_x': data['home_x'],
                'home_y': data['home_y'],
                'go_to_home': data['go_to_home']
            })
            
            df_links = df_links.merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            df_links.drop('node_id', axis=1, inplace=True)
            df_links = df_links.merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            df_links = df_links.rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            df_links.drop('node_id', axis=1, inplace=True) 
            
            origin_counts = df_od_pairs['origin'].value_counts()
            df_origin_counts = origin_counts.reset_index()
            df_origin_counts.columns = ['origin', 'start_count']
            destination_counts = df_od_pairs['destination'].value_counts()
            df_destination_counts = destination_counts.reset_index()
            df_destination_counts.columns = ['destination', 'end_count']
            df_links = df_links.merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            df_links.drop('origin', axis=1, inplace=True)
            df_links = df_links.merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            df_links.drop('destination', axis=1, inplace=True)
            df_links[['start_count','end_count']] = df_links[['start_count','end_count']].fillna(-1)

        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)

In [3]:
df_train = []
for i in range(0, 10):
    small_train_files = f'Data/smallWorlds/Train/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Train/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Train/s/df_links_network_{i}.pkl")
    small_train_data = load_data_small([small_train_files], small_df_activities, small_df_links_network)
    df_train.append(small_train_data)
small_train_data_all = pd.concat(df_train, ignore_index=True)

df_validate = []
for i in range(10, 15):
    small_validate_files = f'Data/smallWorlds/Validate/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Validate/s/df_links_network_{i}.pkl")
    small_validate_data = load_data_small([small_validate_files], small_df_activities, small_df_links_network)
    df_validate.append(small_validate_data)
small_validate_data_all = pd.concat(df_validate, ignore_index=True)
    
df_test = []
for i in range(15, 20):
    small_test_files = f'Data/smallWorlds/Test/s/s-{i}.json'
    small_df_activities = pd.read_pickle(f"Data/smallWorlds/Test/s/df_activities_{i}.pkl")
    small_df_links_network = pd.read_pickle(f"Data/smallWorlds/Test/s/df_links_network_{i}.pkl")
    small_test_data = load_data_small([small_test_files], small_df_activities, small_df_links_network)
    df_test.append(small_test_data)
small_test_data_all = pd.concat(df_test, ignore_index=True)

small_Big_train_data = pd.concat([small_train_data_all, small_validate_data_all], ignore_index=True)

In [4]:
small_numerical_features = ['link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_node_x', 'start_node_y', 'end_node_x', 'end_node_y', 'start_count', 'end_count']
X_t = small_Big_train_data.drop(columns=['link_counts'])
y_t = small_Big_train_data['link_counts']
X_te = small_test_data_all.drop(columns=['link_counts'])
y_te = small_test_data_all['link_counts']
scaler = StandardScaler()

X_t[small_numerical_features] = scaler.fit_transform(X_t[small_numerical_features])
X_te[small_numerical_features] = scaler.fit_transform(X_te[small_numerical_features])


In [6]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': LassoCV(cv=kf, random_state=42, max_iter=100000),
    'Ridge': RidgeCV(cv=kf),
    'SVR': SVR(C=2.295970789995008, epsilon=0.2, gamma=0.000534081267285769, max_iter=2000),
    'Random Forest': RandomForestRegressor(criterion='friedman_mse', min_samples_leaf=2, n_estimators=150, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42, subsample=0.8),
    'Artificial Neural Network': MLPRegressor(activation='tanh', alpha=0.001, max_iter=2000, random_state=42),
#     'Gaussian Process Regression': GaussianProcessRegressor(kernel=RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0), alpha=0.1, n_restarts_optimizer=3)
}


In [17]:
# Function to train and evaluate models
def evaluate_models(models, X_train, y_train):
    results = {}
    for name, model in models.items():
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         mse = mean_squared_error(y_test, y_pred)
#         mae = mean_absolute_error(y_test, y_pred)
        mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
        mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

        # Define the cross-validation strategy (e.g., 5-fold cross-validation)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Perform k-fold cross-validation and calculate MSE and MAE
        mse_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mse_scorer)
        mae_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring=mae_scorer)

        # Display the mean MSE and MAE across folds
        mean_mse = -mse_scores.mean()
        mean_mae = -mae_scores.mean()
        std_mse = mse_scores.std()
        # mape = mean_absolute_percentage_error(y_test, y_pred)
        # r2 = r2_score(y_test, y_pred)
        print(name + " done")
        
        results[name] = {'MAE': mean_mae, 'MSE': mean_mse, 'MSE_std': std_mse}
    
    return results

# Function to train and evaluate models
def feature_select_models(models, X_train, y_train):
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    kf = KFold(n_splits=5, shuffle=True, random_state=42) 
    results = {}
    for name, model in models.items():
        if name not in ['SVR', 'Artificial Neural Network', 'Gaussian Process Regression']:
            selector = RFECV(model, step=1, cv=kf, scoring=mse_scorer).fit(X_train, y_train)
            print(f'{name} selection done')

        else:
            # Fit Random Forest to get feature importances
            rf = RandomForestRegressor()
            rf.fit(X_train, y_train)
            # Select features based on importances
            selector = RFECV(estimator=rf, step=1, cv=kf, scoring=mse_scorer).fit(X_train, y_train)
            print(f'{name} selection done')
            
        selected_features = X_train.columns[selector.support_]
        X_train_reduced = X_train[selected_features] 
        mse_scores = cross_val_score(model, X_train_reduced, y_train, cv=kf, scoring=mse_scorer)
        mse = -mse_scores.mean()
        mse_std = mse_scores.std()           
        mae_scores = cross_val_score(model, X_train_reduced, y_train, cv=kf, scoring=mae_scorer)
        mean_mae = -mae_scores.mean()

        results[name] = {'MAE': mean_mae, 'MSE': mse, 'MSE_std': mse_std, 'selected_feature': selected_features}

            
    return results

# Train and evaluate
# results = evaluate_models(models, X_t, y_t)

results_feature = feature_select_models(models, X_t, y_t)


Linear Regression selection done
Lasso selection done
Ridge selection done
SVR selection done




Random Forest selection done
Gradient Boosting selection done
Artificial Neural Network selection done


In [8]:
results

{'Linear Regression': {'MAE': 1.7094362875403877,
  'MSE': 4.900999802756333,
  'MSE_std': 0.3034433550348224},
 'Lasso': {'MAE': 1.7097878691904878,
  'MSE': 4.899167712759024,
  'MSE_std': 0.30045162699567984},
 'Ridge': {'MAE': 1.7094924216674936,
  'MSE': 4.90090785762512,
  'MSE_std': 0.3033803689519051},
 'SVR': {'MAE': 1.8030095623221283,
  'MSE': 5.580490064264206,
  'MSE_std': 0.37766254756037027},
 'Random Forest': {'MAE': 1.680673140842427,
  'MSE': 4.636675908059576,
  'MSE_std': 0.30927833658718745},
 'Gradient Boosting': {'MAE': 1.6563186278324562,
  'MSE': 4.550166627722172,
  'MSE_std': 0.2258874018980454},
 'Artificial Neural Network': {'MAE': 1.7315297127472742,
  'MSE': 4.919825762670676,
  'MSE_std': 0.2531822085531791}}

In [19]:
import pickle
with open('result_small_after_featureselection(wo gpr).pkl', 'wb') as file:
    pickle.dump(results_feature, file)

In [18]:
results_feature

{'Linear Regression': {'MAE': 1.7094362875403877,
  'MSE': 4.900999802756333,
  'MSE_std': 0.30344335503482245,
  'selected_feature': Index(['link_id', 'link_from', 'link_to', 'link_length', 'start_node_x',
         'start_node_y', 'end_node_x', 'end_node_y', 'start_count', 'end_count'],
        dtype='object')},
 'Lasso': {'MAE': 1.7096318937117352,
  'MSE': 4.8987235386674985,
  'MSE_std': 0.3006239484845701,
  'selected_feature': Index(['link_id', 'link_from', 'link_to', 'link_length', 'start_node_x',
         'end_node_x', 'end_node_y', 'start_count', 'end_count'],
        dtype='object')},
 'Ridge': {'MAE': 1.7094924216674936,
  'MSE': 4.90090785762512,
  'MSE_std': 0.30338036895190534,
  'selected_feature': Index(['link_id', 'link_from', 'link_to', 'link_length', 'start_node_x',
         'start_node_y', 'end_node_x', 'end_node_y', 'start_count', 'end_count'],
        dtype='object')},
 'SVR': {'MAE': 1.7206257024656444,
  'MSE': 5.118170544329931,
  'MSE_std': 0.33316639693574035

In [15]:
param_grid_svr = {
    'C': reciprocal(1e-4, 1e3),  # Extended range for the regularization parameter
    'gamma': reciprocal(1e-4, 1e2),  # Including specific gamma values
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Focusing on RBF kernel
    'epsilon': [0.01, 0.1, 0.2],  # Epsilon in the epsilon-SVR model
}
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
random_search_svr = RandomizedSearchCV(SVR(max_iter=2000), param_grid_svr, n_iter=80, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
random_search_svr.fit(X_t, y_t)
print(random_search_svr.best_params_)
print(random_search_svr.best_estimator_)
print(random_search_svr.best_score_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
{'C': 2.295970789995008, 'epsilon': 0.2, 'gamma': 0.000534081267285769, 'kernel': 'rbf'}
SVR(C=2.295970789995008, epsilon=0.2, gamma=0.000534081267285769, max_iter=2000)
-5.1181705443299315




In [5]:
param_space =  dict(
    max_features=['sqrt', 'log2', .1, .3, .5, .7, .9],
    n_estimators=range(50, 1000, 50), # 10 to 1000 in steps of 50
    bootstrap=[True, False],
    max_depth=range(1, 20),
    min_samples_leaf=range(1, 10)
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

@scheduler.parallel(n_jobs=-1)
def objective(**hyper_par):
    global X_t, y_t

    clf = RandomForestRegressor(**hyper_par)
    result = cross_val_score(clf, X_t, y_t, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1).mean()
    return result


tuner = Tuner(param_space, objective)
results = tuner.maximize()
print('best parameters:', results['best_params'])
print('best accuracy:', results['best_objective'])

  0%|          | 0/20 [00:00<?, ?it/s]

best parameters: {'bootstrap': False, 'max_depth': 17, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 950}
best accuracy: -1.6478244084152949


In [11]:
param_grid_rf = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'criterion':['friedman_mse']
}
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=kf, n_jobs=-1, verbose=10, scoring=mae_scorer)
grid_search_rf.fit(X_t, y_t)

print(grid_search_rf.best_params_)
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
{'criterion': 'friedman_mse', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
RandomForestRegressor(criterion='friedman_mse', min_samples_leaf=2,
                      n_estimators=150, random_state=42)
-4.592160039805052


In [13]:
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],  # Varied learning rates for gradient boosting
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'subsample': [0.8, 1.0],  # Fraction of samples to be used for fitting individual base learners
}

mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_gb.fit(X_t, y_t)
print(grid_search_gb.best_params_)
print(grid_search_gb.best_estimator_)
print(grid_search_gb.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200, 'subsample': 0.8}
GradientBoostingRegressor(n_estimators=200, random_state=42, subsample=0.8)
-4.492667022563979


In [14]:
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (30, 30, 30)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
}
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_mlp = GridSearchCV(MLPRegressor(max_iter=2000, random_state=42), param_grid_mlp, cv=kf, n_jobs=-1, verbose=10, scoring=mse_scorer)
grid_search_mlp.fit(X_t, y_t)
print(grid_search_mlp.best_params_)
print(grid_search_mlp.best_estimator_)
print(grid_search_mlp.best_score_)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


42 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
42 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 753, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 496, i

{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'solver': 'adam'}
MLPRegressor(activation='tanh', alpha=0.001, max_iter=2000, random_state=42)
-4.80147699247909


In [35]:
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, DotProduct
import numpy as np
param_grid = {
    'kernel': [ConstantKernel (1.0, (1e-1, 1e1)) * RBF(1.0, (1e-2, 1e2))],
    'alpha': [ 1e-2, 0.1, 1.0]
}

gpr = GaussianProcessRegressor(copy_X_train=False)

# Initialize GridSearchCV
grid_search_gpr = RandomizedSearchCV(gpr, param_grid, n_iter=5, cv=0, scoring='neg_mean_squared_error', n_jobs=-1, verbose=10)
grid_search_gpr.fit(X_t, y_t)

print(grid_search_gpr.best_params_)
print(grid_search_gpr.best_estimator_)

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=0.

In [10]:
results_feature = pd.read_pickle(f"result_small_after_featureselection(wo gpr).pkl")

In [7]:
def evaluate_models_with_test(model, X_train, y_train, X_test, y_test):
    results = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # mape = mean_absolute_percentage_error(y_test, y_pred)
    # r2 = r2_score(y_test, y_pred)
      
    results = {'MAE': mae, 'MSE': mse}
    
    return results

In [15]:
result_final_with_test = {}
for name, model in models.items():
    X_t_reduced = X_t[results_feature[name]['selected_feature']]
    X_te_reduced = ln_X_te = X_te[results_feature[name]['selected_feature']]
    result_final_with_test[name] = evaluate_models_with_test(model, X_t_reduced, y_t, X_te_reduced, y_te)
result_final_with_test   



{'Linear Regression': {'MAE': 1.7003736354075103, 'MSE': 4.788585580208417},
 'Lasso': {'MAE': 1.6987809558103777, 'MSE': 4.7782281748967605},
 'Ridge': {'MAE': 1.70036638282739, 'MSE': 4.788378917406207},
 'SVR': {'MAE': 1.7210487258198386, 'MSE': 5.015873127333126},
 'Random Forest': {'MAE': 1.7169790721673193, 'MSE': 4.8305655047192575},
 'Gradient Boosting': {'MAE': 1.7096366504708942, 'MSE': 4.819619027120631},
 'Artificial Neural Network': {'MAE': 1.7272302028791726,
  'MSE': 4.795904433322586}}

In [16]:
import pickle
with open('result_small_final(wo gpr).pkl', 'wb') as file:
    pickle.dump(result_final_with_test, file)

In [None]:
# # Initialize a list to hold trips
# trips = []
# current_trip = [df_od_pairs.iloc[0]['origin']]  # Start with the first origin
# 
# # Iterate over the DataFrame rows
# for i, row in df_od_pairs.iterrows():
#     current_trip.append(row['destination'])  # Always add the destination
#     # Check if the next origin matches the current destination
#     if i + 1 < len(df_od_pairs) and row['destination'] != df_od_pairs.iloc[i + 1]['origin']:
#         # If it doesn't, the current trip has ended
#         trips.append(current_trip)
#         current_trip = [df_od_pairs.iloc[i + 1]['origin']]  # Start a new trip
# 
# # Add the last trip if it wasn't already added
# if current_trip not in trips:
#     trips.append(current_trip)


# from collections import Counter
# # Flatten the list of trips into a single list of nodes including origins and destinations
# all_nodes = [node for trip in trips for node in trip]
# 
# # Use Counter to count the occurrences of each node
# node_trip_counts = Counter(all_nodes)
# 
# df_node_trip_counts = pd.DataFrame.from_dict(node_trip_counts, orient='index').reset_index()
# df_node_trip_counts.columns = ['node_id', 'trip_amount']

In [37]:
# Initialize models
gpr_models = {
#     'Linear Regression': LinearRegression(),
#     'Lasso': LassoCV(cv=3, random_state=42, max_iter=100000),
#     'Ridge': RidgeCV(cv=3),
#     'SVR': SVR(C=2.295970789995008, epsilon=0.2, gamma=0.000534081267285769, max_iter=2000),
#     'Random Forest': RandomForestRegressor(criterion='friedman_mse', min_samples_leaf=2, n_estimators=150, random_state=42),
#     'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42, subsample=0.8),
#     'Artificial Neural Network': MLPRegressor(activation='tanh', alpha=0.001, max_iter=2000, random_state=42),
    'Gaussian Process Regression': GaussianProcessRegressor(kernel=RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0), alpha=0.1, n_restarts_optimizer=3)
}
# 
gpr_results = evaluate_models(gpr_models, X_t, y_t)
gpr_results
# gpr_results_feature = feature_select_models(models, X_t, y_t)

Gaussian Process Regression done


{'Gaussian Process Regression': {'MAE': 1.7930880421020745,
  'MSE': 5.2807308400265285,
  'MSE_std': 0.2917042872476591}}