In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

In [2]:
def load_data(file_list, df_activities, df_links_network):
    """Build one link-level feature table from a list of scenario JSON files.

    For every file the link attributes are enriched with:

    * start/end node coordinates, joined through ``link_from`` / ``link_to``;
    * ``start_count`` / ``end_count``: how often the link's from-node occurs as an
      origin and its to-node as a destination in ``o_d_pairs`` (0 when absent);
    * ``go_to_sum``: ``go_to_home + go_to_work`` aggregated over the activities
      whose ``link`` equals the ``link_id`` of the network link that has the same
      four node coordinates as the scenario link (0 when nothing matches).

    Parameters
    ----------
    file_list : iterable of str
        Paths of the JSON files to read.
    df_activities : pandas.DataFrame
        Must provide the columns ``activity_type_main``, ``x``, ``y`` and ``link``.
    df_links_network : pandas.DataFrame
        Must provide ``link_id``, ``from``, ``to`` and the coordinate columns
        ``start_node_x``, ``start_node_y``, ``end_node_x``, ``end_node_y``.

    Returns
    -------
    pandas.DataFrame
        The per-file link tables concatenated with a fresh integer index.
    """
    coord_cols = ['start_node_x', 'start_node_y', 'end_node_x', 'end_node_y']

    # These subsets do not depend on the scenario file, so filter only once.
    act_work = df_activities[df_activities['activity_type_main'] == 'work']
    act_home = df_activities[df_activities['activity_type_main'] == 'home']

    data_frames = []
    for file in file_list:
        # Only the parsing needs the open handle; release it before the joins.
        with open(file, 'r') as f:
            data = json.load(f)

        df_links = pd.DataFrame({
            'link_id': data['links_id'],
            'link_from': data['link_from'],
            'link_to': data['link_to'],
            'link_length': data['link_length'],
            'link_freespeed': data['link_freespeed'],
            'link_capacity': data['link_capacity'],
            'link_permlanes': data['link_permlanes'],
            'link_counts': data['link_counts']
        })
        df_nodes = pd.DataFrame({
            'node_id': data['nodes_id'],
            'node_x': data['nodes_x'],
            'node_y': data['nodes_y']
        })
        df_od_pairs = pd.DataFrame(data['o_d_pairs'], columns=['origin', 'destination'])
        df_work = pd.DataFrame({
            'work_x': data['work_x'],
            'work_y': data['work_y'],
            'go_to_work': data['go_to_work']
        })
        df_home = pd.DataFrame({
            'home_x': data['home_x'],
            'home_y': data['home_y'],
            'go_to_home': data['go_to_home']
        })

        # Attach the coordinates of the from-node, then of the to-node.
        df_links = (
            df_links
            .merge(df_nodes, how='left', left_on='link_from', right_on='node_id')
            .rename(columns={'node_x': 'start_node_x', 'node_y': 'start_node_y'})
            .drop(columns='node_id')
            .merge(df_nodes, how='left', left_on='link_to', right_on='node_id')
            .rename(columns={'node_x': 'end_node_x', 'node_y': 'end_node_y'})
            .drop(columns='node_id')
        )

        # Count how often each node is an origin / a destination of an O-D pair.
        # Columns are renamed positionally because the names produced by
        # value_counts().reset_index() differ between pandas versions.
        df_origin_counts = df_od_pairs['origin'].value_counts().reset_index()
        df_origin_counts.columns = ['origin', 'start_count']
        df_destination_counts = df_od_pairs['destination'].value_counts().reset_index()
        df_destination_counts.columns = ['destination', 'end_count']
        df_links = (
            df_links
            .merge(df_origin_counts, how='left', left_on='link_from', right_on='origin')
            .drop(columns='origin')
            .merge(df_destination_counts, how='left', left_on='link_to', right_on='destination')
            .drop(columns='destination')
        )
        df_links[['start_count', 'end_count']] = df_links[['start_count', 'end_count']].fillna(0)

        # Sum the work / home trip counts per activity link. The column is selected
        # *before* summing so the non-numeric activity columns are never aggregated.
        df_act_work_agg = (
            act_work
            .merge(df_work, how='left', left_on=['x', 'y'], right_on=['work_x', 'work_y'])
            .groupby('link')['go_to_work'].sum()
            .reset_index()
        )
        df_act_home_agg = (
            act_home
            .merge(df_home, how='left', left_on=['x', 'y'], right_on=['home_x', 'home_y'])
            .groupby('link')['go_to_home'].sum()
            .reset_index()
        )
        df_act_agg = df_act_home_agg.merge(df_act_work_agg, how='outer', on='link').fillna(0)
        df_act_agg['go_to_sum'] = df_act_agg['go_to_home'] + df_act_agg['go_to_work']

        # Map scenario links onto network links through exact coordinate equality.
        # NOTE(review): if df_links_network holds several links with the same four
        # coordinates, this join (and the final one below) yields more than one row
        # per scenario link - confirm the network table is unique on coord_cols.
        mg = df_links.merge(df_links_network, how='left', on=coord_cols)
        mg = mg[['link_id_x', 'link_from', 'link_to', 'link_id_y', 'from', 'to']]
        link_home_work = mg.merge(df_act_agg, how='left', left_on='link_id_y', right_on='link')
        # Assign the result back: an in-place fillna on a column selection is a
        # chained assignment and does not reliably update the parent frame.
        link_home_work['go_to_sum'] = link_home_work['go_to_sum'].fillna(0)
        link_go_to = link_home_work[['link_id_x', 'go_to_sum']]
        df_links = (
            df_links
            .merge(link_go_to, how='left', left_on='link_id', right_on='link_id_x')
            .drop(columns='link_id_x')
        )
        data_frames.append(df_links)
    return pd.concat(data_frames, ignore_index=True)

# Scenario files of each split, already prefixed with the directory of that split.
train_files = ['Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/' + name for name in (
    's-0.json', 's-1.json', 's-2.json', 's-3.json', 's-4.json',
    's-5.json', 's-6.json', 's-7.json', 's-8.json', 's-9.json',
)]
test_files = ['Data/cutoutWorlds/Test/po-1_pn-1.0_sn-1/' + name for name in (
    's-15.json', 's-16.json', 's-17.json', 's-18.json', 's-19.json',
)]
validate_files = ['Data/cutoutWorlds/Validate/po-1_pn-1.0_sn-1/' + name for name in (
    's-10.json', 's-11.json', 's-12.json', 's-13.json', 's-14.json',
)]

# The activity and network tables are read from the Train directory and are
# shared by all three splits below.
df_activities = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_activities.pkl")
df_links_network = pd.read_pickle("Data/cutoutWorlds/Train/po-1_pn-1.0_sn-1/df_links_network.pkl")

# Build the three link tables in the order train -> test -> validate.
train_data, test_data, validate_data = (
    load_data(split_files, df_activities, df_links_network)
    for split_files in (train_files, test_files, validate_files)
)


  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link")

  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link").sum()['go_to_work'].reset_index(drop=False)
  df_act_home_agg = df_act_home.groupby(by="link").sum()['go_to_home'].reset_index(drop=False)
  df_act_work_agg = df_act_work.groupby(by="link")

In [54]:
# # Initialize a list to hold trips
# trips = []
# current_trip = [df_od_pairs.iloc[0]['origin']]  # Start with the first origin
# 
# # Iterate over the DataFrame rows
# for i, row in df_od_pairs.iterrows():
#     current_trip.append(row['destination'])  # Always add the destination
#     # Check if the next origin matches the current destination
#     if i + 1 < len(df_od_pairs) and row['destination'] != df_od_pairs.iloc[i + 1]['origin']:
#         # If it doesn't, the current trip has ended
#         trips.append(current_trip)
#         current_trip = [df_od_pairs.iloc[i + 1]['origin']]  # Start a new trip
# 
# # Add the last trip if it wasn't already added
# if current_trip not in trips:
#     trips.append(current_trip)



In [55]:
# from collections import Counter
# # Flatten the list of trips into a single list of nodes including origins and destinations
# all_nodes = [node for trip in trips for node in trip]
# 
# # Use Counter to count the occurrences of each node
# node_trip_counts = Counter(all_nodes)
# 
# df_node_trip_counts = pd.DataFrame.from_dict(node_trip_counts, orient='index').reset_index()
# df_node_trip_counts.columns = ['node_id', 'trip_amount']

In [7]:
# Show the assembled training table as the cell's rich output.
train_data

Unnamed: 0,link_id,link_from,link_to,link_length,link_freespeed,link_capacity,link_permlanes,link_counts,start_node_x,start_node_y,end_node_x,end_node_y,start_count,end_count,go_to_sum
0,0,425,579,134.962910,4.166667,300.0,0.5,0.0,4.609957e+06,5.819853e+06,4.609956e+06,5.819988e+06,0.0,0.0,0.0
1,1,579,425,134.962910,4.166667,300.0,0.5,0.0,4.609956e+06,5.819988e+06,4.609957e+06,5.819853e+06,0.0,0.0,0.0
2,2,524,620,49.508163,6.944444,1800.0,1.5,27.0,4.614751e+06,5.819976e+06,4.614750e+06,5.820025e+06,0.0,2.0,0.0
3,3,620,524,49.508163,6.944444,2400.0,2.0,35.0,4.614750e+06,5.820025e+06,4.614751e+06,5.819976e+06,2.0,0.0,0.0
4,4,656,652,13.326026,6.944444,1600.0,1.0,79.0,4.615677e+06,5.819981e+06,4.615681e+06,5.819993e+06,1.0,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16980,1391,212,704,1124.319428,22.222222,600.0,1.0,10.0,4.624410e+06,5.875182e+06,4.623684e+06,5.874330e+06,0.0,0.0,0.0
16981,1392,678,679,43.946365,22.222222,6000.0,1.5,36.0,4.643553e+06,5.884511e+06,4.643596e+06,5.884518e+06,13.0,2.0,0.0
16982,1393,679,678,43.946365,22.222222,6000.0,1.5,33.0,4.643596e+06,5.884518e+06,4.643553e+06,5.884511e+06,2.0,13.0,0.0
16983,1394,679,412,42.684073,22.222222,6000.0,1.5,35.0,4.643596e+06,5.884518e+06,4.643639e+06,5.884524e+06,2.0,0.0,0.0


In [3]:
# Columns that get standardised. Every other column of X_t / X_v is left as is.
# NOTE(review): identifiers and node coordinates therefore stay unscaled in the
# feature matrices - confirm that this is intended for the scale-sensitive models.
numerical_features = ['link_length', 'link_freespeed', 'link_capacity', 'link_permlanes', 'start_count', 'end_count', 'go_to_sum']

# Split features and target ('link_counts') for the training and validation sets.
X_t = train_data.drop(columns=['link_counts'])
y_t = train_data['link_counts']
X_v = validate_data.drop(columns=['link_counts'])
y_v = validate_data['link_counts']

# Learn mean/std on the training set only and reuse them for validation.
# Calling fit_transform on the validation set would re-fit the scaler on
# validation data, so both sets would be scaled with different statistics.
scaler = StandardScaler()
X_t[numerical_features] = scaler.fit_transform(X_t[numerical_features])
X_v[numerical_features] = scaler.transform(X_v[numerical_features])

In [10]:
# Initialize models
# Candidate regressors, keyed by the label used in the result report.
# The random forest, gradient boosting and MLP settings are the ones printed by
# the corresponding grid-search cells of this notebook.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': LassoCV(
        cv=3,
        random_state=42,
        max_iter=100000,
    ),
    'Ridge': RidgeCV(cv=3),
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(
        criterion='friedman_mse',
        max_depth=20,
        min_samples_leaf=2,
        n_estimators=150,
        random_state=42,
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        max_depth=5,
        min_samples_split=5,
        random_state=42,
        subsample=0.8,
    ),
    'Artificial Neural Network': MLPRegressor(
        activation='tanh',
        alpha=0.001,
        learning_rate_init=0.01,
        max_iter=1000,
        random_state=42,
        solver='sgd',
    ),
    'Gaussian Process Regression': GaussianProcessRegressor(),
}

# Function to train and evaluate models
def evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit every model and report its hold-out and cross-validated MSE.

    Each estimator is fitted on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)``; afterwards a shuffled 3-fold cross-validation is run
    on the training data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator instance.
    X_train, y_train
        Data used for fitting and for the cross-validation.
    X_test, y_test
        Hold-out data used for the single prediction score.

    Returns
    -------
    dict
        ``{name: {'MSE_mean', 'MSE_std', 'MSE_predict'}}`` where the first two
        are the mean and standard deviation of the fold MSEs and the last is
        the hold-out MSE.
    """
    # A fixed seed makes the splitter yield the same folds for every model.
    folds = KFold(n_splits=3, shuffle=True, random_state=42)

    summary = {}
    for label in models:
        estimator = models[label]

        estimator.fit(X_train, y_train)
        holdout_mse = mean_squared_error(y_test, estimator.predict(X_test))

        # The scorer returns negated MSE values, so flip the sign back.
        fold_mse = -cross_val_score(
            estimator, X_train, y_train, cv=folds, scoring='neg_mean_squared_error'
        )

        summary[label] = {
            'MSE_mean': fold_mse.mean(),
            'MSE_std': fold_mse.std(),
            'MSE_predict': holdout_mse,
        }
    return summary

# Train and evaluate
results = evaluate_models(models, X_t, y_t, X_v, y_v)

# One report per model: a header line, one line per metric, then blank lines.
for model_name, metrics in results.items():
    report = [f"Model: {model_name}"]
    report.extend(f"{metric_name}: {value}" for metric_name, value in metrics.items())
    report.append("\n")
    print(*report, sep="\n")

KeyboardInterrupt: 

In [None]:
# Search space for an RBF-kernel SVR: regularisation strength, kernel width
# and the width of the epsilon-insensitive tube.
param_grid_svr = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1],
    'kernel': ['rbf'],
    'epsilon': [0.01, 0.1, 0.2],
}

grid_search_svr = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid_svr,
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=10,
)
grid_search_svr.fit(X_t, y_t)

# Report the winning parameter combination and the refitted estimator.
for svr_outcome in (grid_search_svr.best_params_, grid_search_svr.best_estimator_):
    print(svr_outcome)

Fitting 2 folds for each of 48 candidates, totalling 96 fits


In [4]:
# Search space for the random forest: ensemble size, tree depth and the
# minimum sample counts for splitting a node / forming a leaf.
param_grid_rf = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'criterion': ['friedman_mse'],
}

grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=10,
)
grid_search_rf.fit(X_t, y_t)

# Report the winning parameter combination and the refitted estimator.
for rf_outcome in (grid_search_rf.best_params_, grid_search_rf.best_estimator_):
    print(rf_outcome)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
{'criterion': 'friedman_mse', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
RandomForestRegressor(criterion='friedman_mse', max_depth=20,
                      min_samples_leaf=2, n_estimators=150, random_state=42)


In [5]:
# Search space for gradient boosting: number of stages, shrinkage, tree size
# limits and the fraction of samples drawn for each base learner.
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'subsample': [0.8, 1.0],
}

grid_search_gb = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid_gb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=5,
)
grid_search_gb.fit(X_t, y_t)

# Report the winning parameter combination and the refitted estimator.
for gb_outcome in (grid_search_gb.best_params_, grid_search_gb.best_estimator_):
    print(gb_outcome)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
{'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'subsample': 0.8}
GradientBoostingRegressor(max_depth=5, min_samples_split=5, random_state=42,
                          subsample=0.8)


In [6]:
# Search space for the MLP: layer layout, activation, optimiser, L2 penalty
# and the learning-rate schedule with its initial value.
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.001, 0.01],
}

grid_search_mlp = GridSearchCV(
    estimator=MLPRegressor(max_iter=1000, random_state=42),
    param_grid=param_grid_mlp,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=5,
)
grid_search_mlp.fit(X_t, y_t)

# Report the winning parameter combination and the refitted estimator.
for mlp_outcome in (grid_search_mlp.best_params_, grid_search_mlp.best_estimator_):
    print(mlp_outcome)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.01, 'solver': 'sgd'}
MLPRegressor(activation='tanh', alpha=0.001, learning_rate_init=0.01,
             max_iter=1000, random_state=42, solver='sgd')


In [None]:
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

# Search space for the Gaussian process: a scaled RBF kernel with and without
# an additive white-noise term, combined with three values of `alpha`.
param_grid_gpr = {
    'kernel': [
        1.0 * RBF(length_scale=1.0),
        1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=1),
    ],
    'alpha': [1e-10, 1e-2, 1e-1],
}

gpr = GaussianProcessRegressor(random_state=42)

grid_search_gpr = GridSearchCV(
    estimator=gpr,
    param_grid=param_grid_gpr,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=10,
)
grid_search_gpr.fit(X_t, y_t)

# Report the winning parameter combination and the refitted estimator.
for gpr_outcome in (grid_search_gpr.best_params_, grid_search_gpr.best_estimator_):
    print(gpr_outcome)