In [1]:

import warnings
import joblib


warnings.filterwarnings(action='ignore')

import os
import time
import pandas as pd
import numpy as np
import shap

# fields names
ped_level = 'ped_level'
general_path = f'{os.getcwd()}/ml'




from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import (
    r2_score, mean_squared_error, mean_absolute_error,
    explained_variance_score, median_absolute_error
)
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [2]:
def run_rfecv(X_train, y_train, X_test, y_test, max_depth,features_name):
    if max_depth !='all':
        selector = RFECV(
            estimator=RandomForestRegressor(n_estimators=100, random_state=1, n_jobs=-1, max_depth=max_depth),
            step=1,
            cv=cv,
            scoring='r2',
            n_jobs=-1
        )

        start = time.time()
        selector.fit(X_train, y_train)
        print(f"\nFeature selection completed in {(time.time() - start):.2f} seconds")

        # Apply selected features
        X_train_selected = selector.transform(X_train)
        X_test_selected = selector.transform(X_test)

        # Selected feature names
        selected_features = [name for name, keep in zip(features_name, selector.support_) if keep]
    else:
        X_train_selected = X_train
        X_test_selected = X_test


    # ------------------------
    # Train Final Regressor
    # ------------------------

    regressor = DecisionTreeRegressor(random_state=1)
    regressor.fit(X_train_selected, y_train)

    # ------------------------
    # Evaluation
    # ------------------------

    y_pred = regressor.predict(X_test_selected)

    print('\nModel Performance:')
    print('R² score:', r2_score(y_test, y_pred, multioutput='raw_values'))
    print('Mean Squared Error:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))
    print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred, multioutput='raw_values'))
    print('Explained Variance:', explained_variance_score(y_test, y_pred, multioutput='raw_values'))
    print('Median Absolute Error:', median_absolute_error(y_test, y_pred, multioutput='raw_values'))

    # ------------------------
    # Grouped MSE by y_test
    # ------------------------

    stat_df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    group_mse = stat_df.groupby('y_test').apply(lambda df: mean_squared_error(df['y_test'], df['y_pred']))
    print("\nGroup-wise MSE:")
    print(group_mse)

    # ------------------------
    # Feature Importance (sorted)
    # ------------------------
    importances = regressor.feature_importances_
    if max_depth =='all':
        selected_features = features_name
    sorted_features = sorted(zip(selected_features, importances), key=lambda x: x[1], reverse=True)

    print(f"\n{len(selected_features)} Selected Features and Importances (sorted):")
    for name, imp in sorted_features:
        print(f"{name}: {imp:.4f}")

    # ------------------------
    # Tree depth
    # ------------------------

    print(f"\nFinal Decision Tree Depth: {regressor.get_depth()}")
    return regressor, selected_features


def create_node_df(regressor):
    # Initialize list to store data
    node_data = []

    # Access tree
    tree = regressor.tree_
    node_count = tree.node_count

    for node_id in range(node_count):
        value = tree.value[node_id][0][0]  # average target value at node
        impurity = tree.impurity[node_id]  # mean squared error
        n_samples = tree.n_node_samples[node_id]  # number of samples

        # Define group (1 if 0–1, 2 if 1–2, etc.)
        group = round(value)  # since 0–1 → group 1, 1–2 → group 2, ...

        # Check if this is a leaf node
        is_leaf = (tree.children_left[node_id] == -1) and (tree.children_right[node_id] == -1)
        leaf_id = node_id if is_leaf else -1  # -1 means not a leaf

        # Collect
        node_data.append({
            'node_id': node_id,
            'leaf_id': leaf_id,
            'value': value,
            'group': group,
            'impurity': impurity,
            'n_node_samples': n_samples
        })

    # Create DataFrame
    return pd.DataFrame(node_data)


def find_important_paths(regressor, node_df,total_samples,selected_features):
    # Parameters

    min_threshold = 0.0001
    initial_threshold = 0.1
    step = 0.0001

    # Helper: get decision path to a node
    def get_path_to_node(node_id):
        path = []
        i = node_id
        while i != 0 and parent_nodes[i] is not None:
            parent = parent_nodes[i]
            feat_i = regressor.tree_.feature[parent]
            feat = selected_features[feat_i]
            threshold = regressor.tree_.threshold[parent]
            direction = "<=" if i == regressor.tree_.children_left[parent] else ">"
            path.append(f"{feat} {direction} {threshold:.6f}")
            i = parent
        return "--".join(reversed(path))

    # Make sure parent_nodes list is defined
    parent_nodes = [None] * regressor.tree_.node_count
    for i in range(regressor.tree_.node_count):
        if regressor.tree_.children_left[i] != -1:
            parent_nodes[regressor.tree_.children_left[i]] = i
        if regressor.tree_.children_right[i] != -1:
            parent_nodes[regressor.tree_.children_right[i]] = i




    # Process each group
    for group_id in sorted(node_df['group'].unique()):
        group_nodes = node_df[node_df['group'] == group_id].sort_values(by='impurity')

        threshold = initial_threshold
        selected_nodes = pd.DataFrame()

        while threshold >= min_threshold:
            min_samples = total_samples * threshold
            filtered = group_nodes[
                (group_nodes['n_node_samples'] >= min_samples) &
                (group_nodes['impurity'] < 1.0)
            ]
            selected_nodes = filtered.head(5)

            if len(selected_nodes) >= 5:
                break
            else:
                threshold -= step

        # If valid nodes found, build full output
        if not selected_nodes.empty:
            selected_nodes = selected_nodes.copy()
            selected_nodes['path'] = selected_nodes['node_id'].apply(get_path_to_node)
            selected_nodes.to_csv(f"{output_folder}/group_{group_id}.csv", index=False)
            print(f"Group {group_id}: {len(selected_nodes)} nodes saved with threshold {threshold:.3%}")
        else:
            print(f"Group {group_id}: No nodes passed conditions.")


In [None]:

# ------------------------
# Load and Prepare Data
# ------------------------

ml_df = pd.read_csv(f'{general_path}/model_data_final_1.csv').drop(columns=['index_str_name', 'Unnamed: 0', 'length'])
data_feature = ml_df.drop(columns=[ped_level])
features_name = list(data_feature.columns)

X = data_feature.to_numpy(dtype=np.float32)  # Use float32 for memory speed
y = ml_df[ped_level].to_numpy(dtype=np.float32)

# ------------------------
# Split Data
# ------------------------

X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.2, random_state=1)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
# ------------------------
# Feature Selection with RFECV (Random Forest)
# ------------------------
# ------------------------
# for max in ['all',5,10,20,None]:
for max in [None]:
    print("Training model with max_depth {}".format(max) if max !='all' else "Model with all features")
    regressor, selected_features = run_rfecv(X_train, y_train, X_test, y_test,max,features_name)
     # Create output folder
    output_folder = f"{general_path}/group_nodes_{str(max)}"
    os.makedirs(output_folder, exist_ok=True)
    joblib.dump(regressor, f"{output_folder}/decision_tree_{str(max)}.pkl")
    joblib.dump(selected_features, f"{output_folder}/selected_features_{str(max)}.pkl")
    node_df = create_node_df(regressor)
    total_samples = len(ml_df)
    find_important_paths(regressor, node_df,total_samples,selected_features)

Training model with max_depth None


In [1]:
str(None)

'None'

In [7]:
# This code work with assumption that we already have tree

# List of folder names
ml_df = pd.read_csv(f'{general_path}/model_data_final_1.csv').drop(columns=['index_str_name', 'Unnamed: 0', 'length'])
for max in [5,10,20,'all',None]:
    output_folder = f"{general_path}/group_nodes_{max}"
    regressor= joblib.load(f"{output_folder}/decision_tree_{max}.pkl")
    selected_features =joblib.load(f"{output_folder}/selected_features_{max}.pkl")
    node_df = create_node_df(regressor)
    total_samples = len(ml_df)
    find_important_paths(regressor, node_df,total_samples,selected_features)

Group 0: 5 nodes saved with threshold 0.210%
Group 1: 5 nodes saved with threshold 0.530%
Group 2: 5 nodes saved with threshold 0.070%
Group 3: 5 nodes saved with threshold 0.280%
Group 4: 5 nodes saved with threshold 0.330%
Group 0: 5 nodes saved with threshold 0.230%
Group 1: 5 nodes saved with threshold 0.870%
Group 2: 5 nodes saved with threshold 0.060%
Group 3: 5 nodes saved with threshold 0.340%
Group 4: 5 nodes saved with threshold 0.270%
Group 0: 5 nodes saved with threshold 0.170%
Group 1: 5 nodes saved with threshold 0.440%
Group 2: 5 nodes saved with threshold 0.060%
Group 3: 5 nodes saved with threshold 0.250%
Group 4: 5 nodes saved with threshold 0.260%
Group 0: 5 nodes saved with threshold 0.210%
Group 1: 5 nodes saved with threshold 0.530%
Group 2: 5 nodes saved with threshold 0.070%
Group 3: 5 nodes saved with threshold 0.280%
Group 4: 5 nodes saved with threshold 0.330%


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\achit\\OneDrive - ariel.ac.il\\Current_research\\ASC2\\pythonProject\\places\\tel_aviv/ml/group_nodes_None/selected_features_None.pkl'

In [8]:
selected_features

['day part',
 'season',
 'day',
 'buildings',
 'businesses',
 'educationa',
 'Health_ser',
 'Leisure_am',
 'Playground',
 'Sport_faci',
 'synagogues',
 'bus_statio',
 'lighting',
 'bike_trail',
 'parks',
 'SEleve1_10',
 'closeness',
 'betweennes',
 'highway',
 'bench',
 'green_canopy',
 'pop_dens',
 'road_right',
 'shadows',
 'sidewalk_width',
 'slope']