## ML for foot traffic score
Prediction for the foot traffic score using the Yellow Taxi Data from 2020 - 2023 to predict the foot traffic score during daytimes (morning, afternoon, evening, night)

### Combine foot traffic scores from 2020 - 2023 with subway access scores

**2020**

In [3]:
import pandas as pd
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

all_future_foot_scores = pd.read_csv("foot_scores_years/all_foot_traffic_scores_with_daytime_category.csv")


In [4]:
all_future_foot_scores.head()

Unnamed: 0,trip_date,LocationID,daytime_category,daily_pickup_count,daily_dropoff_count,daily_dropoff_count_scaled,daily_pickup_count_scaled,daily_foot_traffic_score
0,2020-01-01,4,morning,31.0,36.0,1.073939,1.052111,1.06739
1,2020-01-01,4,afternoon,31.0,123.0,1.252624,1.052111,1.19247
2,2020-01-01,4,evening,363.0,611.0,2.254906,1.610198,2.061494
3,2020-01-01,12,morning,10.0,77.0,1.158147,1.01681,1.115746
4,2020-01-01,12,afternoon,81.0,62.0,1.127339,1.13616,1.129985


In [7]:
#load geojson
import json
with open(r'..\census tract geofiles\manhattan_census_tracts.geojson', 'r') as f:
    geojson = json.load(f)
    
geoids = [feature['properties']['GEOID'] for feature in geojson['features']]

In [8]:
# Map LocationIDs to GEOIDs
result_rows = []
locations = sorted(all_future_foot_scores['LocationID'].unique())
tracts_per_location = len(geoids) // len(locations)

print(f"Mapping {len(locations)} LocationIDs to {len(geoids)} GEOIDs")
print(f"Approximately {tracts_per_location} census tracts per LocationID")

geoid_index = 0
for i, location_id in enumerate(locations):
    location_data = all_future_foot_scores[all_future_foot_scores['LocationID'] == location_id].iloc[0]
    num_geoids = tracts_per_location + (1 if i < len(geoids) % len(locations) else 0)
    
    for j in range(num_geoids):
        if geoid_index < len(geoids):
            row = {'GEOID': geoids[geoid_index]}
            for col in all_future_foot_scores.columns:
                if col != 'LocationID':
                    row[col] = location_data[col]
            result_rows.append(row)
            geoid_index += 1
            
print(f"Created {len(result_rows)} rows for GEOID mapping")

Mapping 67 LocationIDs to 310 GEOIDs
Approximately 4 census tracts per LocationID
Created 310 rows for GEOID mapping


In [9]:
#Save to df
mapped_df = pd.DataFrame(result_rows)
mapped_df.head()

Unnamed: 0,GEOID,trip_date,daytime_category,daily_pickup_count,daily_dropoff_count,daily_dropoff_count_scaled,daily_pickup_count_scaled,daily_foot_traffic_score
0,36061000100,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739
1,36061001401,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739
2,36061001402,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739
3,36061001800,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739
4,36061002201,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739


In [11]:
# load subway scores and merge with future foot traffic
subway_scores_df = pd.read_csv("subway_score_by_tract.csv")
subway_scores_df.head()

Unnamed: 0,GEOID,subway_score
0,36061000100,0.0
1,36061001401,2.17
2,36061001402,2.02
3,36061001800,2.65
4,36061002201,1.67


In [None]:
# Merge future foot scores with subway scores based on GEOIDs
combined_df = pd.merge(mapped_df, subway_scores_df, on="GEOID", how="inner")

In [14]:
combined_df.head()

Unnamed: 0,GEOID,trip_date,daytime_category,daily_pickup_count,daily_dropoff_count,daily_dropoff_count_scaled,daily_pickup_count_scaled,daily_foot_traffic_score,subway_score
0,36061000100,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739,0.0
1,36061001401,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739,2.17
2,36061001402,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739,2.02
3,36061001800,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739,2.65
4,36061002201,2020-01-01,morning,31.0,36.0,1.073939,1.052111,1.06739,1.67


In [15]:
# Compute combined score (65% foot traffic, 35% subway access)
combined_df["combined_score"] = (
    0.65 * combined_df["daily_foot_traffic_score"] +
    0.35 * combined_df["subway_score"]
).round(2)

In [16]:
#save combined future foot scores
combined_df.to_csv("combined_future_foot_scores.csv", index=False)

### ML revised

In [None]:
# Feature engineering
geoid_means = combined_df.groupby('GEOID')['combined_score'].mean()
combined_df['GEOID_encoded'] = combined_df['GEOID'].map(geoid_means)

In [55]:
combined_df['pickup_x_dayofweek'] = combined_df['daily_pickup_count_scaled'] * combined_df['day_of_week']
combined_df['dropoff_x_dayofweek'] = combined_df['daily_dropoff_count_scaled'] * combined_df['day_of_week']

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

combined_df['trip_date'] = pd.to_datetime(combined_df['trip_date'])
combined_df['year'] = combined_df['trip_date'].dt.year
combined_df['month'] = combined_df['trip_date'].dt.month
combined_df['day_of_week'] = combined_df['trip_date'].dt.dayofweek

features = ['GEOID_encoded','daytime_category', 'daily_pickup_count_scaled', 'dropoff_x_dayofweek',
            'pickup_x_dayofweek','daily_dropoff_count_scaled', 'year', 'month', 'day_of_week']
target = 'combined_score'

X = combined_df[features]
y = combined_df[target]

categorical_features = ['GEOID_encoded','daytime_category', 'year', 'month', 'day_of_week']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # 'remainder' will have no effect here as there are no numerical features other than what's encoded
)

print("\n--- Preprocessor Setup Complete ---")


--- Preprocessor Setup Complete ---


**Training Model**

*Linear Regression*

In [65]:
from sklearn.linear_model import LinearRegression
# Using a standard random train-test split instead.
print("\n--- Performing Random Train-Test Split ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% for testing

print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

model_lr = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

print("\n--- Training the Linear Regression model ---")
model_lr.fit(X_train, y_train)
print("Model training complete.")

y_pred = model_lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation (on random 20% test data) ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")



--- Performing Random Train-Test Split ---
Train set shape: (248, 9), (248,)
Test set shape: (62, 9), (62,)

--- Training the Linear Regression model ---
Model training complete.

--- Model Evaluation (on random 20% test data) ---
Mean Absolute Error (MAE): 0.1152
R-squared (R2): 0.6465


In [58]:
from sklearn.preprocessing import PolynomialFeatures

model_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),  # Add interaction & squared terms
    ('regressor', LinearRegression())
])

print("\n--- Training the RandomForestRegressor model ---")
model_lr.fit(X_train, y_train)
print("Model training complete.")

y_pred = model_lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation (on random 20% test data) ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")


--- Training the RandomForestRegressor model ---
Model training complete.

--- Model Evaluation (on random 20% test data) ---
Mean Absolute Error (MAE): 0.1393
R-squared (R2): 0.6118


*Random Forest*

In [66]:
# Using a standard random train-test split instead.
print("\n--- Performing Random Train-Test Split ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% for testing

print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

model_randomforest = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))])

print("\n--- Training the RandomForestRegressor model ---")
model_randomforest.fit(X_train, y_train)
print("Model training complete.")

y_pred = model_randomforest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation (on random 20% test data) ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")



--- Performing Random Train-Test Split ---
Train set shape: (248, 9), (248,)
Test set shape: (62, 9), (62,)

--- Training the RandomForestRegressor model ---
Model training complete.

--- Model Evaluation (on random 20% test data) ---
Mean Absolute Error (MAE): 0.2689
R-squared (R2): 0.3733


*HistGradientBoostingRegressor*

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

combined_df['trip_date'] = pd.to_datetime(combined_df['trip_date'])
combined_df['year'] = combined_df['trip_date'].dt.year
combined_df['month'] = combined_df['trip_date'].dt.month
combined_df['day_of_week'] = combined_df['trip_date'].dt.dayofweek

features = ['GEOID_encoded','daytime_category', 'daily_pickup_count_scaled', 'dropoff_x_dayofweek',
            'pickup_x_dayofweek','daily_dropoff_count_scaled', 'year', 'month', 'day_of_week']
target = 'combined_score'

X = combined_df[features]
y = combined_df[target]

categorical_features = ['GEOID_encoded','daytime_category', 'year', 'month', 'day_of_week']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough' 
)

print("\n--- Preprocessor Setup Complete ---")


--- Preprocessor Setup Complete ---


In [68]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Using a standard random train-test split instead.
print("\n--- Performing Random Train-Test Split (no 'year' column for chronological split) ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% for testing

print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

model_histgrbregressor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', HistGradientBoostingRegressor(random_state=42))
])

print("\n--- Training the HistGradientBoostingRegressor model ---")
model_histgrbregressor.fit(X_train, y_train)
print("Model training complete.")

y_pred = model_histgrbregressor.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Model Evaluation (on random 20% test data) ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")


--- Performing Random Train-Test Split (no 'year' column for chronological split) ---
Train set shape: (248, 9), (248,)
Test set shape: (62, 9), (62,)

--- Training the HistGradientBoostingRegressor model ---
Model training complete.

--- Model Evaluation (on random 20% test data) ---
Mean Absolute Error (MAE): 0.3592
R-squared (R2): 0.1412


*Comparing all models with XGBRegressor (from xgboost) and LGBMRegressor*

In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# --- Train-test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Preprocessing Pipeline ---
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)],
    remainder='passthrough'
)

# --- Models to Compare ---
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "HistGradientBoosting": HistGradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1),
    "LightGBM": LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "LinearRegression": LinearRegression(),
    "RidgeRegression": Ridge(alpha=1.0),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),

}

# --- Evaluation ---
results = []

for name, regressor in models.items():
    print(f"\n--- Training {name} ---")
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results.append((name, r2, mae))
    print(f"{name} → R²: {r2:.4f}, MAE: {mae:.4f}")

# --- Summary ---
print("\n--- Model Comparison Summary ---")
for name, r2, mae in results:
    print(f"{name.ljust(22)} | R²: {r2:.4f} | MAE: {mae:.4f}")



--- Training RandomForest ---
RandomForest → R²: 0.3733, MAE: 0.2688

--- Training HistGradientBoosting ---
HistGradientBoosting → R²: 0.1412, MAE: 0.3592

--- Training XGBoost ---
XGBoost → R²: 0.4014, MAE: 0.2858

--- Training LightGBM ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 248, number of used features: 4
[LightGBM] [Info] Start training from score 1.647097
LightGBM → R²: 0.1429, MAE: 0.3586

--- Training DecisionTree ---
DecisionTree → R²: 0.1494, MAE: 0.2732

--- Training LinearRegression ---
LinearRegression → R²: 0.6465, MAE: 0.1152

--- Training RidgeRegression ---
RidgeRegression → R²: 0.5338, MAE: 0.2269

--- Training KNN ---
KNN → R²: 0.2169, MAE: 0.3034

--- Training SVR ---
SVR → R²: 0.2656, MAE: 0.2877

--- Training GradientBoosting ---
GradientBoosting → R²

**Prediction for 2024, 2025, 2026 aand 2027 using RandomForest Model**

In [None]:
# Copy base data
base = combined_df.copy()

# Get original date components
base['trip_date'] = pd.to_datetime(base['trip_date'])
base['month'] = base['trip_date'].dt.month
base['day_of_week'] = base['trip_date'].dt.dayofweek

# Create future set
future_years = [2025, 2026, 2027]
future_foot_traffic = pd.DataFrame()

def safe_replace_year_or_none(date_obj, new_year):
    try:
        return date_obj.replace(year=new_year)
    except ValueError:
        return None

for year in future_years:
    temp = base.copy()
    temp['year'] = year
    temp['trip_date'] = temp['trip_date'].apply(lambda d: safe_replace_year_or_none(d, year))
    temp = temp.dropna(subset=['trip_date'])
    future_foot_traffic = pd.concat([future_foot_traffic, temp], ignore_index=True)

# Features used in model
features = ['LocationID', 'daytime_category', 'daily_pickup_count_scaled', 
            'daily_dropoff_count_scaled', 'year', 'month', 'day_of_week']

X_future = future_foot_traffic[features]

# Predict
future_foot_traffic['predicted_foot_traffic_score'] = model_randomforest.predict(X_future)

# View
print(future_foot_traffic[['trip_date', 'LocationID', 'daytime_category', 'predicted_foot_traffic_score']].head())


   trip_date  LocationID daytime_category  predicted_foot_traffic_score
0 2025-01-01           4          morning                      1.067397
1 2025-01-01           4        afternoon                      1.192657
2 2025-01-01           4          evening                      2.060485
3 2025-01-01          12          morning                      1.115753
4 2025-01-01          12        afternoon                      1.130004


In [41]:
future_foot_traffic.head()

Unnamed: 0,trip_date,LocationID,daytime_category,daily_pickup_count,daily_dropoff_count,daily_dropoff_count_scaled,daily_pickup_count_scaled,daily_foot_traffic_score,year,month,day_of_week,predicted_foot_traffic_score
0,2025-01-01,4,morning,31.0,36.0,1.073939,1.052111,1.06739,2025,1,2,1.067397
1,2025-01-01,4,afternoon,31.0,123.0,1.252624,1.052111,1.19247,2025,1,2,1.192657
2,2025-01-01,4,evening,363.0,611.0,2.254906,1.610198,2.061494,2025,1,2,2.060485
3,2025-01-01,12,morning,10.0,77.0,1.158147,1.01681,1.115746,2025,1,2,1.115753
4,2025-01-01,12,afternoon,81.0,62.0,1.127339,1.13616,1.129985,2025,1,2,1.130004


In [42]:
#Save as csv
future_foot_traffic.to_csv('future_foot_traffic.csv', index=False)