In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load the labelled data: one row per observation with socioeconomic features
# and the observed foot-traffic target.
df = pd.read_csv('target_with_footTraffic.csv')

socioeconomic_cols = ['TotalPop', 'Income', 'IncomePerCap', 'Poverty', 'Unemployment', 'Men', 'Women',
                      'Professional', 'Service', 'Office', 'Construction', 'Production', 'Drive', 
                      'Transit', 'MeanCommute', 'White', 'Black', 'Hispanic', 'Asian']
target_col = 'footTraffic'

# Keep only the modelling columns and drop every row with a missing value.
df = df[socioeconomic_cols + [target_col]].dropna()

y = df[target_col]

# Split BEFORE scaling so that the held-out rows never contribute to the
# scaler's mean/variance. test_size and random_state are unchanged, so the
# partition is made over the same number of rows with the same seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[socioeconomic_cols], y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only; later cells reuse this fitted
# `scaler` to transform new data.
scaler = StandardScaler()
scaler.fit(X_train_raw)

# Re-wrap the scaled arrays as DataFrames that keep the column names and the
# original row labels, so features and targets stay label-aligned.
X_train = pd.DataFrame(scaler.transform(X_train_raw),
                       columns=socioeconomic_cols, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw),
                      columns=socioeconomic_cols, index=X_test_raw.index)

# Full scaled feature matrix, kept under its original names for any cell that
# still refers to `scaled_features` / `X`.
scaled_features = scaler.transform(df[socioeconomic_cols])
X = pd.DataFrame(scaled_features, columns=socioeconomic_cols, index=df.index)


# Baseline 1: ordinary least squares.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)


# Baseline 2: random forest with default hyper-parameters (seeded).
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


# Baseline 3: gradient boosting with default hyper-parameters (seeded).
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)


def evaluate_model(y_true, y_pred, model_name):
    """Print the RMSE and R² of a set of predictions on one line.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        model_name: Label placed at the start of the printed line.

    Returns:
        None. The metrics are only printed, each rounded to two decimals.
    """
    mse = mean_squared_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse, r2 = np.sqrt(mse), r2_score(y_true, y_pred)
    print(f"{model_name} - RMSE: {rmse:.2f}, R²: {r2:.2f}")

# Report hold-out metrics for the three baseline models, in a fixed order.
# NOTE(review): the saved output shows R² = 1.00 for every model, including
# plain linear regression. Worth confirming that the target is not (nearly)
# a direct function of one of the features before trusting these scores.
baseline_predictions = [
    ("Linear Regression", y_pred_linear),
    ("Random Forest", y_pred_rf),
    ("Gradient Boosting", y_pred_gb),
]
for model_label, model_predictions in baseline_predictions:
    evaluate_model(y_test, model_predictions, model_label)


# Random-forest search space: 3 * 4 * 3 = 36 combinations, each cross-validated
# on 3 folds (108 fits in total).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Candidates are ranked by negated MSE on the training split only; the test
# split is used just once below, for the final report.
grid_search_rf = GridSearchCV(
    rf_model,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# `best_rf_model` is the model the later prediction cells use.
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)
evaluate_model(y_test, y_pred_best_rf, "Tuned Random Forest")


print("Best Random Forest Parameters: ", grid_search_rf.best_params_)


Linear Regression - RMSE: 12350.55, R²: 1.00
Random Forest - RMSE: 1372.76, R²: 1.00
Gradient Boosting - RMSE: 2681.84, R²: 1.00
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Tuned Random Forest - RMSE: 1403.19, R²: 1.00
Best Random Forest Parameters:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}


In [3]:

# Smoke test of the scale -> predict path using one row of the modelling frame.
# NOTE: the row is taken from `df`, the same frame the model was trained and
# tested on, so this is not an out-of-sample prediction.
new_data = df[socioeconomic_cols].iloc[1].to_dict()
new_data_df = pd.DataFrame([new_data], columns=socioeconomic_cols)


scaled_new_data = scaler.transform(new_data_df)
# The model was fitted on a DataFrame with named columns; predicting on a bare
# ndarray skips scikit-learn's feature-name check (and warns in recent
# versions), so hand it a frame with the same column names and order.
predicted_foot_traffic = best_rf_model.predict(
    pd.DataFrame(scaled_new_data, columns=socioeconomic_cols, index=new_data_df.index)
)

print(f"Predicted Foot Traffic: {predicted_foot_traffic[0]:.2f}")

Predicted Foot Traffic: 22330.71




In [4]:
# Feature columns fed to the scaler and the model. The order matters: it must
# match the order used when they were fitted.
socioeconomic_cols = [
    'TotalPop', 'Income', 'IncomePerCap', 'Poverty', 'Unemployment',
    'Men', 'Women',
    'Professional', 'Service', 'Office', 'Construction', 'Production',
    'Drive', 'Transit', 'MeanCommute',
    'White', 'Black', 'Hispanic', 'Asian',
]

In [5]:
# Score every row of the county-level census file with the tuned random forest.
census_data = pd.read_csv('county.csv')
census_data_filtered = census_data[socioeconomic_cols].copy()

# Reuse the scaler fitted in the training cell; it must not be re-fitted here.
scaled_census_data = scaler.transform(census_data_filtered)

# The model was fitted on a DataFrame with named columns; predicting on a bare
# ndarray skips scikit-learn's feature-name check (and warns in recent
# versions), so hand it a frame with the same column names and order.
predicted_foot_traffic = best_rf_model.predict(
    pd.DataFrame(scaled_census_data, columns=socioeconomic_cols,
                 index=census_data_filtered.index)
)

# Output table: the unscaled features, the prediction, and the county name.
census_data_with_predictions = census_data_filtered.copy()
census_data_with_predictions['PredictedFootTraffic'] = predicted_foot_traffic
census_data_with_predictions['County'] = census_data['County'].values

census_data_with_predictions.to_csv('census_data_with_predictions.csv', index=False)

print(census_data_with_predictions.head())

   TotalPop  Income  IncomePerCap  Poverty  Unemployment    Men   Women  \
0     55036   55317         27824     13.7           5.2  26899   28137   
1    203360   52562         29364     11.8           5.5  99527  103833   
2     26201   33368         17561     27.2          12.4  13976   12225   
3     22580   43404         20911     15.2           8.2  12251   10329   
4     57667   47412         22021     15.6           4.9  28490   29177   

   Professional  Service  Office  ...  Production  Drive  Transit  \
0          35.3     18.0    23.2  ...        15.4   86.0      0.1   
1          35.7     18.2    25.6  ...        10.8   84.7      0.1   
2          25.0     16.8    22.6  ...        24.1   83.4      0.3   
3          24.4     17.6    19.7  ...        22.4   86.4      0.7   
4          28.5     12.9    23.3  ...        19.5   86.8      0.1   

   MeanCommute  White  Black  Hispanic  Asian  PredictedFootTraffic  \
0         25.8   75.4   18.9       2.7    0.9          11980.58



In [6]:
# Display the prediction table built in the previous cell (the saved output
# shows it truncated to the first and last rows).
census_data_with_predictions

Unnamed: 0,TotalPop,Income,IncomePerCap,Poverty,Unemployment,Men,Women,Professional,Service,Office,...,Production,Drive,Transit,MeanCommute,White,Black,Hispanic,Asian,PredictedFootTraffic,County
0,55036,55317,27824,13.7,5.2,26899,28137,35.3,18.0,23.2,...,15.4,86.0,0.1,25.8,75.4,18.9,2.7,0.9,11980.580267,Autauga County
1,203360,52562,29364,11.8,5.5,99527,103833,35.7,18.2,25.6,...,10.8,84.7,0.1,27.0,83.1,9.5,4.4,0.7,45313.426033,Baldwin County
2,26201,33368,17561,27.2,12.4,13976,12225,25.0,16.8,22.6,...,24.1,83.4,0.3,23.4,45.7,47.8,4.2,0.6,4849.130833,Barbour County
3,22580,43404,20911,15.2,8.2,12251,10329,24.4,17.6,19.7,...,22.4,86.4,0.7,30.0,74.6,22.0,2.4,0.0,4648.507767,Bibb County
4,57667,47412,22021,15.6,4.9,28490,29177,28.5,12.9,23.3,...,19.5,86.8,0.1,35.0,87.4,1.5,9.0,0.1,11408.830267,Blount County
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,54754,18900,10197,43.8,16.8,26269,28485,28.6,20.2,25.9,...,14.2,92.0,0.9,31.6,3.1,0.1,96.7,0.0,10863.086033,Vega Baja Municipio
3216,8931,16261,11136,36.8,12.8,4351,4580,20.9,38.4,16.4,...,7.3,76.3,0.0,14.9,4.0,0.0,95.7,0.0,2153.793033,Vieques Municipio
3217,23659,19893,10449,50.0,24.8,11510,12149,22.5,21.2,22.7,...,19.5,83.1,0.1,28.4,0.2,0.1,99.7,0.0,4723.608567,Villalba Municipio
3218,35025,15586,8672,52.4,25.4,16984,18041,27.7,26.0,20.7,...,16.0,87.6,0.0,30.5,0.1,0.0,99.9,0.0,7098.818433,Yabucoa Municipio


In [7]:
# Rank the census counties that are NOT already in the labelled data by their
# predicted foot traffic, and save the ranking.
# Depends on `census_data`, `scaler` and `best_rf_model` from earlier cells.
target_with_foot_traffic = pd.read_csv('target_with_footTraffic.csv')
target_counties = target_with_foot_traffic['County'].unique()

census_data_filtered = census_data[socioeconomic_cols].copy()

scaled_census_data = scaler.transform(census_data_filtered)

# The model was fitted on a DataFrame with named columns; predicting on a bare
# ndarray skips scikit-learn's feature-name check (and warns in recent
# versions), so hand it a frame with the same column names and order.
predicted_foot_traffic = best_rf_model.predict(
    pd.DataFrame(scaled_census_data, columns=socioeconomic_cols,
                 index=census_data_filtered.index)
)

census_data_with_predictions = census_data_filtered.copy()
census_data_with_predictions['PredictedFootTraffic'] = predicted_foot_traffic
census_data_with_predictions['County'] = census_data['County'].values

# NOTE(review): exclusion is by county name alone. If the same name occurs in
# more than one state, every county with that name is dropped -- confirm
# whether a state or county-ID key is available in both files and should be
# used instead.
census_data_with_predictions = census_data_with_predictions[~census_data_with_predictions['County'].isin(target_counties)]

# Highest predicted foot traffic first.
census_data_with_predictions_sorted = census_data_with_predictions.sort_values(by='PredictedFootTraffic', ascending=False)

census_data_with_predictions_sorted.to_csv('target_predictions.csv', index=False)

print(census_data_with_predictions_sorted.head())


      TotalPop  Income  IncomePerCap  Poverty  Unemployment      Men    Women  \
204   10105722   61015         30798     17.0           7.8  4979641  5126081   
103    4155501   58580         30186     15.7           6.0  2055464  2100037   
2623   4525519   57791         30856     16.8           6.4  2251060  2274459   
610    5238541   59426         33722     15.9           8.7  2540704  2697837   
215    3155816   81851         37603     12.1           5.8  1558245  1597571   

      Professional  Service  Office  ...  Production  Drive  Transit  \
204           36.4     19.0    24.1  ...        12.8   73.7      6.3   
103           37.4     18.1    26.7  ...         9.5   76.4      2.2   
2623          35.4     17.4    22.9  ...        12.8   79.3      2.7   
610           39.3     17.9    23.6  ...        13.1   61.7     18.9   
215           40.9     17.5    24.8  ...        10.2   78.6      2.2   

      MeanCommute  White  Black  Hispanic  Asian  PredictedFootTraffic  \
204   



In [10]:
# First 20 rows of the ranked table, reduced to county name and predicted foot traffic.
census_data_with_predictions_sorted.loc[:, ['County', 'PredictedFootTraffic']].head(20)

Unnamed: 0,County,PredictedFootTraffic
204,Los Angeles County,2402932.0
103,Maricopa County,964530.3
2623,Harris County,919891.7
610,Cook County,899142.4
215,Orange County,780521.4
222,San Diego County,768058.6
362,Miami-Dade County,636391.2
1851,Kings County,567607.1
1868,Queens County,565377.8
2579,Dallas County,550093.2
