In [None]:
import multiprocessing as mp

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Load the 2022 and 2023 hourly subway ridership frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
df_2022, df_2023 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        r'D:\capstone_F24\data\MTA_Subway_Hourly_Ridership_2022.pkl',
        r'D:\capstone_F24\data\MTA_Subway_Hourly_Ridership_2023.pkl',
    )
)

In [None]:
!conda install scipy

In [None]:
# Stack the 2022 and 2023 frames row-wise into a single multi-year frame.
# ignore_index=True rebuilds a fresh 0..n-1 index so row labels are unique; without it
# each year's labels are carried over and can repeat in the combined frame.
# NOTE(review): assumes the per-year index carries no meaning (timestamps are read from
# the 'transit_timestamp' column further down) — confirm.
ridership_data_all_years = pd.concat([df_2022, df_2023], axis=0, ignore_index=True)

In [None]:
# Work on an independent deep copy so the concatenated source frame is left untouched.
hourly_ridership_data = ridership_data_all_years.copy(deep=True)

In [None]:
# Print column names, dtypes and non-null counts before any type conversion.
hourly_ridership_data.info()

In [None]:
# Coerce the ridership column to a numeric dtype; values that cannot be parsed become NaN.
ridership_numeric = pd.to_numeric(hourly_ridership_data['ridership'], errors='coerce')
hourly_ridership_data['ridership'] = ridership_numeric

In [None]:
# Preview the first rows after the numeric conversion of 'ridership'.
hourly_ridership_data.head()

In [None]:
# Re-check dtypes and non-null counts now that 'ridership' has been coerced to numeric.
hourly_ridership_data.info()

In [None]:
# Read the events table and print its column/dtype summary.
events_csv_path = r'D:\capstone_F24\data\events.csv'
events_df = pd.read_csv(events_csv_path)
events_df.info()

In [None]:
# Read the weather table and print its column/dtype summary.
weather_csv_path = r'D:\capstone_F24\data\weather_data_ny.csv'
weather_pd = pd.read_csv(weather_csv_path)
weather_pd.info()

## The Yanks are coming...

How does a Yankees home game impact ridership in the Bronx and the surrounding boroughs? Let's take a look.

In [None]:
# Keep only the rows whose borough is the Bronx.
in_bronx = hourly_ridership_data['borough'].eq('Bronx')
bronx_ridership = hourly_ridership_data.loc[in_bronx]

In [None]:
# Select events whose name mentions "Yankees" (case-insensitive; missing names never match).
mentions_yankees = events_df['event_name'].str.contains('Yankees', case=False, na=False)
yankees_games = events_df.loc[mentions_yankees].copy()

# Parse the start timestamps; values that cannot be parsed become NaT.
parsed_start = pd.to_datetime(yankees_games['start_date_time'], errors='coerce')
yankees_games['start_date_time'] = parsed_start

# Distinct calendar dates on which a matching event starts.
yankees_game_dates = yankees_games['start_date_time'].dt.date.unique()

In [None]:
# Sum Bronx ridership into one total per calendar day.
bronx_daily_totals = bronx_ridership.resample('D', on='transit_timestamp')['ridership'].sum()
daily_bronx_ridership = bronx_daily_totals.reset_index()

# Flag each day whose date appears among the Yankees game dates.
bronx_day_dates = daily_bronx_ridership['transit_timestamp'].dt.date
daily_bronx_ridership['is_game_day'] = bronx_day_dates.isin(yankees_game_dates)

In [None]:
# Mean daily Bronx ridership for game days and for non-game days.
mean_by_day_type = daily_bronx_ridership.groupby('is_game_day')['ridership'].mean()
impact_analysis = mean_by_day_type.reset_index()

# Swap the boolean flag for a readable label used on the chart axis.
day_type_labels = {True: 'Game Day', False: 'Non-Game Day'}
impact_analysis['is_game_day'] = impact_analysis['is_game_day'].map(day_type_labels)

In [None]:
# Bar chart: average Bronx ridership on game days vs. non-game days.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=impact_analysis, x='is_game_day', y='ridership', ax=ax)
ax.set_title('Average Ridership in the Bronx on Yankees Game Days vs. Non-Game Days')
ax.set_xlabel('Day Type')
ax.set_ylabel('Average Ridership')
ax.grid()
plt.show()

It looks like ridership is slightly higher in the Bronx on game days, but not by a lot. Let's take a look at ridership averages across all boroughs. Which is most impacted by game day?

In [None]:
# Flag every hourly row whose calendar date is a Yankees game date.
hourly_dates = hourly_ridership_data['transit_timestamp'].dt.date
hourly_ridership_data['is_game_day'] = hourly_dates.isin(yankees_game_dates)

# Mean ridership per (borough, game-day status) pair.
mean_by_borough_and_status = hourly_ridership_data.groupby(['borough', 'is_game_day'])['ridership'].mean()
borough_ridership_avg = mean_by_borough_and_status.reset_index()

# Add a readable label column for the plot legend (the boolean flag column is kept).
status_labels = {True: 'Game Day', False: 'Non-Game Day'}
borough_ridership_avg['game_day'] = borough_ridership_avg['is_game_day'].replace(status_labels)

In [None]:
# Grouped bar chart: average ridership per borough, split by game-day status.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=borough_ridership_avg,
    x='borough',
    y='ridership',
    hue='game_day',
    palette='Set2',
    ax=ax,
)

# Title, axis labels and legend.
ax.set_title('Average Ridership Comparison on Game Days vs Non-Game Days by Borough')
ax.set_xlabel('Borough')
ax.set_ylabel('Average Ridership')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt borough names for readability
ax.legend(title='Game Day Status')
fig.tight_layout()  # keep the rotated labels from being clipped
plt.show()

Again, it looks like average ridership doesn't change much in any borough except Manhattan (where fewer people likely own cars). Is the difference statistically significant? We'll run a t-test comparing game days and non-game days to find out.

In [None]:
# Welch's t-test per borough: do daily ridership totals differ between Yankees game days
# and non-game days?
#
# The samples are built from daily totals per borough. borough_ridership_avg cannot be used
# for this: it holds a single mean per (borough, game-day status) pair, so each sample would
# contain exactly one value and the test would return NaN.

# Total ridership per borough per calendar day.
service_date = hourly_ridership_data['transit_timestamp'].dt.normalize().rename('service_date')
daily_borough_ridership = (
    hourly_ridership_data
    .groupby(['borough', service_date], observed=True)['ridership']
    .sum()
    .reset_index()
)

# Flag each borough-day whose date appears among the Yankees game dates.
daily_borough_ridership['is_game_day'] = (
    daily_borough_ridership['service_date'].dt.date.isin(yankees_game_dates)
)

# Initialize a dictionary to store results
results = {}

# Get unique boroughs
boroughs = daily_borough_ridership['borough'].unique()

# Conduct t-test for each borough
for borough in boroughs:
    # Daily totals for this borough only
    borough_data = daily_borough_ridership[daily_borough_ridership['borough'] == borough]

    # Separate daily ridership into game days and non-game days
    game_day_ridership = borough_data.loc[borough_data['is_game_day'], 'ridership']
    non_game_day_ridership = borough_data.loc[~borough_data['is_game_day'], 'ridership']

    # equal_var=False selects Welch's t-test (no equal-variance assumption)
    t_stat, p_value = stats.ttest_ind(game_day_ridership, non_game_day_ridership, equal_var=False)

    # Store the results
    results[borough] = {
        't_statistic': t_stat,
        'p_value': p_value
    }

# Print the results
for borough, res in results.items():
    print(f"{borough} - t-statistic: {res['t_statistic']:.2f}, p-value: {res['p_value']:.4f}")

In [None]:
# Print the path of the Python interpreter the kernel is running on
# (environment troubleshooting).
import sys
print(sys.executable)

In [2]:
# NOTE(review): a `!` command runs in its own shell, so this activation presumably does not
# change the environment of the already-running kernel — confirm, and select the
# environment when launching the kernel instead.
!conda activate env_lydia_3.12

In [4]:
!conda install scipy

^C
Retrieving notices: ...working... done
Channels:
 - defaults
 - conda-forge
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\one1c\.conda\envs\capstone_1

  added / updated specs:
    - scipy


The following NEW packages will be INSTALLED:

  blas               pkgs/main/win-64::blas-1.0-mkl 
  icc_rt             pkgs/main/win-64::icc_rt-2022.1.0-h6049295_2 
  intel-openmp       pkgs/main/win-64::intel-openmp-2023.1.0-h59b6b97_46320 
  mkl                pkgs/main/win-64::mkl-2023.1.0-h6b88ed4_46358 
  mkl-service        pkgs/main/win-64::mkl-service-2.4.0-py312h2bbff1b_1 
  mkl_fft            pkgs/main/win-64::mkl_fft-1.3.10-py312h827c3e9_0 
  mkl_random         pkgs/main/win-64::mkl_random-1.2.7-py312h0158946_0 
  numpy              pkgs/main/win-64::numpy-1.26.4-py312hfd52020_0 
  numpy-base         pkgs/main/win-64::numpy-base-1.26.4-py312h4dde369_0 
  py

In [9]:
# Load the cleaned 2022 frame and preview its first rows.
# NOTE(review): the recorded error (no module named 'numpy._core.numeric') suggests this
# pickle was written under a newer NumPy than the one in the kernel — confirm the versions.
clean_22 = pd.read_pickle(r'D:\capstone_F24\data\Cleaned_data\df_filled_22.pkl')
clean_22.head()

ModuleNotFoundError: No module named 'numpy._core.numeric'

In [8]:
!pip install numpy --upgrade



In [10]:
import joblib

ModuleNotFoundError: No module named 'joblib'

In [11]:
!pip install joblib



In [12]:
import joblib

ModuleNotFoundError: No module named 'joblib'