In [55]:
import pandas as pd
import networkx as nx

# Set Pandas display options to ensure all columns are visible
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', 1000)       # Increase display width
pd.set_option('display.max_colwidth', None)  # Prevent column content from being truncated

# Load the bike-sharing data
file_path = 'zhong.csv'  # Replace with the actual file path
bike_data = pd.read_csv(file_path)

# Create a directed graph
graph = nx.DiGraph()

# Add edges to the graph based on start and end stations
for _, row in bike_data.iterrows():
    start_station = row['start_station_name']
    end_station = row['end_station_name']
    graph.add_edge(start_station, end_station)

# Calculate the degree centrality of each station
betweenness_centrality = nx.degree_centrality(graph)

# Sort stations by centrality and extract the top 10
top_10_stations = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:10]

# Convert the result to a DataFrame for better visualization
top_10_df = pd.DataFrame(top_10_stations, columns=['Station', 'Betweenness Centrality'])

# Print the top 10 important stations
print(top_10_df)


                        Station  Betweenness Centrality
0             Clark St & Elm St                0.573311
1         Dearborn St & Erie St                0.528830
2     Desplaines St & Kinzie St                0.522241
3       Clark St & Armitage Ave                0.495881
4        Clark St & Schiller St                0.492586
5     Ashland Ave & Division St                0.485997
6   Dearborn Pkwy & Delaware Pl                0.482702
7  Sheffield Ave & Waveland Ave                0.476112
8        Clark St & Lincoln Ave                0.472817
9     Larrabee St & Webster Ave                0.471170


In [56]:
from geopy.distance import geodesic

# Add latitude and longitude to the important stations data
top_10_df = top_10_df.merge(
    bike_data[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates(),
    left_on='Station',
    right_on='start_station_name',
    how='left'
)

# Rename latitude and longitude columns
top_10_df.rename(columns={'start_lat': 'lat', 'start_lng': 'lng'}, inplace=True)

# Get latitude and longitude data for all stations
all_stations = bike_data[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates()
all_stations.rename(columns={'start_station_name': 'Station', 'start_lat': 'lat', 'start_lng': 'lng'}, inplace=True)

# Find the nearest station for each important station, ensuring no duplicates
nearest_stations = []

for _, row in top_10_df.iterrows():
    target_station = row['Station']
    target_coords = (row['lat'], row['lng'])

    # Calculate distances to other stations, excluding the target station itself
    filtered_stations = all_stations[all_stations['Station'] != target_station].copy()

    # Compute the distance to each station
    filtered_stations['Distance'] = filtered_stations.apply(
        lambda x: geodesic(target_coords, (x['lat'], x['lng'])).meters, axis=1
    )

    # Find the nearest station, ensuring no duplicates
    for i in range(len(filtered_stations)):
        nearest = filtered_stations.nsmallest(i + 1, 'Distance').iloc[-1]
        if nearest['Station'] not in top_10_df['Station'].values:
            nearest['Target_Station'] = target_station
            nearest_stations.append(nearest)
            break

# Combine results into a DataFrame
nearest_stations_df = pd.concat(nearest_stations, axis=1).T  # Transpose to align data

# Format the nearest stations data as a clean table
cleaned_nearest_stations_df = nearest_stations_df[['Target_Station', 'Station', 'Distance']]

# Rename columns
cleaned_nearest_stations_df.columns = ['Target_Station', 'Nearest_Station', 'Distance']

# Display the results
print(cleaned_nearest_stations_df)


                    Target_Station             Nearest_Station    Distance
311              Clark St & Elm St           Wells St & Elm St  249.918245
8            Dearborn St & Erie St       LaSalle Dr & Huron St   268.28195
1357     Desplaines St & Kinzie St        Clinton St & Lake St  406.353032
427        Clark St & Armitage Ave   Sedgwick St & Webster Ave  483.903805
533         Clark St & Schiller St    Wells St & Evergreen Ave   309.54453
1061     Ashland Ave & Division St  Ashland Ave & Blackhawk St   401.23225
323    Dearborn Pkwy & Delaware Pl       State St & Pearson St  203.711362
51    Sheffield Ave & Waveland Ave         Clark St & Grace St  419.581692
89          Clark St & Lincoln Ave       Wells St & Concord Ln  399.943859
962      Larrabee St & Webster Ave  Larrabee St & Armitage Ave  412.302791


In [57]:
import pandas as pd

# Convert time columns to datetime format and extract hourly intervals
bike_data['start_time'] = pd.to_datetime(bike_data['start_time'])
bike_data['end_time'] = pd.to_datetime(bike_data['end_time'])
bike_data['hour'] = bike_data['start_time'].dt.floor('H')  # Round to the nearest hour

# Count hourly outflows (departures)
hourly_outflow = bike_data.groupby(['start_station_name', 'hour']).size().rename('outflow')

# Count hourly inflows (arrivals)
hourly_inflow = bike_data.groupby(['end_station_name', 'hour']).size().rename('inflow')

# Combine inflow and outflow statistics
hourly_stats = pd.concat([hourly_outflow, hourly_inflow], axis=1).fillna(0)

# Calculate total visits as the sum of inflow and outflow
hourly_stats['total_visits'] = hourly_stats['inflow'] + hourly_stats['outflow']

# Retrieve the list of target and nearest stations (replace with actual lists from prior results)
target_stations = cleaned_nearest_stations_df['Target_Station'].unique()

# Filter statistics for target stations
target_hourly_stats = hourly_stats.loc[hourly_stats.index.get_level_values(0).isin(target_stations)]

# Reset index for easier handling
target_hourly_stats = target_hourly_stats.reset_index()

# Sort by 'total_visits' and group by 'level_0' (station names)
top_100_target_visits = (
    target_hourly_stats.sort_values(by='total_visits', ascending=False)  # Sort by total visits in descending order
    .groupby('level_0', group_keys=False)  # Group by station name
    .head(100)  # Keep the top 100 records for each station
)

# Rename 'level_0' to 'Station' for better clarity
top_100_target_visits.rename(columns={'level_0': 'Station'}, inplace=True)
formatted_top_100 = top_100_target_visits.reset_index(drop=True)  # Remove the default row index

# Select necessary columns and reorder them
formatted_top_100 = formatted_top_100[['Station', 'hour', 'inflow', 'outflow', 'total_visits']]

# Format the time column as a string
formatted_top_100['hour'] = formatted_top_100['hour'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Display results
print(top_100_target_visits)


                        Station                hour  outflow  inflow  total_visits
499     Clark St & Armitage Ave 2020-04-26 13:00:00     17.0    11.0          28.0
2513  Larrabee St & Webster Ave 2020-04-19 15:00:00     11.0    14.0          25.0
2112  Desplaines St & Kinzie St 2020-04-07 16:00:00     16.0     8.0          24.0
1857      Dearborn St & Erie St 2020-04-11 15:00:00     13.0    11.0          24.0
2576  Larrabee St & Webster Ave 2020-04-26 14:00:00     10.0    13.0          23.0
...                         ...                 ...      ...     ...           ...
3178     Clark St & Lincoln Ave 2020-04-12 10:00:00      0.0     3.0           3.0
1090     Clark St & Lincoln Ave 2020-04-20 13:00:00      3.0     0.0           3.0
1091     Clark St & Lincoln Ave 2020-04-20 14:00:00      2.0     1.0           3.0
1098     Clark St & Lincoln Ave 2020-04-21 10:00:00      1.0     2.0           3.0
1101     Clark St & Lincoln Ave 2020-04-21 17:00:00      2.0     1.0           3.0

[10

In [58]:
# Get the top 10 target stations and time periods
top_10 = (
    formatted_top_100.sort_values(by='total_visits', ascending=False)
    .head(10)  # Retrieve the top 10 records
)

# Initialize the result storage
results = []

# Iterate through the top 10 records to find the nearest station and calculate total visits for the corresponding time period
for _, row in top_10.iterrows():
    target_station = row['Station']  # Target station
    time_period = row['hour']  # Time period

    # Find the corresponding nearest station
    nearest_station = cleaned_nearest_stations_df[
        cleaned_nearest_stations_df['Target_Station'] == target_station
    ]['Nearest_Station'].values[0]

    # Calculate total visits for the target station
    target_total_visits = row['total_visits']

    # Calculate total visits for the nearest station during the same time period
    nearest_total_visits = hourly_stats.loc[(nearest_station, time_period), 'total_visits'] \
        if (nearest_station, time_period) in hourly_stats.index else 0

    # Compute the sum of total visits for the station pair
    pair_total_visits = target_total_visits + nearest_total_visits

    # Add the result to the list
    results.append({
        'Target_Station': target_station,
        'Target_Time': time_period,
        'Nearest_Station': nearest_station,
        'Nearest_Time': time_period,
        'Target_Total_Visits': target_total_visits,
        'Nearest_Total_Visits': nearest_total_visits,
        'Pair_Total_Visits': pair_total_visits
    })

# Convert the results into a DataFrame
results_df = pd.DataFrame(results)

# Sort by Pair_Total_Visits in descending order
results_df = results_df.sort_values(by='Pair_Total_Visits', ascending=False)

# Output the results
print(results_df)


                 Target_Station          Target_Time             Nearest_Station         Nearest_Time  Target_Total_Visits  Nearest_Total_Visits  Pair_Total_Visits
0       Clark St & Armitage Ave  2020-04-26 13:00:00   Sedgwick St & Webster Ave  2020-04-26 13:00:00                 28.0                   8.0               36.0
4     Larrabee St & Webster Ave  2020-04-26 14:00:00  Larrabee St & Armitage Ave  2020-04-26 14:00:00                 23.0                   8.0               31.0
3         Dearborn St & Erie St  2020-04-11 15:00:00       LaSalle Dr & Huron St  2020-04-11 15:00:00                 24.0                   6.0               30.0
2     Desplaines St & Kinzie St  2020-04-07 16:00:00        Clinton St & Lake St  2020-04-07 16:00:00                 24.0                   5.0               29.0
6         Dearborn St & Erie St  2020-04-18 13:00:00       LaSalle Dr & Huron St  2020-04-18 13:00:00                 23.0                   6.0               29.0
8  Sheffield Ave