In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the daily cycling dataset from its CSV file into a DataFrame
raw_counts_csv = "daily_cycling_data_newyork_28032025.csv"
data = pd.read_csv(raw_counts_csv)

In [3]:
# Let's check the basic information about the dataset:
# column names, non-null counts, dtypes and memory usage
print("\nDataset Info:")
data.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37646 entries, 0 to 37645
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       37646 non-null  object 
 1   date       37646 non-null  object 
 2   latitude   37646 non-null  float64
 3   longitude  37646 non-null  float64
 4   counts     37646 non-null  int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 1.4+ MB


In [4]:
# Peek at the first five records to see what a single row looks like
print("First 5 rows of the dataset:")
print(data.head())

First 5 rows of the dataset:
                   name        date  latitude  longitude  counts
0  111th St at 50th Ave  2022-05-09  40.74563   -73.8525     120
1  111th St at 50th Ave  2022-05-10  40.74563   -73.8525     165
2  111th St at 50th Ave  2022-05-11  40.74563   -73.8525     197
3  111th St at 50th Ave  2022-05-12  40.74563   -73.8525     206
4  111th St at 50th Ave  2022-05-13  40.74563   -73.8525     178


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
print("\nStatistical Summary:")
print(data.describe())


Statistical Summary:
           latitude     longitude        counts
count  37646.000000  37646.000000  37646.000000
mean      40.721509    -73.972243   2123.184694
std        0.048307      0.045473   2031.473255
min       40.584100    -74.072075      0.000000
25%       40.709274    -73.994950    369.000000
50%       40.714573    -73.971382   1699.000000
75%       40.751010    -73.951492   3095.000000
max       40.857669    -73.852500  13346.000000


In [6]:
# Parse the `date` strings into datetime values so that the `.dt` accessors
# and the date arithmetic used further down work on this column
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates

In [8]:
# Per-station coverage profile: first and last observation date, number of
# rows and of distinct dates, and basic statistics of the daily counts.
coverage_spec = {
    'start_date': ('date', 'min'),
    'end_date': ('date', 'max'),
    'total_days': ('date', 'count'),      # number of rows for the station
    'unique_dates': ('date', 'nunique'),  # number of distinct dates
    'min_count': ('counts', 'min'),
    'max_count': ('counts', 'max'),
    'mean_count': ('counts', 'mean'),
    'std_count': ('counts', 'std'),
}
station_summary = data.groupby('name').agg(**coverage_spec).reset_index()

# Inclusive calendar span between the first and last observation, and how
# many days inside that span have no record at all
span_in_days = (station_summary['end_date'] - station_summary['start_date']).dt.days + 1
station_summary = station_summary.assign(
    date_range=span_in_days,
    missing_days=span_in_days - station_summary['unique_dates'],
)

# Stations with the most rows come first
station_summary = station_summary.sort_values('total_days', ascending=False)

# Show the resulting table
print("Station Data Summary:")
print(station_summary)

Station Data Summary:
                                          name start_date   end_date  \
4                    Brooklyn Bridge Bike Path 2015-01-01 2025-01-20   
11                   Manhattan Bridge Ped Path 2015-01-01 2025-01-20   
17               Williamsburg Bridge Bike Path 2015-01-01 2025-01-20   
6        Ed Koch Queensboro Bridge Shared Path 2015-01-01 2025-01-20   
16                         Staten Island Ferry 2016-03-31 2025-01-20   
14                          Prospect Park West 2016-11-07 2025-01-20   
10  Kent Ave btw North 8th St and North 9th St 2016-11-22 2025-01-20   
15                              Pulaski Bridge 2017-06-24 2025-01-20   
5                     Columbus Ave at 86th St. 2019-12-05 2025-01-19   
2                    Amsterdam Ave at 86th St. 2019-12-05 2025-01-20   
1                          8th Ave at 50th St. 2019-12-05 2025-01-20   
3       Brooklyn Bridge Bicycle Path (Roadway) 2021-09-14 2025-01-20   
0                         111th St at 50th

In [9]:
# Derive the calendar year of every observation.
# NOTE: this adds a persistent `year` column to `data`.
data['year'] = data['date'].dt.year

# Number of distinct dates recorded for each (station, year) pair
yearly_data_summary = (
    data.groupby(['name', 'year'])['date']
    .nunique()
    .reset_index(name='unique_dates')
)

# Reshape to one row per station and one column per year; a station/year
# combination without any record shows up as 0
yearly_data_pivot = (
    yearly_data_summary
    .pivot(index='name', columns='year', values='unique_dates')
    .fillna(0)
)

# Show the year-wise number of unique dates for each station
print("Year-wise unique dates per station:")
print(yearly_data_pivot)

Year-wise unique dates per station:
year                                         2015   2016   2017   2018   2019  \
name                                                                            
111th St at 50th Ave                          0.0    0.0    0.0    0.0    0.0   
8th Ave at 50th St.                           0.0    0.0    0.0    0.0   27.0   
Amsterdam Ave at 86th St.                     0.0    0.0    0.0    0.0   27.0   
Brooklyn Bridge Bicycle Path (Roadway)        0.0    0.0    0.0    0.0    0.0   
Brooklyn Bridge Bike Path                   365.0  366.0  365.0  365.0  365.0   
Columbus Ave at 86th St.                      0.0    0.0    0.0    0.0   27.0   
Ed Koch Queensboro Bridge Shared Path       365.0  366.0  365.0  365.0  364.0   
Emmons Ave Bikes                              0.0    0.0    0.0    0.0    0.0   
Fountain Ave                                  0.0    0.0    0.0    0.0    0.0   
High Bridge Bikes                             0.0    0.0    0.0    0.0   

In [10]:
# Save the result to a CSV file for further analysis
# (the station name is the pivot's index, so it is written as the first column)
yearly_data_pivot.to_csv('yearly_unique_dates_per_station.csv')

In [13]:
# Based on the observations for the dataset, the station/year combinations
# listed below are removed, together with everything recorded in 2025.
#
# All removals are expressed as one boolean mask built from a single
# year -> stations table, so adding or removing a combination is a one-line
# change and the frame is filtered exactly once.
record_year = data['date'].dt.year

# Kept under their original names
stations_to_drop_2019 = ['8th Ave at 50th St.', 'Columbus Ave at 86th St.', 'Amsterdam Ave at 86th St.']
stations_to_drop_2023 = ['Ocean Pkwy at Avenue J', 'Pelham Pkwy']

# year -> stations whose records for that year are dropped
station_years_to_drop = {
    2016: ['Kent Ave btw North 8th St and North 9th St', 'Prospect Park West'],
    2017: ['Pulaski Bridge'],
    2019: stations_to_drop_2019,
    2021: ['Brooklyn Bridge Bicycle Path (Roadway)'],
    2022: ['Fountain Ave', 'Willis Ave Bikes'],
    2023: ['Emmons Ave Bikes', 'High Bridge Bikes', *stations_to_drop_2023],
}

# Drop all data for 2025. Written as the negation of "year < 2025" so that a
# row whose date is missing is dropped as well, exactly like a plain
# `data[year < 2025]` filter would do.
drop_mask = ~(record_year < 2025)

# Drop every listed (station, year) combination
for drop_year, stations in station_years_to_drop.items():
    drop_mask |= data['name'].isin(stations) & (record_year == drop_year)

# Apply the filter once and renumber the rows from 0
data = data[~drop_mask].reset_index(drop=True)

In [14]:
# Save the cleaned dataset. The file name is defined once so that the file
# written and the confirmation message cannot drift apart.
# NOTE(review): if the helper `year` column was added to `data` earlier in
# the session, it is written to this file as well - confirm that is intended.
cleaned_csv_path = 'daily_cycling_data_newyork_07042025.csv'
data.to_csv(cleaned_csv_path, index=False)

print(f"Data cleaning complete. The cleaned dataset has been saved as '{cleaned_csv_path}'.")

Data cleaning complete. The cleaned dataset has been saved as 'daily_cycling_data_newyork_07042025.csv'.


In [16]:
# Check the data consistency again after cleaning: distinct dates per station
# and calendar year, laid out as a station x year table.
# The grouping key is explicitly named 'year'; left unnamed it would inherit
# the name 'date' from the column it is derived from and the year axis of the
# table would be mislabelled as "date".
year_key = data['date'].dt.year.rename('year')
summary = data.groupby(['name', year_key]).agg(
    unique_dates=('date', 'nunique')
).reset_index()

# One row per station, one column per year; 0 where a station has no records
yearly_data_pivot = summary.pivot(index='name', columns='year', values='unique_dates').fillna(0)

print(yearly_data_pivot)

date                                         2015   2016   2017   2018   2019  \
name                                                                            
111th St at 50th Ave                          0.0    0.0    0.0    0.0    0.0   
8th Ave at 50th St.                           0.0    0.0    0.0    0.0    0.0   
Amsterdam Ave at 86th St.                     0.0    0.0    0.0    0.0    0.0   
Brooklyn Bridge Bicycle Path (Roadway)        0.0    0.0    0.0    0.0    0.0   
Brooklyn Bridge Bike Path                   365.0  366.0  365.0  365.0  365.0   
Columbus Ave at 86th St.                      0.0    0.0    0.0    0.0    0.0   
Ed Koch Queensboro Bridge Shared Path       365.0  366.0  365.0  365.0  364.0   
Emmons Ave Bikes                              0.0    0.0    0.0    0.0    0.0   
Fountain Ave                                  0.0    0.0    0.0    0.0    0.0   
High Bridge Bikes                             0.0    0.0    0.0    0.0    0.0   
Kent Ave btw North 8th St an

In [17]:
# Number of distinct calendar years with data for each station.
# The result column is named `n_years` (it holds a count of years, not a date).
station_year_count = (
    data['date'].dt.year
    .groupby(data['name'])
    .nunique()
    .reset_index(name='n_years')
)

# Categorize stations based on the number of years of data
has_multiple_years = station_year_count['n_years'] > 1
station_year_count['data_duration'] = has_multiple_years.map(
    {True: 'More than 1 year', False: '1 year'}
)

# Count the number of stations in each category
station_counts = station_year_count['data_duration'].value_counts()

# `.get(..., 0)` covers the case where a category has no station at all
print("Number of stations with more than 1 year of data:", station_counts.get('More than 1 year', 0))
print("Number of stations with exactly 1 year of data:", station_counts.get('1 year', 0))

# Display the detailed breakdown
print("\nDetailed breakdown:")
print(station_year_count)

Number of stations with more than 1 year of data: 15
Number of stations with exactly 1 year of data: 4

Detailed breakdown:
                                          name  date     data_duration
0                         111th St at 50th Ave     3  More than 1 year
1                          8th Ave at 50th St.     5  More than 1 year
2                    Amsterdam Ave at 86th St.     5  More than 1 year
3       Brooklyn Bridge Bicycle Path (Roadway)     3  More than 1 year
4                    Brooklyn Bridge Bike Path    10  More than 1 year
5                     Columbus Ave at 86th St.     5  More than 1 year
6        Ed Koch Queensboro Bridge Shared Path    10  More than 1 year
7                             Emmons Ave Bikes     1            1 year
8                                 Fountain Ave     2  More than 1 year
9                            High Bridge Bikes     1            1 year
10  Kent Ave btw North 8th St and North 9th St     8  More than 1 year
11                   Man

In [18]:
# Re-inspect row count, columns and dtypes of the frame after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36461 entries, 0 to 36460
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   name       36461 non-null  object        
 1   date       36461 non-null  datetime64[ns]
 2   latitude   36461 non-null  float64       
 3   longitude  36461 non-null  float64       
 4   counts     36461 non-null  int64         
 5   year       36461 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(1), int64(1), object(1)
memory usage: 1.5+ MB


In [None]:
2