In [None]:
import os
import pandas as pd
import re
import polars as pl


In [2]:
# Folder holding the extracted monthly CSV files.
# Set the CYCLISTIC_DATA_DIR environment variable to run the notebook on
# another machine; when it is not set, the original local path is used.
data_path = os.environ.get(
    "CYCLISTIC_DATA_DIR",
    r"C:\Users\anbun\Desktop\Portfolio projects\Google Data Analytics Capstone\Data\12_Months_data",
)

# Output file path (written into the same folder as the monthly input files)
output_file = os.path.join(data_path, "combined_cyclistic_data.csv")


In [None]:
# Combine the 12 monthly trip files (Oct 2024 - Sep 2025) into one pandas DataFrame.
#
# Only files named "yyyymm-<anything>.csv" are loaded. The combined output file is
# saved into this same folder, so matching on ".csv" alone would read it back in
# as a 13th input when the notebook is re-run.
csv_files = sorted(
    f for f in os.listdir(data_path) if re.fullmatch(r"\d{6}-.*\.csv", f)
)

print(f"Found {len(csv_files)} CSV files.")

# Collect the monthly frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
monthly_frames = []

for file in csv_files:
    # Extract yyyymm from filename (e.g., 202304-divvy-tripdata.csv -> 202304)
    month_tag = file[:6]  # first 6 characters
    year = month_tag[:4]
    month = month_tag[4:6]
    month_year = f"{month}-{year}"  # mm-yyyy format

    file_path = os.path.join(data_path, file)
    print(f"Processing {file} ...")

    # Read with pandas: every later cell uses the pandas API (.info(), .isnull(),
    # column assignment), which a polars DataFrame does not provide.
    df = pd.read_csv(file_path)

    # Tag each row with the month of the file it came from
    df["month_year"] = month_year

    monthly_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# monthly files were found.
combined_df = (
    pd.concat(monthly_frames, ignore_index=True) if monthly_frames else pd.DataFrame()
)


Found 12 CSV files.
Processing 202410-divvy-tripdata.csv ...
Processing 202411-divvy-tripdata.csv ...
Processing 202412-divvy-tripdata.csv ...
Processing 202501-divvy-tripdata.csv ...
Processing 202502-divvy-tripdata.csv ...
Processing 202503-divvy-tripdata.csv ...
Processing 202504-divvy-tripdata.csv ...
Processing 202505-divvy-tripdata.csv ...
Processing 202506-divvy-tripdata.csv ...
Processing 202507-divvy-tripdata.csv ...
Processing 202508-divvy-tripdata.csv ...
Processing 202509-divvy-tripdata.csv ...


In [4]:
# (rows, columns) of the combined frame right after loading
combined_df.shape


(5539521, 14)

In [5]:
# Inspect the column names, their dtypes and the frame's memory usage
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5539521 entries, 0 to 5539520
Data columns (total 14 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
 13  month_year          object 
dtypes: float64(4), object(10)
memory usage: 591.7+ MB


In [6]:
# Count the missing values in each column
combined_df.isnull().sum()


ride_id                     0
rideable_type               0
started_at                  0
ended_at                    0
start_station_name    1129029
start_station_id      1129029
end_station_name      1175835
end_station_id        1175835
start_lat                   0
start_lng                   0
end_lat                  5442
end_lng                  5442
member_casual               0
month_year                  0
dtype: int64

In [7]:
# Row count per rideable_type; dropna=False also counts missing values
combined_df["rideable_type"].value_counts(dropna=False)


rideable_type
electric_bike    3462468
classic_bike     2077053
Name: count, dtype: int64

In [8]:
# Row count per rider type (member_casual); dropna=False also counts missing values
combined_df["member_casual"].value_counts(dropna=False)


member_casual
member    3543363
casual    1996158
Name: count, dtype: int64

In [9]:
# Row count per source month (mm-yyyy tag added while loading the files)
combined_df["month_year"].value_counts(dropna=False)


month_year
08-2025    790177
07-2025    763432
09-2025    714759
06-2025    678904
10-2024    616281
05-2025    502456
04-2025    371341
11-2024    335075
03-2025    298155
12-2024    178372
02-2025    151880
01-2025    138689
Name: count, dtype: int64

In [10]:
# Character length of every ride_id, used to confirm the IDs share one format.
# .str.len() is the vectorized string accessor; it replaces a Python-level
# len() call per row.
combined_df['ride_id_length'] = combined_df['ride_id'].astype(str).str.len()


In [11]:
# Row count for each distinct ride_id length, ordered by length
print(combined_df['ride_id_length'].value_counts().sort_index())


ride_id_length
16    5539521
Name: count, dtype: int64


In [12]:
# --- CONVERT DATES TO DATETIME ---
# Parse both timestamp columns; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for timestamp_column in ('started_at', 'ended_at'):
    combined_df[timestamp_column] = pd.to_datetime(
        combined_df[timestamp_column], errors='coerce'
    )


In [13]:
# Re-check the dtypes after converting started_at / ended_at to datetime
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5539521 entries, 0 to 5539520
Data columns (total 15 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  month_year          object        
 14  ride_id_length      int64         
dtypes: datetime64[ns](2), float64(4), int64(1), object(8)
memory usage: 633.9+ MB


In [14]:
# --- CALCULATE RIDE TIME IN MINUTES ---
# Elapsed time between start and end, converted from seconds to minutes.
ride_duration = combined_df['ended_at'] - combined_df['started_at']
combined_df['ride_time_min'] = ride_duration.dt.total_seconds() / 60


In [15]:
# Rides lasting at most 1 minute or at least 1440 minutes (24 hours) are
# treated as unusual.
ride_minutes = combined_df['ride_time_min']
is_unusual_length = ride_minutes.le(1) | ride_minutes.ge(1440)
unusual_rides = combined_df[is_unusual_length]

print(f"Found {len(unusual_rides)} rides with unusual lengths.\n")


Found 141967 rides with unusual lengths.



In [16]:
# Shape before the unusual rides are dropped (baseline for the count printed by the next cell)
combined_df.shape


(5539521, 16)

In [17]:
# --- DROP ROWS ---
# Remove rides of at most 1 minute or at least 1440 minutes by index label.
ride_minutes = combined_df['ride_time_min']
is_unusual_length = ride_minutes.le(1) | ride_minutes.ge(1440)
combined_df = combined_df.drop(index=combined_df.loc[is_unusual_length].index)

# --- DISPLAY TOTAL ROWS AFTER ---
print(f"Rows remaining after deleting unusual rides: {len(combined_df)}")


Rows remaining after deleting unusual rides: 5397554


In [18]:
# Row count after the unusual rides were removed
len(combined_df)


5397554

In [19]:
# --- CHECK FOR EMPTY (NULL or BLANK) START STATION NAMES ---
# A name counts as missing when it is NaN or contains only whitespace.
start_names = combined_df['start_station_name']
start_name_is_blank = start_names.astype(str).str.strip().eq('')
missing_start = combined_df[start_names.isna() | start_name_is_blank]
print(f"Rows with missing start_station_name: {len(missing_start)}")


Rows with missing start_station_name: 1047742


In [20]:
# --- CHECK FOR EMPTY (NULL or BLANK) END STATION NAMES ---
# A name counts as missing when it is NaN or contains only whitespace.
end_names = combined_df['end_station_name']
end_name_is_blank = end_names.astype(str).str.strip().eq('')
missing_end = combined_df[end_names.isna() | end_name_is_blank]
print(f"Rows with missing end_station_name: {len(missing_end)}")


Rows with missing end_station_name: 1067050


In [21]:
# --- CHECK MISSING STATIONS ---
def _rows_with_missing(column):
    """Return the rows of combined_df whose `column` is NaN or whitespace-only."""
    values = combined_df[column]
    return combined_df[values.isna() | (values.astype(str).str.strip() == '')]


missing_start = _rows_with_missing('start_station_name')
missing_end = _rows_with_missing('end_station_name')
missing_start_id = _rows_with_missing('start_station_id')
missing_end_id = _rows_with_missing('end_station_id')


In [22]:
# --- ANALYZE rideable_type DISTRIBUTION FOR MISSING STATIONS ---
# Bike-type breakdown of the rides that have no start station name.
start_type_counts = missing_start['rideable_type'].value_counts(dropna=False)
print(
    "Distribution of RIDEABLE TYPE for rides with missing START station:",
    "-" * 70,
    start_type_counts,
    sep="\n",
)


Distribution of RIDEABLE TYPE for rides with missing START station:
----------------------------------------------------------------------
rideable_type
electric_bike    1047742
Name: count, dtype: int64


In [23]:
# Bike-type breakdown of the rides that have no end station name.
end_type_counts = missing_end['rideable_type'].value_counts(dropna=False)
print(
    "\nDistribution of RIDEABLE TYPE for rides with missing END station:",
    "-" * 70,
    end_type_counts,
    sep="\n",
)



Distribution of RIDEABLE TYPE for rides with missing END station:
----------------------------------------------------------------------
rideable_type
electric_bike    1066923
classic_bike         127
Name: count, dtype: int64


In [24]:
# Bike-type breakdown of the rides whose start_station_id is missing.
# The heading names the START station ID so it matches the frame being
# summarised (missing_start_id).
print("\nDistribution of RIDEABLE TYPE for rides with missing START station ID:")
print("-" * 70)
print(missing_start_id['rideable_type'].value_counts(dropna=False))



Distribution of RIDEABLE TYPE for rides with missing END station:
----------------------------------------------------------------------
rideable_type
electric_bike    1047742
Name: count, dtype: int64


In [25]:
# Bike-type breakdown of the rides whose end_station_id is missing.
# The heading names the END station ID so this output can be told apart from
# the end station *name* breakdown printed earlier.
print("\nDistribution of RIDEABLE TYPE for rides with missing END station ID:")
print("-" * 70)
print(missing_end_id['rideable_type'].value_counts(dropna=False))



Distribution of RIDEABLE TYPE for rides with missing END station:
----------------------------------------------------------------------
rideable_type
electric_bike    1066923
classic_bike         127
Name: count, dtype: int64


In [26]:
# Rides where any of the four station fields (names or IDs) is NaN.
station_columns = [
    'start_station_name',
    'end_station_name',
    'start_station_id',
    'end_station_id',
]
has_missing_station = combined_df[station_columns].isna().any(axis=1)
unusual_start_end = combined_df[has_missing_station]

print(f"Found {len(unusual_start_end)} rides with unusual start or end stations.\n")


Found 1656510 rides with unusual start or end stations.



In [27]:
# --- DROP ROWS ---
# Remove every ride where a start/end station name or ID is NaN.
station_columns = [
    'start_station_name',
    'end_station_name',
    'start_station_id',
    'end_station_id',
]
has_missing_station = combined_df[station_columns].isna().any(axis=1)
combined_df = combined_df.drop(index=combined_df.loc[has_missing_station].index)

# --- DISPLAY TOTAL ROWS AFTER ---
print(f"Rows remaining after deleting unusual start or end stations: {len(combined_df)}")


Rows remaining after deleting unusual start or end stations: 3741044


In [28]:
# Number of distinct start station names
combined_df['start_station_name'].nunique()


1852

In [29]:
# Number of distinct end station names
combined_df['end_station_name'].nunique()


1864

In [30]:
# Number of distinct start station IDs (compare with the distinct name count)
combined_df['start_station_id'].nunique()


3231

In [31]:
# Number of distinct end station IDs (compare with the distinct name count)
combined_df['end_station_id'].nunique()


3259

In [32]:
# Re-count missing values per column after dropping the rows with missing stations
combined_df.isnull().sum()


ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
month_year            0
ride_id_length        0
ride_time_min         0
dtype: int64

In [33]:
# --- GROUP BY STATION NAME AND COUNT DISTINCT IDs ---
# One row per start station name with the number of distinct start_station_id
# values seen for it (SQL: COUNT(DISTINCT start_station_id)).
station_id_counts = (
    combined_df
    .groupby('start_station_name')['start_station_id']
    .nunique()
    .rename('distinct_ids')
    .reset_index()
)


In [34]:
# Keep only the start station names linked to more than one distinct ID.
has_multiple_ids = station_id_counts['distinct_ids'].gt(1)
stations_with_multiple_ids = station_id_counts.loc[has_multiple_ids]
stations_with_multiple_ids


Unnamed: 0,start_station_name,distinct_ids
0,2112 W Peterson Ave,2
1,21st St & Pulaski Rd,2
2,63rd St Beach,2
3,900 W Harrison St,2
4,Aberdeen St & 103rd St,2
...,...,...
1847,Woodlawn Ave & 58th St,2
1848,Woodlawn Ave & 75th St,2
1849,Woodlawn Ave & Lake Park Ave,2
1850,Yates Blvd & 75th St,2


In [35]:
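# Optional drill-down (left disabled): print the distinct IDs recorded for each
# start station name that has more than one ID.
# for name in stations_with_multiple_ids['start_station_name']:
#     ids = combined_df.loc[combined_df['start_station_name'] == name, 'start_station_id'].unique()
#     print(f"{name}: {ids}")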


In [36]:
# --- GROUP BY STATION NAME AND COUNT DISTINCT IDs ---
# One row per end station name with the number of distinct end_station_id
# values seen for it (SQL: COUNT(DISTINCT end_station_id)).
station_id_counts = (
    combined_df
    .groupby('end_station_name')['end_station_id']
    .nunique()
    .rename('distinct_ids')
    .reset_index()
)


In [37]:
# Keep only the end station names linked to more than one distinct ID.
has_multiple_ids = station_id_counts['distinct_ids'].gt(1)
stations_with_multiple_ids = station_id_counts.loc[has_multiple_ids]
stations_with_multiple_ids


Unnamed: 0,end_station_name,distinct_ids
0,2112 W Peterson Ave,2
1,21st St & Pulaski Rd,2
2,63rd St Beach,2
3,900 W Harrison St,2
4,Aberdeen St & 103rd St,2
...,...,...
1859,Woodlawn Ave & 58th St,2
1860,Woodlawn Ave & 75th St,2
1861,Woodlawn Ave & Lake Park Ave,2
1862,Yates Blvd & 75th St,2


In [38]:
print("üîç Checking for NULL or empty values in start_lat:")
start_lat_summary = (
    combined_df['start_lat']
    .value_counts(dropna=False)
    .sort_index()
)


üîç Checking for NULL or empty values in start_lat:


In [39]:
print("\nüîç Checking for NULL or empty values in end_lat:")
end_lat_summary = (
    combined_df['end_lat']
    .value_counts(dropna=False)
    .sort_index()
)



üîç Checking for NULL or empty values in end_lat:


In [40]:
# Identify rows with a null or zero end_lat.
# end_lat is a float64 column, so its string form is never '' and a zero renders
# as '0.0', never '0'. The string comparisons therefore could not match and only
# cost two full-column string conversions; the numeric checks cover both cases.
condition = combined_df['end_lat'].isna() | (combined_df['end_lat'] == 0)

print(f"\nüö´ Rows with invalid end_lat: {condition.sum()}")



üö´ Rows with invalid end_lat: 0


In [41]:
# Shape after all row-level cleaning steps
combined_df.shape


(3741044, 16)

In [42]:
# Preview the first rows of the cleaned data
combined_df.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,month_year,ride_id_length,ride_time_min
370,528F356117BC3840,classic_bike,2024-10-01 19:20:39.525,2024-10-01 19:35:06.147,California Ave & Milwaukee Ave,13084,California Ave & Milwaukee Ave,13084,41.922695,-87.697153,41.922695,-87.697153,member,10-2024,16,14.4437
1438,777AB735E4C2ACA6,classic_bike,2024-10-02 18:59:27.163,2024-10-02 19:06:55.440,Chicago State University,20106,Chicago State University,20106,41.718963,-87.608296,41.718963,-87.608296,casual,10-2024,16,7.471283
1547,98C68BF9E5BFCD85,classic_bike,2024-10-02 18:59:27.091,2024-10-02 19:07:11.791,Chicago State University,20106,Chicago State University,20106,41.718963,-87.608296,41.718963,-87.608296,casual,10-2024,16,7.745
1605,FE234AD3EFBAD12C,electric_bike,2024-10-05 16:01:35.258,2024-10-05 16:13:25.053,Western Ave & Roscoe St,15634,Western Ave & Roscoe St,15634,41.943034,-87.687288,41.943034,-87.687288,member,10-2024,16,11.829917
1702,EB0472207B948EFB,electric_bike,2024-10-26 22:37:12.775,2024-10-26 22:38:45.058,California Ave & Milwaukee Ave,13084,California Ave & Milwaukee Ave,13084,41.922695,-87.697153,41.922695,-87.697153,member,10-2024,16,1.53805


In [43]:
# Column dtypes and memory usage of the cleaned data
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3741044 entries, 370 to 5539520
Data columns (total 16 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  month_year          object        
 14  ride_id_length      int64         
 15  ride_time_min       float64       
dtypes: datetime64[ns](2), float64(5), int64(1), object(8)
memory usage: 485.2+ MB


In [44]:
# Abbreviated month name of each ride's start time ('%b', e.g. 'Oct')
combined_df['month'] = combined_df['started_at'].dt.strftime('%b')


In [45]:
# Abbreviated weekday name of each ride's start time ('%a', e.g. 'Tue')
combined_df['weekday'] = combined_df['started_at'].dt.strftime('%a') 


In [46]:
# Confirm the new month and weekday columns were added
combined_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3741044 entries, 370 to 5539520
Data columns (total 18 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  month_year          object        
 14  ride_id_length      int64         
 15  ride_time_min       float64       
 16  month               object        
 17  weekday             object        
dtypes: datetime64[ns](2), float64(5), int64(1), object(10)
memory usage: 542.3+ MB


In [47]:
# Display the frame (the notebook renders a truncated preview of the first and last rows)
combined_df


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,month_year,ride_id_length,ride_time_min,month,weekday
370,528F356117BC3840,classic_bike,2024-10-01 19:20:39.525,2024-10-01 19:35:06.147,California Ave & Milwaukee Ave,13084,California Ave & Milwaukee Ave,13084,41.922695,-87.697153,41.922695,-87.697153,member,10-2024,16,14.443700,Oct,Tue
1438,777AB735E4C2ACA6,classic_bike,2024-10-02 18:59:27.163,2024-10-02 19:06:55.440,Chicago State University,20106,Chicago State University,20106,41.718963,-87.608296,41.718963,-87.608296,casual,10-2024,16,7.471283,Oct,Wed
1547,98C68BF9E5BFCD85,classic_bike,2024-10-02 18:59:27.091,2024-10-02 19:07:11.791,Chicago State University,20106,Chicago State University,20106,41.718963,-87.608296,41.718963,-87.608296,casual,10-2024,16,7.745000,Oct,Wed
1605,FE234AD3EFBAD12C,electric_bike,2024-10-05 16:01:35.258,2024-10-05 16:13:25.053,Western Ave & Roscoe St,15634,Western Ave & Roscoe St,15634,41.943034,-87.687288,41.943034,-87.687288,member,10-2024,16,11.829917,Oct,Sat
1702,EB0472207B948EFB,electric_bike,2024-10-26 22:37:12.775,2024-10-26 22:38:45.058,California Ave & Milwaukee Ave,13084,California Ave & Milwaukee Ave,13084,41.922695,-87.697153,41.922695,-87.697153,member,10-2024,16,1.538050,Oct,Sat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5539516,3C53DEC3D2720737,electric_bike,2025-09-19 12:39:14.679,2025-09-19 12:42:42.164,Harper Ave & 59th St,CHI00584,Ellis Ave & 58th St,CHI00460,41.787943,-87.588315,41.788746,-87.601334,member,09-2025,16,3.458083,Sep,Fri
5539517,DDD88B33A7D1A53E,classic_bike,2025-09-21 15:50:59.008,2025-09-21 15:56:32.850,Kimball Ave & Belmont Ave,CHI00577,Avers Ave & Belmont Ave,CHI00962,41.939394,-87.712068,41.939408,-87.723574,member,09-2025,16,5.564033,Sep,Sun
5539518,98826B3F957759B5,electric_bike,2025-09-12 20:48:51.509,2025-09-12 21:05:00.180,Desplaines St & Jackson Blvd,CHI00518,Clark St & North Ave,CHI00444,41.878119,-87.643948,41.911974,-87.631942,member,09-2025,16,16.144517,Sep,Fri
5539519,1872E41E117B1E69,classic_bike,2025-09-03 07:35:26.329,2025-09-03 07:39:48.206,Harper Ave & 59th St,CHI00584,Ellis Ave & 58th St,CHI00460,41.787943,-87.588315,41.788746,-87.601334,member,09-2025,16,4.364617,Sep,Wed


In [48]:
# Assign each ride a meteorological season based on its month abbreviation.
months_by_season = {
    'Winter': ('Dec', 'Jan', 'Feb'),
    'Spring': ('Mar', 'Apr', 'May'),
    'Summer': ('Jun', 'Jul', 'Aug'),
    'Fall': ('Sep', 'Oct', 'Nov'),
}
# Invert to a month -> season lookup for Series.map
season_map = {
    month_abbr: season_name
    for season_name, month_abbrs in months_by_season.items()
    for month_abbr in month_abbrs
}

combined_df['season'] = combined_df['month'].map(season_map)

# Ordered categorical so seasons sort Winter < Spring < Summer < Fall
season_order = ['Winter', 'Spring', 'Summer', 'Fall']
season_dtype = pd.CategoricalDtype(categories=season_order, ordered=True)
combined_df['season'] = combined_df['season'].astype(season_dtype)


In [49]:
# Preview the first rows, now including the season column
combined_df.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,month_year,ride_id_length,ride_time_min,month,weekday,season
370,528F356117BC3840,classic_bike,2024-10-01 19:20:39.525,2024-10-01 19:35:06.147,California Ave & Milwaukee Ave,13084,California Ave & Milwaukee Ave,13084,41.922695,-87.697153,41.922695,-87.697153,member,10-2024,16,14.4437,Oct,Tue,Fall
1438,777AB735E4C2ACA6,classic_bike,2024-10-02 18:59:27.163,2024-10-02 19:06:55.440,Chicago State University,20106,Chicago State University,20106,41.718963,-87.608296,41.718963,-87.608296,casual,10-2024,16,7.471283,Oct,Wed,Fall
1547,98C68BF9E5BFCD85,classic_bike,2024-10-02 18:59:27.091,2024-10-02 19:07:11.791,Chicago State University,20106,Chicago State University,20106,41.718963,-87.608296,41.718963,-87.608296,casual,10-2024,16,7.745,Oct,Wed,Fall
1605,FE234AD3EFBAD12C,electric_bike,2024-10-05 16:01:35.258,2024-10-05 16:13:25.053,Western Ave & Roscoe St,15634,Western Ave & Roscoe St,15634,41.943034,-87.687288,41.943034,-87.687288,member,10-2024,16,11.829917,Oct,Sat,Fall
1702,EB0472207B948EFB,electric_bike,2024-10-26 22:37:12.775,2024-10-26 22:38:45.058,California Ave & Milwaukee Ave,13084,California Ave & Milwaukee Ave,13084,41.922695,-87.697153,41.922695,-87.697153,member,10-2024,16,1.53805,Oct,Sat,Fall


In [51]:
# Label each ride with its start hour on a 12-hour clock, e.g. '1 am', '12 pm'.
hour_labels = combined_df['started_at'].dt.strftime('%I %p')  # e.g., '01 AM'
hour_labels = hour_labels.str.replace(r'^0', '', regex=True)  # drop leading zero -> '1 AM'
hour_labels = hour_labels.str.lower()                          # '1 am'

# Chronological order of the labels: 12 am, 1 am, ..., 11 am, 12 pm, ..., 11 pm
clock_hours = [12, *range(1, 12)]
hours_order = [f'{h} {suffix}' for suffix in ('am', 'pm') for h in clock_hours]

hour_dtype = pd.CategoricalDtype(categories=hours_order, ordered=True)
combined_df['hour'] = hour_labels.astype(hour_dtype)


In [52]:
# --- SAVE THE COMBINED FILE ---
# Write the cleaned frame with its derived columns to output_file;
# index=False leaves the row index out of the CSV.
combined_df.to_csv(output_file, index=False)
