In [14]:
import pandas as pd
import numpy as np
import os




In [20]:
all_files = [f for f in os.listdir(data_path) if f.endswith(".csv")]
dfs = []

for f in all_files:
    print("Reading:", f)
    df_temp = pd.read_csv(
        os.path.join(data_path, f),
        engine='python',        # More tolerant than C engine
        on_bad_lines='skip',    # Skip corrupted rows
        encoding='utf-8'        # Common encoding
    )
    dfs.append(df_temp)

# Combine all
df = pd.concat(dfs, ignore_index=True)
print("Combined shape:", df.shape)


Reading: 202404-divvy-tripdata.csv
Reading: 202407-divvy-tripdata.csv
Reading: 202410-divvy-tripdata.csv
Reading: 202406-divvy-tripdata.csv
Reading: 202411-divvy-tripdata.csv
Reading: 202408-divvy-tripdata.csv
Reading: 202402-divvy-tripdata.csv
Reading: 202401-divvy-tripdata.csv
Reading: 202405-divvy-tripdata.csv
Reading: 202312-divvy-tripdata.csv
Reading: 202403-divvy-tripdata.csv
Reading: 202409-divvy-tripdata.csv
Combined shape: (2935519, 13)


In [21]:
# Preview
print(df.shape)
df.head()

(2935519, 13)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,743252713F32516B,classic_bike,2024-04-22 19:08:21,2024-04-22 19:12:56,Aberdeen St & Jackson Blvd,13157,Desplaines St & Jackson Blvd,15539,41.877726,-87.654787,41.878119,-87.643948,member
1,BE90D33D2240C614,electric_bike,2024-04-11 06:19:24,2024-04-11 06:22:21,Aberdeen St & Jackson Blvd,13157,Desplaines St & Jackson Blvd,15539,41.877722,-87.654959,41.878119,-87.643948,member
2,D47BBDDE7C40DD61,classic_bike,2024-04-20 11:13:13,2024-04-20 11:29:31,Sheridan Rd & Montrose Ave,TA1307000107,Ashland Ave & Belle Plaine Ave,13249,41.96167,-87.65464,41.956057,-87.668835,member
3,6684E760BF9EA9B5,classic_bike,2024-04-04 18:39:20,2024-04-04 18:43:06,Aberdeen St & Jackson Blvd,13157,Desplaines St & Jackson Blvd,15539,41.877726,-87.654787,41.878119,-87.643948,member
4,CA9EFC0D24C24A27,electric_bike,2024-04-19 19:30:20,2024-04-19 20:07:42,Sheridan Rd & Montrose Ave,TA1307000107,Stetson Ave & South Water St,TA1308000029,41.961613,-87.654615,41.886835,-87.62232,member


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935519 entries, 0 to 2935518
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           object 
 10  end_lat             float64
 11  end_lng             object 
 12  member_casual       object 
dtypes: float64(2), object(11)
memory usage: 291.2+ MB


In [23]:
# Check null values
df.isnull().sum()

Unnamed: 0,0
ride_id,0
rideable_type,0
started_at,0
ended_at,1
start_station_name,447250
start_station_id,447249
end_station_name,446631
end_station_id,446632
start_lat,2
start_lng,2


In [24]:
# Preview unique user types
df['member_casual'].value_counts()

Unnamed: 0_level_0,count
member_casual,Unnamed: 1_level_1
member,1895644
casual,1039868


In [26]:
#convert to date and time
df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce', format='mixed')
df['ended_at'] = pd.to_datetime(df['ended_at'], errors='coerce', format='mixed')


In [27]:
#Convert Ride Length in Minutes
df['ride_length'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60


In [28]:
#Create day_of_week
df['day_of_week'] = df['started_at'].dt.day_name()


In [29]:
#Remove invalid rides

# Remove negative or zero durations
df = df[df['ride_length'] > 0]

# Remove extremely long rides (> 24 hours)
df = df[df['ride_length'] < 24*60]

In [30]:
#Remove duplicates
df.drop_duplicates(subset='ride_id', inplace=True)


In [31]:
# Validate the Cleaned Data
print("Number of rides after cleaning:", len(df))
print(df['member_casual'].value_counts())

print("Average ride length by user type:")
print(df.groupby('member_casual')['ride_length'].mean())



Number of rides after cleaning: 2901586
member_casual
member    1875165
casual    1026417
Name: count, dtype: int64
Average ride length by user type:
member_casual
casual    21.392059
member    12.128760
Name: ride_length, dtype: float64


In [32]:
# Save Cleaned Data
cleaned_path = "/content/cleaned_cyclistic.csv"
df.to_csv(cleaned_path, index=False)
print("Cleaned dataset saved at:", cleaned_path)


Cleaned dataset saved at: /content/cleaned_cyclistic.csv
