In [1]:
# --- STEP 1: IMPORT LIBRARIES ---
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import shutil

# Set visual style for professional charts
sns.set_theme(style="whitegrid")
print("Libraries imported successfully.")

# --- STEP 2: LOAD AND MERGE DATA ---
# This looks for ALL files ending in .csv in the current folder
all_files = glob.glob("*.csv")

li = []

print(f"Found {len(all_files)} CSV files. Merging them now... (This may take a minute)")

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Combine into one big dataframe
if len(li) > 0:
    df = pd.concat(li, axis=0, ignore_index=True)
    print(f"Data merged! Total rows: {df.shape[0]:,}")
else:
    print("No CSV files found. Please upload your data to the Colab 'Files' section.")

# --- STEP 3: DATA CLEANING & PROCESSING ---
# [cite_start]The PDF instructs us to calculate ride_length and day_of_week [cite: 117-119]

# 1. Convert timestamp columns to datetime objects
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# 2. Calculate Ride Length (in minutes)
df['ride_length_minutes'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

# 3. Create 'day_of_week' column
df['day_of_week'] = df['started_at'].dt.day_name()

# 4. Create 'hour' column for hourly analysis
df['hour'] = df['started_at'].dt.hour

# 5. REMOVE BAD DATA
# Remove rides < 1 minute or negative duration
clean_df = df[df['ride_length_minutes'] > 1].copy()

# 6. Ensure days are ordered correctly (Monday -> Sunday)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
clean_df['day_of_week'] = pd.Categorical(clean_df['day_of_week'], categories=day_order, ordered=True)

print(f"Data cleaned. Analyzing {clean_df.shape[0]:,} rides.")

# --- STEP 4: GENERATE AND SAVE IMAGES ---

# Create a directory to store the images
output_dir = 'images'
os.makedirs(output_dir, exist_ok=True)
print(f"Created '{output_dir}' directory.")

# --- CHART 1: Total Rides by Day of Week ---
plt.figure(figsize=(10, 6))
sns.countplot(data=clean_df, x='day_of_week', hue='member_casual', palette='viridis')
plt.title("Total Number of Rides by Day of Week", fontsize=16)
plt.xlabel("Day of Week")
plt.ylabel("Number of Rides")
plt.legend(title='Rider Type')
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure
save_path_1 = os.path.join(output_dir, 'total_rides_day.png')
plt.savefig(save_path_1, dpi=300)
print(f"Saved: {save_path_1}")
plt.close() # Close plot to free memory

# --- CHART 2: Average Ride Duration ---
# Aggregate data first for better plotting
avg_duration = clean_df.groupby(['member_casual', 'day_of_week'], observed=False)['ride_length_minutes'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=avg_duration, x='day_of_week', y='ride_length_minutes', hue='member_casual', palette='magma')
plt.title("Average Ride Duration by Day of Week", fontsize=16)
plt.xlabel("Day of Week")
plt.ylabel("Average Duration (Minutes)")
plt.legend(title='Rider Type')
plt.xticks(rotation=45)
plt.tight_layout()

# Save the figure
save_path_2 = os.path.join(output_dir, 'average_duration.png')
plt.savefig(save_path_2, dpi=300)
print(f"Saved: {save_path_2}")
plt.close()

# --- CHART 3: Hourly Usage Patterns ---
# Aggregate data for line chart
hourly_counts = clean_df.groupby(['member_casual', 'hour'], observed=False).size().reset_index(name='count')

plt.figure(figsize=(10, 6))
sns.lineplot(data=hourly_counts, x='hour', y='count', hue='member_casual', palette='coolwarm', linewidth=2.5)
plt.title("Ride Volume by Hour of Day", fontsize=16)
plt.xlabel("Hour of Day (0-23)")
plt.ylabel("Number of Rides")
plt.xticks(range(0, 24))
plt.grid(True)
plt.tight_layout()

# Save the figure
save_path_3 = os.path.join(output_dir, 'hourly_usage.png')
plt.savefig(save_path_3, dpi=300)
print(f"Saved: {save_path_3}")
plt.close()

# --- STEP 5: ZIP THE IMAGES FOR DOWNLOAD ---
shutil.make_archive('cyclistic_images', 'zip', output_dir)
print("\nSUCCESS! A file named 'cyclistic_images.zip' has been created.")
print("Please look in the file browser on the left and download it.")

Libraries imported successfully.
Found 12 CSV files. Merging them now... (This may take a minute)
Data merged! Total rows: 5,590,832
Data cleaned. Analyzing 5,444,045 rides.
Created 'images' directory.
Saved: images/total_rides_day.png
Saved: images/average_duration.png
Saved: images/hourly_usage.png

SUCCESS! A file named 'cyclistic_images.zip' has been created.
Please look in the file browser on the left and download it.
