## Step 1: Import Libraries

In [2]:
# Import libraries 

import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime
import glob
from dotenv import load_dotenv
from IPython.display import Markdown, display

load_dotenv()
print("Libraries imported successfully")

Libraries imported successfully


## Step 2: Create path

In [3]:
# project configuration

DATA_PATH = "..data/raw/"
OUTPUT_PATH = "data/processed/"
NOAA_TOKEN = os.getenv('NOAA_TOKEN')
os.makedirs(OUTPUT_PATH, exist_ok=True)

project_info = {
    "project_name": "NYC CitiBike Demand Analysis 2022",
    "data_source": "Citi Bike System Data",
    "year": 2022,
    "weather_station": "LaGuardia Airport (NYC)",
    "station_id": "GHCND:USW00014732"
}

markdown_content = f"""
# {project_info['project_name']}

## Project Overview
- **Data Source**: {project_info['data_source']}
- **Analysis Period**: {project_info['year']}
- **Weather Station**: {project_info['weather_station']}
- **Station ID**: {project_info['station_id']}

## Analysis Objectives
1. Identify peak demand periods and popular stations
2. Analyze seasonal patterns and weather impact
3. Optimize bike distribution across NYC
4. Identify expansion opportunities
"""

display(Markdown(markdown_content))


# NYC CitiBike Demand Analysis 2022

## Project Overview
- **Data Source**: Citi Bike System Data
- **Analysis Period**: 2022
- **Weather Station**: LaGuardia Airport (NYC)
- **Station ID**: GHCND:USW00014732

## Analysis Objectives
1. Identify peak demand periods and popular stations
2. Analyze seasonal patterns and weather impact
3. Optimize bike distribution across NYC
4. Identify expansion opportunities


## Step 3: Load CitiBike Data

In [23]:
# Diagnostic: Check current working directory and paths
import os

print("=== PATH DIAGNOSTICS ===")
print(f"Current working directory: {os.getcwd()}")
print(f"DATA_PATH: {DATA_PATH}")
print(f"DATA_PATH exists: {os.path.exists(DATA_PATH)}")

# Check what's actually accessible
print("\n=== DIRECTORY CONTENTS ===")
try:
    current_dir = os.getcwd()
    print(f"Current directory contents:")
    for item in os.listdir(current_dir):
        print(f"  - {item}")
    
    print(f"\nData directory contents:")
    if os.path.exists(DATA_PATH):
        for item in os.listdir(DATA_PATH):
            item_path = os.path.join(DATA_PATH, item)
            print(f"  - {item} (exists: {os.path.exists(item_path)})")
    else:
        print("  DATA_PATH does not exist!")
        
except Exception as e:
    print(f"Error listing directory: {e}")

# Try different path approaches
print("\n=== PATH ALTERNATIVES ===")
path_alternatives = [
    "./data/raw/",
    "data/raw/", 
    "../data/raw/",
    "../../data/raw/",
    os.path.join(os.getcwd(), "data/raw/")
]

for path in path_alternatives:
    exists = os.path.exists(path)
    print(f"Path: {path} -> exists: {exists}")
    if exists:
        try:
            files = [f for f in os.listdir(path) if f.endswith('.csv')]
            print(f"  CSV files found: {len(files)}")
            for f in files[:3]:  # Show first 3 files
                print(f"    - {f}")
        except Exception as e:
            print(f"  Error: {e}")

print("\n=== SOLUTION ===")
# Use the working directory approach
correct_data_path = None
for path in path_alternatives:
    if os.path.exists(path):
        files = [f for f in os.listdir(path) if f.endswith('.csv')]
        if files:
            correct_data_path = path
            print(f"Using path: {path}")
            print(f"Found {len(files)} CSV files")
            break

if not correct_data_path:
    print("ERROR: No data directory with CSV files found.")
    print("Please check your file structure and ensure:")
    print("1. CSV files are in data/raw/ directory")
    print("2. You're running Jupyter from the project root directory")
    print("3. Files are actually unzipped")
    raise Exception("Data directory not found")

=== PATH DIAGNOSTICS ===
Current working directory: /Users/kevinmcgreen/Desktop/nyc-citibike-demand-analysis-2022/notebooks
DATA_PATH: data/raw/
DATA_PATH exists: True

=== DIRECTORY CONTENTS ===
Current directory contents:
  - .DS_Store
  - Untitled.ipynb
  - .ipynb_checkpoints
  - data

Data directory contents:

=== PATH ALTERNATIVES ===
Path: ./data/raw/ -> exists: True
  CSV files found: 0
Path: data/raw/ -> exists: True
  CSV files found: 0
Path: ../data/raw/ -> exists: True
  CSV files found: 36
    - 202208-citibike-tripdata_3.csv
    - 202207-citibike-tripdata_2.csv
    - 202207-citibike-tripdata_3.csv
Path: ../../data/raw/ -> exists: False
Path: /Users/kevinmcgreen/Desktop/nyc-citibike-demand-analysis-2022/notebooks/data/raw/ -> exists: True
  CSV files found: 0

=== SOLUTION ===
Using path: ../data/raw/
Found 36 CSV files


In [24]:
# Load data - Using the correct relative path
print("Loading CitiBike data files...")

# Use the correct path from our diagnostic
CORRECT_DATA_PATH = "../data/raw/"

print(f"Using data path: {CORRECT_DATA_PATH}")
print(f"Path exists: {os.path.exists(CORRECT_DATA_PATH)}")

# Get all CSV files
all_files = []
for item in os.listdir(CORRECT_DATA_PATH):
    if item.endswith('.csv') and '2022' in item and 'citibike' in item:
        full_path = os.path.join(CORRECT_DATA_PATH, item)
        all_files.append(full_path)

all_files = sorted(all_files)
file_count = len(all_files)

print(f"Found {file_count} CSV files to process")

# Display first few files
print("First 5 files:")
for i, file_path in enumerate(all_files[:5], 1):
    file_name = os.path.basename(file_path)
    file_size = round(os.path.getsize(file_path) / (1024*1024), 2)
    print(f"  {i}. {file_name} ({file_size} MB)")

def read_citibike_file(file_path):
    try:
        file_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path) / (1024*1024)  # MB
        print(f"Reading: {file_name} ({file_size:.1f} MB)")
        df = pd.read_csv(file_path, low_memory=False)
        df['_source_file'] = file_name
        print(f"  Success: {len(df):,} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"  ERROR reading {file_path}: {e}")
        return None

print("\nStarting to load files...")
dataframes = []
for file_path in all_files:
    df = read_citibike_file(file_path)
    if df is not None:
        dataframes.append(df)

if dataframes:
    df_bikes = pd.concat(dataframes, ignore_index=True)
    print(f"\n=== SUCCESS ===")
    print(f"Successfully concatenated {len(dataframes)} files")
    print(f"Total dataset: {len(df_bikes):,} rows, {len(df_bikes.columns)} columns")
    
    # Display dataset info
    print(f"\nDate range:")
    # Find datetime columns
    datetime_cols = [col for col in df_bikes.columns if 'time' in col.lower() or 'date' in col.lower()]
    if datetime_cols:
        for col in datetime_cols[:2]:  # Check first 2 datetime columns
            if col in df_bikes.columns:
                print(f"  {col}: {df_bikes[col].min()} to {df_bikes[col].max()}")
    
    print(f"\nFirst 2 rows:")
    display(df_bikes.head(2))
    
else:
    raise Exception("No data files were successfully loaded")

Loading CitiBike data files...
Using data path: ../data/raw/
Path exists: True
Found 36 CSV files to process
First 5 files:
  1. 202201-citibike-tripdata_1.csv (185.56 MB)
  2. 202201-citibike-tripdata_2.csv (4.5 MB)
  3. 202202-citibike-tripdata_1.csv (185.96 MB)
  4. 202202-citibike-tripdata_2.csv (36.84 MB)
  5. 202203-citibike-tripdata_1.csv (186.27 MB)

Starting to load files...
Reading: 202201-citibike-tripdata_1.csv (185.6 MB)
  Success: 1,000,000 rows, 14 columns
Reading: 202201-citibike-tripdata_2.csv (4.5 MB)
  Success: 24,555 rows, 14 columns
Reading: 202202-citibike-tripdata_1.csv (186.0 MB)
  Success: 1,000,000 rows, 14 columns
Reading: 202202-citibike-tripdata_2.csv (36.8 MB)
  Success: 197,312 rows, 14 columns
Reading: 202203-citibike-tripdata_1.csv (186.3 MB)
  Success: 1,000,000 rows, 14 columns
Reading: 202203-citibike-tripdata_2.csv (157.5 MB)
  Success: 845,965 rows, 14 columns
Reading: 202204-citibike-tripdata_1.csv (186.4 MB)
  Success: 1,000,000 rows, 14 columns


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,_source_file
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member,202201-citibike-tripdata_1.csv
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member,202201-citibike-tripdata_1.csv
