# Data Extraction

## Downloading NYC Citi Bike Rental Data

In [None]:
import requests
import os
from xml.etree import ElementTree as ET

# Public S3 bucket that hosts the Citi Bike trip archives.
bucket_url = "https://s3.amazonaws.com/tripdata"

# Local folder the archives are saved into.
download_dir = '/content/tripdata_zips'

# Only the first few archives are fetched to keep the download small.
max_downloads = 4

# (connect, read) timeouts in seconds, so a stalled connection fails
# instead of hanging the cell forever.
request_timeout = (10, 120)

# Fetch the bucket's XML object listing. Fail loudly on an HTTP error rather
# than parsing an error document as if it were a listing.
# NOTE(review): only this single listing response is read; if the bucket
# paginates its listing, later pages are not fetched — confirm if more than
# the first few archives are ever needed.
response = requests.get(bucket_url, timeout=request_timeout)
response.raise_for_status()
response_text = response.text

root = ET.fromstring(response_text)

# The listing's elements live in the S3 XML namespace.
namespace = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'}

# Full URLs of every listed object whose key ends in ".zip".
zip_links = [
    bucket_url + '/' + key.text
    for key in root.findall('s3:Contents/s3:Key', namespace)
    if key.text and key.text.endswith('.zip')
]

os.makedirs(download_dir, exist_ok=True)

for link in zip_links[:max_downloads]:
    zip_filename = os.path.join(download_dir, os.path.basename(link))

    # Re-running the cell should not re-fetch archives that are already
    # complete on disk. Delete the file to force a fresh download.
    if os.path.exists(zip_filename):
        print(f"Skipping {zip_filename} (already downloaded)")
        continue

    print(f"Downloading {zip_filename} ...")

    # Stream into a temporary ".part" file and rename only on success, so an
    # interrupted download never leaves a truncated ".zip" behind.
    partial_filename = zip_filename + '.part'
    with requests.get(link, stream=True, timeout=request_timeout) as r:
        r.raise_for_status()
        with open(partial_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(partial_filename, zip_filename)

print("Download completed.")

Downloading /content/tripdata_zips/2013-citibike-tripdata.zip ...
Downloading /content/tripdata_zips/2014-citibike-tripdata.zip ...
Downloading /content/tripdata_zips/2015-citibike-tripdata.zip ...
Downloading /content/tripdata_zips/2016-citibike-tripdata.zip ...
Download completed.


## Import Data into pandas DataFrames and Pickle

In [None]:
import zipfile
import os
import pandas as pd

# Path where the ZIP files are stored
zip_folder = '/content/tripdata_zips/'

# Path for debugging and working datasets
debugging_folder = '/content/debugging_dataset/'
working_folder = '/content/working_dataset/'
debugging_pickle_paths = []
working_pickle_paths = []

# Create directories if they don't exist
os.makedirs(debugging_folder, exist_ok=True)
os.makedirs(working_folder, exist_ok=True)

# Year of the first CSV encountered; every CSV from that year goes to the
# debugging folder, all other years go to the working folder.
first_year = None


def _read_trip_csv(archive, member_name):
    """Read one CSV member of an open ZipFile into a DataFrame.

    Tries UTF-8 first. If decoding fails, the member is re-opened and read
    again as ISO-8859-1.

    Parameters:
        archive: an open ``zipfile.ZipFile``.
        member_name: name of the CSV entry inside ``archive``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        with archive.open(member_name) as f:
            return pd.read_csv(f, encoding='utf-8')
    except UnicodeDecodeError:
        print(f"Failed to decode {member_name} with utf-8, trying ISO-8859-1...")
        # The first stream may be partially consumed, so open a fresh one.
        with archive.open(member_name) as f:
            return pd.read_csv(f, encoding='ISO-8859-1')


def _unique_pickle_path(folder, stem, taken_paths):
    """Return ``folder/stem.pkl``, or a ``_partN`` variant if that is taken.

    Parameters:
        folder: directory the pickle is written into.
        stem: base file name without extension (here ``"YYYY-MM"``).
        taken_paths: paths already produced during this run.

    Returns:
        A path that does not appear in ``taken_paths``.
    """
    candidate = os.path.join(folder, f"{stem}.pkl")
    part = 2
    while candidate in taken_paths:
        candidate = os.path.join(folder, f"{stem}_part{part}.pkl")
        part += 1
    return candidate


# Sort the listing: os.listdir returns entries in arbitrary order, which
# would make the choice of the "first" (debugging) year nondeterministic.
for zip_file in sorted(os.listdir(zip_folder)):
    if not zip_file.endswith(".zip"):
        continue

    # Open the ZIP file
    with zipfile.ZipFile(os.path.join(zip_folder, zip_file), 'r') as z:
        print(f"Processing {zip_file} ... {z.namelist()}")

        # Loop through each file in the ZIP archive
        for file_name in z.namelist():
            # Keep CSV files at any folder depth; skip the __MACOSX tree.
            if not file_name.endswith(".csv") or file_name.startswith("__MACOSX"):
                continue

            print(f"Extracting and reading {file_name} from {zip_file}...")
            df = _read_trip_csv(z, file_name)

            # Normalise headers: trimmed, lower-case, underscores for spaces.
            df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

            # Parse the timestamp columns when present; unparseable values
            # become NaT instead of raising.
            for time_column in ('starttime', 'stoptime'):
                if time_column in df.columns:
                    df[time_column] = pd.to_datetime(df[time_column], errors='coerce')

            # The first six characters of the CSV base name are used as
            # year and month (e.g. "201404-citibike-tripdata_1.csv").
            base_name = os.path.basename(file_name)
            year = base_name[0:4]
            month = base_name[4:6]

            if first_year is None:
                first_year = year

            if year == first_year:
                target_folder, target_paths = debugging_folder, debugging_pickle_paths
            else:
                target_folder, target_paths = working_folder, working_pickle_paths

            # CSV names carry a part suffix ("_1"), so a month may be split
            # across several files. Give each part its own pickle rather
            # than overwriting the previous one.
            pickle_filename = _unique_pickle_path(target_folder, f"{year}-{month}", target_paths)
            target_paths.append(pickle_filename)

            df.to_pickle(pickle_filename)
            print(f"Saved {file_name} to {pickle_filename}")

Processing 2014-citibike-tripdata.zip ... ['2014-citibike-tripdata/', '2014-citibike-tripdata/4_April/', '2014-citibike-tripdata/12_December/', '2014-citibike-tripdata/.DS_Store', '__MACOSX/2014-citibike-tripdata/._.DS_Store', '2014-citibike-tripdata/11_November/', '2014-citibike-tripdata/7_July/', '2014-citibike-tripdata/10_October/', '2014-citibike-tripdata/9_September/', '2014-citibike-tripdata/8_August/', '2014-citibike-tripdata/6_June/', '2014-citibike-tripdata/3_March/', '2014-citibike-tripdata/1_January/', '2014-citibike-tripdata/2_February/', '2014-citibike-tripdata/5_May/', '2014-citibike-tripdata/4_April/201404-citibike-tripdata_1.csv', '2014-citibike-tripdata/12_December/201412-citibike-tripdata_1.csv', '2014-citibike-tripdata/11_November/201411-citibike-tripdata_1.csv', '__MACOSX/2014-citibike-tripdata/11_November/._201411-citibike-tripdata_1.csv', '2014-citibike-tripdata/7_July/201407-citibike-tripdata_1.csv', '2014-citibike-tripdata/10_October/201410-citibike-tripdata_1.c

## Check Results

In [None]:
import pandas as pd

# Load every pickle recorded in debugging_pickle_paths back into memory.
debugging_dfs = [pd.read_pickle(path) for path in debugging_pickle_paths]

# Stack the frames row-wise and renumber the index from zero.
debugging_df = pd.concat(debugging_dfs, axis=0, ignore_index=True)

# Persist the combined frame next to the individual pickles.
combined_pickle_path = "/content/debugging_dataset/debugging_dataset.pkl"
debugging_df.to_pickle(combined_pickle_path)

# Show the overall size, then preview the first rows as the cell output.
print(debugging_df.shape)
debugging_df.head()

(8081216, 15)


Unnamed: 0,tripduration,starttime,stoptime,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bikeid,usertype,birth_year,gender
0,558,2014-04-01 00:00:07,2014-04-01 00:09:25,82,St James Pl & Pearl St,40.711174,-74.000165,2008,Little West St & 1 Pl,40.705693,-74.016777,21062,Subscriber,1982,1
1,882,2014-04-01 00:00:20,2014-04-01 00:15:02,349,Rivington St & Ridge St,40.718502,-73.983299,312,Allen St & E Houston St,40.722055,-73.989111,20229,Subscriber,1988,1
2,587,2014-04-01 00:00:25,2014-04-01 00:10:12,293,Lafayette St & E 8 St,40.730287,-73.990765,334,W 20 St & 7 Ave,40.742388,-73.997262,20922,Subscriber,1959,1
3,355,2014-04-01 00:00:44,2014-04-01 00:06:39,539,Metropolitan Ave & Bedford Ave,40.715348,-73.960241,282,Kent Ave & S 11 St,40.708273,-73.968341,20914,Subscriber,1981,1
4,524,2014-04-01 00:01:29,2014-04-01 00:10:13,459,W 20 St & 11 Ave,40.746745,-74.007756,503,E 20 St & Park Ave,40.738274,-73.98752,21051,Subscriber,1964,1


In [None]:
# Loading all working datasets at once crashes the kernel because it uses up all available RAM.

# import pandas as pd

# working_dfs = []
# for pickle_path in working_pickle_paths:
#   working_dfs.append(pd.read_pickle(pickle_path) )

# working_df = pd.concat(working_dfs, axis=0, ignore_index=True)
# print(working_df.shape)
# working_df.head()