### CitiBike Data Extraction from online database

In [17]:
import pandas as pd
import requests 
import zipfile
import io

In [18]:
# URLs for June 2013 and June 2019 data (Citi Bike archive)
urls = {
    "2013": "https://s3.amazonaws.com/tripdata/2013-citibike-tripdata.zip",
    "2019": "https://s3.amazonaws.com/tripdata/2019-citibike-tripdata.zip"
}

In [19]:
# fucntion to download and save the 2013 and 2019 ZIP file 
# download the file in chunck to prevent memory overload 
# comment out the below funtion to prevent memeory overload 

"""def download_zip(url, filename):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
                print(f'Downloaded: {filename}')
    else:
        print(f'Failed to download {filename}')

# download both files
for year, url in urls.items():
    download_zip(url, f"{year}-citibike-tripdata.zip")"""

'def download_zip(url, filename):\n    response = requests.get(url, stream=True)\n    if response.status_code == 200:\n        with open(filename, \'wb\') as file:\n            for chunk in response.iter_content(chunk_size=1024):\n                file.write(chunk)\n                print(f\'Downloaded: {filename}\')\n    else:\n        print(f\'Failed to download {filename}\')\n\n# download both files\nfor year, url in urls.items():\n    download_zip(url, f"{year}-citibike-tripdata.zip")'

In [20]:
# Function to list the files inside the ZIP file without extracting them
def list_files_in_zip(zip_filename):
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        # List all the contents in the ZIP file
        file_list = zip_ref.namelist()
        return file_list

zip_files = ["2013-citibike-tripdata.zip", "2019-citibike-tripdata.zip"]

# Iterate over the ZIP files and print the file names inside
for zip_filename in zip_files:
    print(f"Files in {zip_filename}:")
    file_names = list_files_in_zip(zip_filename)
    for file_name in file_names:
        print(file_name)
    print('--------------------')  

Files in 2013-citibike-tripdata.zip:
2013-citibike-tripdata/
2013-citibike-tripdata/4_April/
2013-citibike-tripdata/12_December/
2013-citibike-tripdata/.DS_Store
__MACOSX/2013-citibike-tripdata/._.DS_Store
2013-citibike-tripdata/201309-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201309-citibike-tripdata.csv
2013-citibike-tripdata/11_November/
2013-citibike-tripdata/7_July/
2013-citibike-tripdata/201311-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201311-citibike-tripdata.csv
2013-citibike-tripdata/201307-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201307-citibike-tripdata.csv
2013-citibike-tripdata/10_October/
2013-citibike-tripdata/9_September/
2013-citibike-tripdata/8_August/
2013-citibike-tripdata/6_June/
2013-citibike-tripdata/3_March/
2013-citibike-tripdata/201308-citibike-tripdata.csv
__MACOSX/2013-citibike-tripdata/._201308-citibike-tripdata.csv
2013-citibike-tripdata/1_January/
2013-citibike-tripdata/201306-citibike-tripdata.csv
__MACOSX/2013-

In [24]:
# Read the june csv files from 2013 and 2019

def read_csv_from_zip(zip_filename, csv_filename):
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        # Open the specific CSV file within the ZIP archive
        with zip_ref.open(csv_filename) as file:
            trip_df = pd.read_csv(file)
            return trip_df

zip_files = {
    "2013": "2013-citibike-tripdata.zip",
    "2019": "2019-citibike-tripdata.zip"
}
# Specific CSV file paths within the ZIP files
csv_files = {
    "2013": "2013-citibike-tripdata/6_June/201306-citibike-tripdata_1.csv",
    "2019_1": "2019-citibike-tripdata/6_June/201906-citibike-tripdata_1.csv",
    "2019_2": "2019-citibike-tripdata/6_June/201906-citibike-tripdata_2.csv",
    "2019_3": "2019-citibike-tripdata/6_June/201906-citibike-tripdata_3.csv"
}

# Read the CSV files from the respective ZIP files
dfs = {}
for key, zip_file in zip_files.items():
    if key == "2013":
        dfs["2013"] = read_csv_from_zip(zip_file, csv_files["2013"])
    else:
        for i in range(1, 4):
            dfs[f"2019_{i}"] = read_csv_from_zip(zip_file, csv_files[f"2019_{i}"])

# Print the DataFrames (or use them as needed)
for key, df in dfs.items():
    print(f"\nData for {key}:")
    print(df.head()) 


Data for 2013:
   tripduration            starttime             stoptime  start station id  \
0           695  2013-06-01 00:00:01  2013-06-01 00:11:36               444   
1           693  2013-06-01 00:00:08  2013-06-01 00:11:41               444   
2          2059  2013-06-01 00:00:44  2013-06-01 00:35:03               406   
3           123  2013-06-01 00:01:04  2013-06-01 00:03:07               475   
4          1521  2013-06-01 00:01:22  2013-06-01 00:26:43              2008   

       start station name  start station latitude  start station longitude  \
0      Broadway & W 24 St               40.742354               -73.989151   
1      Broadway & W 24 St               40.742354               -73.989151   
2  Hicks St & Montague St               40.695128               -73.995951   
3     E 15 St & Irving Pl               40.735243               -73.987586   
4   Little West St & 1 Pl               40.705693               -74.016777   

   end station id        end station nam

In [27]:
# Merge the 2019 June csv files 
zip_files = {
    "2019": "2019-citibike-tripdata.zip"
}

# Specific CSV file paths within the 2019 ZIP file
csv_files_2019 = {
    "2019_1": "2019-citibike-tripdata/6_June/201906-citibike-tripdata_1.csv",
    "2019_2": "2019-citibike-tripdata/6_June/201906-citibike-tripdata_2.csv",
    "2019_3": "2019-citibike-tripdata/6_June/201906-citibike-tripdata_3.csv"
}
dfs_2019 = []

# Read the CSV files from the 2019 ZIP file
for key, zip_file in zip_files.items():
    for i in range(1, 4):
        csv_file = csv_files_2019[f"2019_{i}"]
        df = read_csv_from_zip(zip_file, csv_file)
        dfs_2019.append(df)

# Concatenate all DataFrames into one
merged_df_2019 = pd.concat(dfs_2019, ignore_index=True)

# Save the merged DataFrame to a CSV file
merged_df_2019.to_csv("2019_citibike_tripdata_merged.csv", index=False)

print("2019 CSV files merged and saved as '2019_citibike_tripdata_merged.csv'")

2019 CSV files merged and saved as '2019_citibike_tripdata_merged.csv'
