# PCA for Final Report

### Team 2: Vita Khan, Quinn Reilly, Aarushi Attray

This notebook contains the principal component analysis (PCA) code used for the final report.

In [1]:
# Importing all useful libraries (for webscraping and ML as well)
import os
import pandas as pd
import numpy as np
import requests
from io import BytesIO
from zipfile import ZipFile
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
# Function to clean the dataset
def clean_data(df, date_prefix):
    """
    Clean a raw trip dataframe and keep only valid trips inside one month.

    The input frame is left untouched; all work happens on a new frame.

    Args:
        df (pd.DataFrame): DataFrame containing the raw data. Must contain
            'started_at', 'ended_at' and every column listed in
            `required_columns` below.
        date_prefix (str): Year-month prefix to keep, in '%Y-%m-' form
            (e.g., "2024-01-" for January, "2024-07-" for July).

    Returns:
        pd.DataFrame: New DataFrame with an added 'tripduration' column
        (minutes), restricted to trips that start and end in the requested
        month, last more than 0 and less than 1000 minutes, and have no
        missing value in the required columns.

    Raises:
        KeyError: If an expected column is absent from `df`.
    """
    # Normalise placeholder strings to real missing values. `replace` returns
    # a new frame, so the caller's data is never modified.
    cleaned = df.replace({"n/a": pd.NA, "\n": pd.NA, r"\N": pd.NA})

    # Unparseable timestamps become NaT rather than raising
    for col in ('started_at', 'ended_at'):
        cleaned[col] = pd.to_datetime(cleaned[col], errors='coerce')

    # Keep rows where BOTH timestamps fall in the requested month.
    # na=False matters: NaT formats to NaN, and a mask containing NaN cannot
    # be used for boolean indexing.
    starts_in_month = cleaned['started_at'].dt.strftime('%Y-%m-').str.startswith(date_prefix, na=False)
    ends_in_month = cleaned['ended_at'].dt.strftime('%Y-%m-').str.startswith(date_prefix, na=False)
    # .copy() so the column assignment below writes to an independent frame
    cleaned = cleaned.loc[starts_in_month & ends_in_month].copy()

    # Trip duration in minutes
    cleaned['tripduration'] = (cleaned['ended_at'] - cleaned['started_at']).dt.total_seconds() / 60

    # Drop zero or negative durations
    cleaned = cleaned[cleaned['tripduration'] > 0]

    # Drop rows with missing critical values
    required_columns = [
        'ride_id', 'rideable_type', 'started_at', 'ended_at',
        'start_station_name', 'end_station_name',
        'start_lat', 'start_lng', 'end_lat', 'end_lng',
        'member_casual'
    ]
    cleaned = cleaned.dropna(subset=required_columns)

    # Drop unrealistic trip durations (1000 minutes or longer)
    cleaned = cleaned[cleaned['tripduration'] < 1000]

    return cleaned

In [3]:
# Webscraping
# Base URL of the bucket hosting the zip files
url = 'https://s3.amazonaws.com/hubway-data'
# Seconds to wait on the server before giving up, so a stalled connection cannot hang the notebook
request_timeout = 60

# Fetching the list of available files; fail loudly on an HTTP error
# instead of parsing an error page and silently finding nothing.
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Filtering for the specific zip files (the two we want to use)
wanted_zips = ['202401-bluebikes-tripdata.zip', '202407-bluebikes-tripdata.zip']
zip_links = [
    f"{url}/{key_tag.text}"
    for key_tag in soup.find_all('key')
    if key_tag.text in wanted_zips
]
if not zip_links:
    print("No matching zip files found in the bucket listing.")

# Creating the directory for CSV files
os.makedirs('csv_files', exist_ok=True)
# These are the names the loop below assigns and the preview cells read;
# initialising them here means a failed download leaves them as None
# rather than undefined.
jan_csv_path = None
july_csv_path = None

# Process each zip file
for zip_url in zip_links:
    try:
        # Download the ZIP file
        print(f"Downloading {zip_url}...")
        zip_response = requests.get(zip_url, timeout=request_timeout)
        zip_response.raise_for_status()

        with ZipFile(BytesIO(zip_response.content)) as zip_file:
            for zip_info in zip_file.infolist():
                # Skip system or hidden files
                if "__MACOSX" in zip_info.filename or zip_info.filename.startswith('.'):
                    continue

                # Process CSV files only
                if not zip_info.filename.endswith('.csv'):
                    continue

                try:
                    with zip_file.open(zip_info) as extracted_file:
                        # NOTE: `df` is a notebook-level name; after the loop it
                        # holds the frame from the last CSV processed.
                        df = pd.read_csv(extracted_file)
                        print(f"Processing {zip_info.filename}...")

                        # Cleaning the data using func above
                        if '202401' in zip_info.filename:
                            df = clean_data(df, "2024-01-")
                            jan_csv_path = f"csv_files/cleaned_jan_{zip_info.filename}"
                            df.to_csv(jan_csv_path, index=False)
                            print(f"Saved cleaned January data to {jan_csv_path}.")
                        elif '202407' in zip_info.filename:
                            df = clean_data(df, "2024-07-")
                            july_csv_path = f"csv_files/cleaned_july_{zip_info.filename}"
                            df.to_csv(july_csv_path, index=False)
                            print(f"Saved cleaned July data to {july_csv_path}.")

                except Exception as e:
                    # Best effort: report the failure and move on to the next archive member
                    print(f"Error processing file {zip_info.filename}: {e}")
    except Exception as e:
        # Best effort: report the failure and move on to the next archive
        print(f"Error downloading or processing {zip_url}: {e}")

  k = self.parse_starttag(i)


Downloading https://s3.amazonaws.com/hubway-data/202401-bluebikes-tripdata.zip...
Processing 202401-bluebikes-tripdata.csv...
Saved cleaned January data to csv_files/cleaned_jan_202401-bluebikes-tripdata.csv.
Downloading https://s3.amazonaws.com/hubway-data/202407-bluebikes-tripdata.zip...
Processing 202407-bluebikes-tripdata.csv...
Saved cleaned July data to csv_files/cleaned_july_202407-bluebikes-tripdata.csv.


In [4]:
# Preview the head of the cleaned January CSV (pandas' default head() is 5 rows)
if not jan_csv_path:
    print("\nNo cleaned January data available.")
else:
    print(f"Head of cleaned January data ({jan_csv_path}):")
    print(pd.read_csv(jan_csv_path).head())

Head of cleaned January data (csv_files/cleaned_jan_202401-bluebikes-tripdata.csv):
            ride_id  rideable_type           started_at             ended_at  \
0  D2F4A4783B230A84  electric_bike  2024-01-31 12:16:49  2024-01-31 12:21:02   
1  D305CEFFD4558633   classic_bike  2024-01-12 08:14:16  2024-01-12 08:19:48   
2  02009BB4EBA0D1F6  electric_bike  2024-01-29 15:00:05  2024-01-29 15:05:47   
3  04C230C1C39071F7   classic_bike  2024-01-09 16:33:40  2024-01-09 17:00:41   
4  CEAFE67E28B43852   classic_bike  2024-01-23 10:19:21  2024-01-23 10:31:39   

   start_station_name start_station_id  \
0  Ames St at Main St           M32037   
1  Ames St at Main St           M32037   
2  One Memorial Drive           M32053   
3  Ames St at Main St           M32037   
4  Mass Ave T Station           C32063   

                          end_station_name end_station_id  start_lat  \
0    Central Square at Mass Ave / Essex St         M32011  42.362357   
1    Central Square at Mass Ave / Esse

In [5]:
# Preview the head of the cleaned July CSV (pandas' default head() is 5 rows)
if not july_csv_path:
    print("\nNo cleaned July data available.")
else:
    print(f"Head of cleaned July data ({july_csv_path}):")
    print(pd.read_csv(july_csv_path).head())

Head of cleaned July data (csv_files/cleaned_july_202407-bluebikes-tripdata.csv):
            ride_id  rideable_type               started_at  \
0  5FA2A0E02EC53028   classic_bike  2024-07-11 02:07:46.443   
1  5D49B9B78C826FD5   classic_bike  2024-07-19 14:07:03.620   
2  5FEF6FE539C078FB  electric_bike  2024-07-02 09:06:08.296   
3  709B58276144026B   classic_bike  2024-07-30 13:55:16.971   
4  15755F52835908F5   classic_bike  2024-07-18 19:24:45.546   

                  ended_at                      start_station_name  \
0  2024-07-11 02:20:34.180            Maverick St at Massport Path   
1  2024-07-19 14:08:06.530       NCAAA - Walnut Ave at Crawford St   
2  2024-07-02 09:12:52.886  Medford Sq - Riverside Ave at River St   
3  2024-07-30 14:54:55.077       NCAAA - Walnut Ave at Crawford St   
4  2024-07-18 19:42:54.574            Maverick St at Massport Path   

  start_station_id                     end_station_name end_station_id  \
0           A32044         Maverick St at Ma

In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import probplot, shapiro
import numpy as np

In [7]:
# Function to preprocess the data
def preprocess_data(df):
    """
    Coerce column types and derive the start hour for modelling.

    The input frame is not modified; a new frame is returned.

    Args:
        df (pd.DataFrame): dataframe to preprocess. Must contain 'start_lat',
            'start_lng', 'end_lat', 'end_lng', 'tripduration',
            'member_casual' and 'started_at'.

    Returns:
        pd.DataFrame: copy of `df` in which the coordinate and duration
        columns are numeric, 'started_at' is datetime, an 'hour' column holds
        the hour of 'started_at', and rows with a missing value in any of the
        numeric columns or in 'hour' have been dropped.

    Raises:
        ValueError: if any required column is missing from `df`.
    """
    numeric_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng', 'tripduration']
    required_columns = numeric_columns + ['member_casual', 'started_at']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns: {', '.join(missing_columns)}")

    # Work on a copy: the caller may pass a filtered slice of another frame,
    # and writing columns into that would alter (or warn about) the original.
    result = df.copy()

    # Ensure numeric columns are of proper type; unparseable values become NaN
    result[numeric_columns] = result[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Handle time-based features; unparseable timestamps become NaT, so their hour is NaN
    result['started_at'] = pd.to_datetime(result['started_at'], errors='coerce')
    result['hour'] = result['started_at'].dt.hour

    # Discard rows that lost a value during coercion
    return result.dropna(subset=numeric_columns + ['hour'])

## PCA
Recognizing these challenges, we incorporated PCA as our final ML model to refine our approach. PCA is helpful when dealing with multicollinearity: it lets us keep as much of the information in the X features as possible before using them to predict y (tripduration). By applying PCA before rerunning our regression analysis, we retained as much of the variance in the data as the chosen number of components allows while simplifying the relationships among predictors. For this revised analysis, we focused on key features, including time of day (hour), member_casual, and rideable_type.

To enhance the regression analysis, we applied PCA using the scikit-learn library. We selected three predictor variables (hour, member_casual, and rideable_type) and used tripduration as the dependent variable, predicting trip duration from the time of day, the rider's membership type, and the type of bike. While PCA effectively simplified the regression model, the resulting analysis still performed poorly, as demonstrated by the unsatisfactory MSE and R² values (MSE ≈ 604.51, R² ≈ 0.03). In conclusion, despite iterative refinements, including Polynomial Regression, interaction terms, and PCA, Linear Regression remained the most effective model in this context.

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [9]:
# Build the modelling frame.
# NOTE(review): `df` is not loaded in this cell; it is whatever the download
# loop earlier in the notebook last assigned — confirm that is the intended dataset.
df = preprocess_data(df)

# Integer-encode the two categorical predictors
for categorical_col in ('member_casual', 'rideable_type'):
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Feature matrix and target
feature_columns = ['hour', 'member_casual', 'rideable_type']
X = df[feature_columns].values
y = df['tripduration']

# Put the features on a common scale before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the three scaled features onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Linear regression on the principal components
model = LinearRegression().fit(X_pca, y)

# Fit quality, measured on the same rows the model was fitted on (no held-out split)
y_pred = model.predict(X_pca)
mse, r2 = mean_squared_error(y, y_pred), r2_score(y, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 604.51
R^2 Score: 0.03
