# Analysis of Principal Components and Original Features

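This notebook compares the hold-out performance (R² score and RMSE) of several regression models trained on two versions of the data:
1. A dataset with pre-computed Principal Components (`sheet 1.csv`), using the first 2 and then the first 4 components.
2. The original feature set (`x1`–`x8`) from the `Sheet3` worksheet of `curse-of-dimensionality.xlsx`.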

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Registry of the regression estimators compared in this notebook, keyed by
# the display name used in the final report. Estimators that accept a seed
# are given a fixed random_state so repeated runs produce the same scores.
algorithms = dict([
    ('Linear Regression', LinearRegression()),
    ('SVR', SVR()),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('GradientBoosting', GradientBoostingRegressor(random_state=42)),
    ('KNN', KNeighborsRegressor()),
    ('MLP', MLPRegressor(max_iter=1000, random_state=42)),
])

# Accumulator for metric records (one dict per algorithm and case);
# it is converted into a DataFrame once all cases have been evaluated.
all_metrics_data = list()

## Part 1: Analysis with Principal Components from `sheet 1.csv`

In [4]:
# Part 1: evaluate every algorithm on the first 2 and on the first 4
# principal components stored in "sheet 1.csv".
#
# 1. Read "sheet 1.csv" into a dataframe.
# Only the file read is guarded: a FileNotFoundError caught here can then only
# mean that the CSV itself is missing, not that something failed later on.
try:
    df_csv = pd.read_csv("sheet 1.csv")
except FileNotFoundError:
    print("Error: 'sheet 1.csv' not found. Please make sure the file is in the correct directory.")
else:
    # 2. Prepare feature matrix (X) and target vector (y)
    X_csv = df_csv[['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8']]
    y_csv = df_csv['y']

    # 3. Train/test split for the first 2 PCs (20% test, fixed seed)
    X_2_pcs = X_csv[['PC1', 'PC2']]
    X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
        X_2_pcs, y_csv, test_size=0.2, random_state=42
    )

    # 4. Train/test split for the first 4 PCs (same test_size and random_state)
    X_4_pcs = X_csv[['PC1', 'PC2', 'PC3', 'PC4']]
    X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(
        X_4_pcs, y_csv, test_size=0.2, random_state=42
    )

    # One entry per case: (report label, X_train, X_test, y_train, y_test).
    # A single loop over this list replaces the previously duplicated
    # fit/predict/score code.
    pc_cases = [
        ('2 Principal Components', X_train_2, X_test_2, y_train_2, y_test_2),
        ('4 Principal Components', X_train_4, X_test_4, y_train_4, y_test_4),
    ]

    # Remove records that an earlier run of this cell may have left behind so
    # that re-running the cell does not accumulate duplicate rows. Slice
    # assignment mutates the existing list rather than rebinding the name.
    pc_case_labels = {case[0] for case in pc_cases}
    all_metrics_data[:] = [
        record for record in all_metrics_data
        if record['Case'] not in pc_case_labels
    ]

    for case_label, X_train, X_test, y_train, y_test in pc_cases:
        for name, model in algorithms.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            r2 = r2_score(y_test, y_pred)
            # RMSE is the square root of the mean squared error
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            all_metrics_data.append({
                'Algorithm': name,
                'Case': case_label,
                'R2 Score': r2,
                'RMSE': rmse
            })

    print("Processed metrics for 2-PC and 4-PC cases from CSV.")

Processed metrics for 2-PC and 4-PC cases from CSV.


## Part 2: Analysis with Original Features from `curse-of-dimensionality.xlsx`

In [5]:
# Part 2: evaluate every algorithm on the original features x1..x8.
#
# 5. Read "Sheet3" data from "curse-of-dimensionality.xlsx"
# Only the file read is guarded: a FileNotFoundError caught here can then only
# mean that the workbook itself is missing, not that something failed later on.
try:
    df_excel = pd.read_excel("curse-of-dimensionality.xlsx", sheet_name="Sheet3")
except FileNotFoundError:
    print("Error: 'curse-of-dimensionality.xlsx' not found. Please make sure the file is in the correct directory.")
else:
    # NOTE(review): dropna() discards a row when ANY column of the sheet is
    # missing, not only y / x1..x8 -- confirm this is intended if the sheet
    # carries extra columns.
    df_excel_cleaned = df_excel.dropna()

    # 6. Identify columns based on headers
    y_excel = df_excel_cleaned['y']
    X_excel = df_excel_cleaned[['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']]

    # 7. Create same ML regression models (20% test split, fixed seed)
    X_train_excel, X_test_excel, y_train_excel, y_test_excel = train_test_split(
        X_excel, y_excel, test_size=0.2, random_state=42
    )

    # Remove records that an earlier run of this cell may have left behind so
    # that re-running the cell does not accumulate duplicate rows. Slice
    # assignment mutates the existing list rather than rebinding the name.
    excel_case_label = 'Original Features'
    all_metrics_data[:] = [
        record for record in all_metrics_data
        if record['Case'] != excel_case_label
    ]

    for name, model in algorithms.items():
        model.fit(X_train_excel, y_train_excel)
        y_pred = model.predict(X_test_excel)
        r2 = r2_score(y_test_excel, y_pred)
        # RMSE is the square root of the mean squared error
        rmse = np.sqrt(mean_squared_error(y_test_excel, y_pred))
        all_metrics_data.append({
            'Algorithm': name,
            'Case': excel_case_label,
            'R2 Score': r2,
            'RMSE': rmse
        })

    print("Processed metrics for Original Features case from Excel.")

Processed metrics for Original Features case from Excel.


In [6]:
# Create a DataFrame from the collected data.
# If a modelling cell was executed more than once, the same (Algorithm, Case)
# pair occurs several times in the list. pivot_table would silently average
# those rows (its default aggregation is the mean), mixing stale and fresh
# results, so only the most recently appended record per pair is kept.
results_df = pd.DataFrame(all_metrics_data).drop_duplicates(
    subset=['Algorithm', 'Case'], keep='last'
)

# Pivot the DataFrame to get the desired structure for the report:
# one row per algorithm, one (metric, case) column pair per value.
final_report = results_df.pivot_table(
    index='Algorithm',
    columns='Case',
    values=['R2 Score', 'RMSE']
)

# Reorder the columns for better readability: put the case on the outer
# column level and the metric on the inner one, then fix the case order.
final_report = final_report.reorder_levels([1, 0], axis=1)
column_order = [
    '2 Principal Components',
    '4 Principal Components',
    'Original Features'
]
final_report = final_report.reindex(columns=column_order, level=0)

print("--- Consolidated Model Performance Report ---")
print(final_report)

--- Consolidated Model Performance Report ---
Case              2 Principal Components           4 Principal Components  \
                                R2 Score      RMSE               R2 Score   
Algorithm                                                                   
GradientBoosting                0.831953  0.393178               0.852496   
KNN                             0.862759  0.355317               0.862759   
Linear Regression               0.854173  0.366263               0.880922   
MLP                             0.860280  0.358512               0.865046   
RandomForest                    0.840930  0.382533               0.868168   
SVR                             0.869939  0.345898               0.868629   

Case                        Original Features            
                       RMSE          R2 Score      RMSE  
Algorithm                                                
GradientBoosting   0.368362          0.858391  0.360926  
KNN                0.355317 