<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 4: Implementing Factor Analysis and Evaluating Machine Learning Model Performance

Factor Analysis and Logistic Regression on the Consumer Behavior Dataset

This notebook applies Factor Analysis (FA) to a consumer behavior dataset: the UCI Online Shoppers Purchasing Intention
data loaded in Step 1. We perform preprocessing, factor extraction and rotation, and evaluate how FA impacts machine learning model performance.

In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import FactorAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Directory that receives every PNG saved by this notebook.
FIG_DIR = "figures"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(FIG_DIR, exist_ok=True)

# Helper function to save DataFrame as PNG
def save_df_as_png(df_to_save, filename, title="", show_index=None):
    """Render a DataFrame as a matplotlib table and save it to an image file.

    Parameters
    ----------
    df_to_save : pandas.DataFrame
        Table to render; cell values are taken from ``df_to_save.values``.
    filename : str or path-like
        Destination passed to ``Figure.savefig``.
    title : str, optional
        Title drawn above the table.
    show_index : bool or None, optional
        Whether to draw the index as row labels. ``None`` (default) draws it
        only when the index is not a plain ``RangeIndex``, so frames such as
        ``df.describe()`` keep their 'count'/'mean'/... labels while frames
        with a positional index look the same as before.
    """
    if show_index is None:
        show_index = not isinstance(df_to_save.index, pd.RangeIndex)
    row_labels = [str(label) for label in df_to_save.index] if show_index else None

    # Figure height grows with the number of rows so the table is not squashed.
    fig, ax = plt.subplots(figsize=(12, len(df_to_save.index) * 0.4 + 1))
    ax.axis('off')
    table = ax.table(
        cellText=df_to_save.values,
        colLabels=df_to_save.columns,
        rowLabels=row_labels,
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.2)
    ax.set_title(title, fontsize=14)
    fig.tight_layout()
    # Row labels are drawn to the left of the table body and can fall outside
    # the axes; a tight bounding box keeps them inside the saved image.
    fig.savefig(filename, bbox_inches='tight')
    plt.close(fig)  # Close this specific figure to free memory

# Helper function to save text as PNG
def save_text_as_png(text_content, filename, title=""):
    """Draw preformatted text on a blank 10x6 figure and save it as an image.

    The text is anchored near the top-left corner of the axes (axes
    coordinates) in a monospace font, so column-aligned text keeps its layout.

    Parameters
    ----------
    text_content : str
        Text to draw.
    filename : str or path-like
        Destination passed to ``savefig``.
    title : str, optional
        Title drawn above the text.
    """
    figure, axes = plt.subplots(figsize=(10, 6))
    axes.axis('off')

    text_style = dict(va='top', ha='left', fontsize=10, fontfamily='monospace')
    axes.text(0.01, 0.99, text_content, transform=axes.transAxes, **text_style)

    axes.set_title(title, fontsize=14)
    figure.tight_layout()
    figure.savefig(filename)
    plt.close(figure)  # release the figure's memory


Step 1: Dataset Selection, Loading, and Exploratory Data Analysis (EDA)

In [None]:
# Load the UCI Online Shoppers Purchasing Intention Dataset
# Dataset URL from UCI Machine Learning Repository
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv"
df = pd.read_csv(data_url)

# The target variable is 'Revenue', which is boolean. Convert it to integer (0 or 1).
df['Revenue'] = df['Revenue'].astype(int)

# Identify features and target.
# Factor Analysis is applied to numeric inputs, so the categorical columns are
# dropped here. These can be one-hot encoded later if needed.
categorical_cols = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']
numerical_features_df = df.drop(columns=categorical_cols + ['Revenue'])

y = df['Revenue']
X = numerical_features_df

# Feature names of the numerical columns remaining in X
feature_names = X.columns.tolist()

# Directory for the CSV versions of the exported tables
TABLE_DIR = "tables"
os.makedirs(TABLE_DIR, exist_ok=True)

# Compute each summary once and reuse it for the CSV, the PNG and the display.
df_head = df.head()
df_description = df.describe()
missing_values = df.isnull().sum()

# The PNG tables are drawn from cell values and column labels, so the index
# (statistic names / column names) is moved into a regular column first.
df_description_table = df_description.rename_axis('Statistic').reset_index()
missing_values_table = missing_values.rename_axis('Column').reset_index(name='Missing Values')

# Export df.head() as PNG
save_df_as_png(df_head, os.path.join(FIG_DIR, 'df_head.png'), title='DataFrame Head')
print(f"Exported df_head.png to {FIG_DIR}")

# Export df.describe() as CSV and PNG
df_description.to_csv(os.path.join(TABLE_DIR, 'df_describe.csv'))
print(f"Exported df_describe.csv to {TABLE_DIR}")
save_df_as_png(df_description_table, os.path.join(FIG_DIR, 'df_describe.png'), title='DataFrame Description')
print(f"Exported df_describe.png to {FIG_DIR}")

# Export df.isnull().sum() as CSV and PNG
missing_values_table.to_csv(os.path.join(TABLE_DIR, 'df_isnull_sum.csv'), index=False)
print(f"Exported df_isnull_sum.csv to {TABLE_DIR}")
save_df_as_png(missing_values_table, os.path.join(FIG_DIR, 'df_isnull_sum.png'), title='Missing Values per Column')
print(f"Exported df_isnull_sum.png to {FIG_DIR}")

print("DataFrame head:")
display(df_head)
print("\nDataFrame Description:")
display(df_description)
print("\nMissing values per column:")
display(missing_values)
print("\nTarget variable value counts:")
display(df['Revenue'].value_counts())


Step 2: Data Preprocessing and Feature Engineering
- standardize data

In [None]:
from sklearn.preprocessing import StandardScaler

# Z-score the numeric feature frame `X` prepared in the previous step.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Preview: first rows of the standardized matrix, with the original column names.
X_scaled_df_head = pd.DataFrame(X_scaled, columns=feature_names).head()
display(X_scaled_df_head)

# Save the same preview as an image.
save_df_as_png(
    X_scaled_df_head,
    os.path.join(FIG_DIR, 'df_scaled_head.png'),
    title='Standardized DataFrame Head',
)
print(f"Exported df_scaled_head.png to {FIG_DIR}")


Step 3: Conduct Factor Analysis

- plot correlation heatmap
- perform factor analysis with 3 components
- build the factor loadings table



In [None]:
# Create a DataFrame from the scaled data to preserve feature names for the heatmap
X_scaled_df = pd.DataFrame(X_scaled, columns=feature_names)

# Correlation matrix of the standardized features
correlation_matrix = X_scaled_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            xticklabels=feature_names, yticklabels=feature_names, ax=ax)
ax.set_title("Correlation Matrix")
fig.tight_layout()  # keep the long tick labels inside the saved image
fig.savefig(os.path.join(FIG_DIR, 'correlation_matrix.png'))
plt.show()

# Perform Factor Analysis with 3 factors.
# The notebook's introduction promises factor extraction *and rotation*, so a
# varimax rotation is applied to make the loadings easier to interpret.
# random_state is set explicitly so a fresh "Run All" reproduces the same fit.
fa = FactorAnalysis(n_components=3, rotation="varimax", random_state=0)
fa.fit(X_scaled)

# Factor loadings: one row per factor, one column per original feature.
# Rows are labelled so tables and heatmaps show "Factor 1..3" instead of 0..2.
loadings_df = pd.DataFrame(
    fa.components_,
    index=[f"Factor {i + 1}" for i in range(fa.components_.shape[0])],
    columns=feature_names,
)
loadings_df

Step 4: Data Visualization and Interpretation
- display and store heatmap of loading dataframe

In [None]:
# Annotated heatmap of the loadings table (rows of loadings_df vs. its columns)
loadings_fig, loadings_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(loadings_df, annot=True, cmap='coolwarm', ax=loadings_ax)
loadings_ax.set_title('Factor Loadings for FA Components')
loadings_fig.tight_layout()
loadings_fig.savefig(os.path.join(FIG_DIR, 'factor_loadings.png'))
plt.show()

Step 5: Machine Learning Model on Original Features (LR)
- build a Python pipeline: standardization followed by logistic regression modeling
- display accuracy metrics and training time

In [None]:
# Split the raw features `X`: the pipelines standardize the data themselves, so
# the scaler is fit on the training rows only and nothing is standardized twice.
# stratify=y keeps the Revenue class ratio the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic Regression pipeline on original features
baseline_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# perf_counter is a monotonic, high-resolution clock meant for timing intervals.
t0 = time.perf_counter()
baseline_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

y_pred_baseline = baseline_pipeline.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)

print(f"Baseline model accuracy: {baseline_accuracy}")
print("Classification report:")
print(classification_report(y_test, y_pred_baseline))
print(f"Training time (s): {t1 - t0}")

Step 6: Apply Factor Analysis to Transform Data

In [None]:
# Project every standardized row onto the factors of the `fa` model fitted in Step 3.
# NOTE(review): X_fa is not referenced by the later cells visible here; the
# Step 7 pipeline fits its own FactorAnalysis instead. Confirm whether it is still needed.
X_fa = fa.transform(X_scaled)

Step 7: Train Model on FA-transformed Data
- run a standardization → factor analysis → logistic regression pipeline

In [None]:
# Logistic Regression pipeline on FA-transformed features.
# The classifier is configured exactly like the baseline one; the deprecated
# `multi_class` argument is no longer passed (it emits a FutureWarning in
# scikit-learn >= 1.5 and is scheduled for removal).
fa_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("fa", FactorAnalysis(n_components=3)),  # Same number of components
    ("clf", LogisticRegression(max_iter=1000, class_weight='balanced'))
])
# Backward-compatible alias: this pipeline was previously named `pca_pipeline`
# although it uses Factor Analysis, not PCA.
pca_pipeline = fa_pipeline

# perf_counter is a monotonic, high-resolution clock meant for timing intervals.
t0 = time.perf_counter()
fa_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

y_pred_fa = fa_pipeline.predict(X_test)

# Build each report once and reuse it for printing and exporting.
baseline_report = classification_report(y_test, y_pred_baseline)
fa_report = classification_report(y_test, y_pred_fa)

print(f"FA-transformed model accuracy: {accuracy_score(y_test, y_pred_fa)}")
print("Classification report:")
print(fa_report)
print(f"Training time (s): {t1 - t0}")

# Export classification report for baseline model as TXT and PNG
with open(os.path.join(TABLE_DIR, 'classification_report_baseline.txt'), 'w') as report_file:
    report_file.write(baseline_report)
print(f"Exported classification_report_baseline.txt to {TABLE_DIR}")
save_text_as_png(baseline_report, os.path.join(FIG_DIR, 'classification_report_baseline.png'), title='Classification Report: Baseline Model')
print(f"Exported classification_report_baseline.png to {FIG_DIR}")

# Export classification report for FA-transformed model as TXT and PNG
with open(os.path.join(TABLE_DIR, 'classification_report_fa.txt'), 'w') as report_file:
    report_file.write(fa_report)
print(f"Exported classification_report_fa.txt to {TABLE_DIR}")
save_text_as_png(fa_report, os.path.join(FIG_DIR, 'classification_report_fa.png'), title='Classification Report: FA-transformed Model')
print(f"Exported classification_report_fa.png to {FIG_DIR}")


Confusion Matrix for Baseline Model (Original Features)

In [None]:
# Counts of actual vs. predicted classes for the baseline model
cm_baseline = confusion_matrix(y_test, y_pred_baseline)

predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']

cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_baseline,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix: Baseline Model')
cm_ax.set_ylabel('Actual Label')
cm_ax.set_xlabel('Predicted Label')
cm_fig.savefig(os.path.join(FIG_DIR, 'confusion_matrix_baseline.png'))
plt.show()

Confusion Matrix for FA-transformed Model

In [None]:
# Counts of actual vs. predicted classes for the FA-transformed model
cm_fa = confusion_matrix(y_test, y_pred_fa)

predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']

cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_fa,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix: FA-transformed Model')
cm_ax.set_ylabel('Actual Label')
cm_ax.set_xlabel('Predicted Label')
cm_fig.savefig(os.path.join(FIG_DIR, 'confusion_matrix_fa.png'))
plt.show()

Step 8: Compare Performance Before and After FA
- store accuracy metrics before and after FA
- show and store bar chart of accuracy comparison before and after FA

In [None]:
# Test-set accuracy of both models, collected into one comparison table
model_labels = ["Baseline (Original Features)", "FA-transformed Features"]
model_accuracies = [
    accuracy_score(y_test, y_pred_baseline),
    accuracy_score(y_test, y_pred_fa),
]
results = pd.DataFrame({"Model": model_labels, "Accuracy": model_accuracies})

# Save the comparison table as an image
results_png = os.path.join(FIG_DIR, 'results_table.png')
save_df_as_png(results, results_png, title='Model Accuracy Comparison Table')
print(f"Exported results_table.png to {FIG_DIR}")

# Side-by-side bars, one per model
bar_fig, bar_ax = plt.subplots()
bar_ax.bar(results["Model"], results["Accuracy"])
bar_ax.set_ylabel("Accuracy")
bar_ax.set_title("Model Accuracy Comparison: Original vs FA-Transformed Features")
bar_fig.tight_layout()
bar_fig.savefig(os.path.join(FIG_DIR, 'accuracy_comparison.png'))
plt.show()

### Scree Plot
A Scree Plot displays the eigenvalues associated with a component or factor in descending order versus the number of the component or factor. It helps in determining the number of factors to retain.

In [None]:
# Eigenvalues of the feature correlation matrix, largest first, for the scree plot.
# rowvar=False treats each column of X_scaled as one variable.
corr_matrix = np.corrcoef(X_scaled, rowvar=False)
eigenvalues = np.linalg.eigvalsh(corr_matrix)  # returned in ascending order
eigenvalues = np.flip(eigenvalues)

factor_numbers = range(1, len(eigenvalues) + 1)

# Scree plot with the Kaiser criterion (eigenvalue = 1) as a reference line
scree_fig, scree_ax = plt.subplots(figsize=(10, 6))
scree_ax.plot(factor_numbers, eigenvalues, marker='o', linestyle='-')
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Number of Factors')
scree_ax.set_ylabel('Eigenvalue')
scree_ax.axhline(y=1, color='r', linestyle='--', label='Eigenvalue = 1')
scree_ax.legend()
scree_ax.grid(True)
scree_fig.savefig(os.path.join(FIG_DIR, 'scree_plot.png'))
plt.show()