<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Factor Analysis and Logistic Regression on the Consumer Behavior Dataset

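This notebook applies Factor Analysis (FA) to the Consumer Behavior dataset. We perform preprocessing,
factor extraction and rotation, and evaluate how FA impacts machine learning model performance.
Note: the code cells below currently generate synthetic data as a stand-in for the dataset, and the
factor model is fitted without a rotation argument; the workflow is otherwise as described.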

In [None]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import FactorAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Create figures directory
# Every figure and the results CSV saved by the cells below are written under FIG_DIR.
FIG_DIR = "figures"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(FIG_DIR, exist_ok=True)

Step 1: Dataset Selection, Loading, and Exploratory Data Analysis (EDA)

In [None]:
# Build a reproducible synthetic dataset (placeholder data for illustration)
np.random.seed(42)
n_samples, n_features = 200, 10

# Draw order matters for reproducibility: features first, then labels
X = np.random.randn(n_samples, n_features)
y = np.random.randint(0, 2, n_samples)  # binary labels in {0, 1}

# Assemble features and target into a single DataFrame
feature_names = ['Feature_%d' % idx for idx in range(1, n_features + 1)]
df = pd.DataFrame(data=X, columns=feature_names).assign(target=y)

# Quick look: first rows, summary statistics, missing-value counts, class balance
df.head(), df.describe(), df.isnull().sum(), df['target'].value_counts()

Step 2: Data Preprocessing and Feature Engineering

In [None]:
# Z-score the feature columns; the target column is left out of the scaling
feature_frame = df.drop(columns=['target'])
scaler = StandardScaler()
X_scaled = scaler.fit(feature_frame).transform(feature_frame)

# Preview the first standardized rows
pd.DataFrame(data=X_scaled, columns=feature_names).head()

Step 3: Conduct Factor Analysis

In [None]:
# Inspect pairwise feature correlations before extracting factors
correlation_matrix = np.corrcoef(X_scaled.T)

# Draw on an explicit figure with labelled axes, matching the other figure
# cells, so the saved PNG can be read without the notebook around it.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    xticklabels=feature_names,
    yticklabels=feature_names,
)
plt.title("Correlation Matrix")
plt.tight_layout()  # keep tick labels from being clipped in the saved file
plt.savefig(os.path.join(FIG_DIR, 'correlation_matrix.png'))
plt.show()

# Perform Factor Analysis
fa = FactorAnalysis(n_components=3)
fa.fit(X_scaled)

# Factor loadings: columns are labelled with the original feature names
loadings_df = pd.DataFrame(fa.components_, columns=feature_names)
loadings_df

Step 4: Data Visualization and Interpretation

In [None]:
# Render the loadings table as an annotated heatmap and save it to FIG_DIR
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(loadings_df, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Factor Loadings for FA Components')
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, 'factor_loadings.png'))
plt.show()

Step 5: Machine Learning Model on Original Features (LR)

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the split.
# NOTE: X_scaled was standardized on the full dataset, but the pipeline's own
# StandardScaler below is re-fit on the training split only.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Logistic Regression pipeline on original features.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases (FutureWarning, scheduled for removal) and
# omitting it gives the same behaviour as the previous "auto" setting.
baseline_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])

# perf_counter is a monotonic, high-resolution clock meant for measuring intervals
t0 = time.perf_counter()
baseline_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

# Predictions on the held-out split; reused by the comparison cell
y_pred_baseline = baseline_pipeline.predict(X_test)

print(f"Baseline model accuracy: {accuracy_score(y_test, y_pred_baseline)}")
print("Classification report:")
print(classification_report(y_test, y_pred_baseline))
print(f"Training time (s): {t1 - t0}")

Step 6: Apply Factor Analysis to Transform Data

In [None]:
# Transform the standardized features with the FactorAnalysis model fitted in Step 3.
# NOTE(review): the Step 7 pipeline fits its own FactorAnalysis on the training
# split and does not appear to consume X_fa — confirm whether it is still needed.
X_fa = fa.transform(X_scaled)

Step 7: Train Model on FA-transformed Data

In [None]:
# Logistic Regression pipeline on FA-transformed features.
# Scaling and factor extraction live inside the pipeline, so both are fitted on
# the training split only and then applied unchanged to the test split.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases and omitting it matches the previous "auto".
fa_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("fa", FactorAnalysis(n_components=3)),  # Same number of components
    ("clf", LogisticRegression(max_iter=1000)),
])
# Backward-compatible alias: the pipeline uses Factor Analysis, not PCA, but the
# old name is kept so any cell that still refers to it keeps working.
pca_pipeline = fa_pipeline

# perf_counter is a monotonic, high-resolution clock meant for measuring intervals
t0 = time.perf_counter()
fa_pipeline.fit(X_train, y_train)
t1 = time.perf_counter()

# Predictions on the held-out split; reused by the comparison cell
y_pred_fa = fa_pipeline.predict(X_test)

print(f"FA-transformed model accuracy: {accuracy_score(y_test, y_pred_fa)}")
print("Classification report:")
print(classification_report(y_test, y_pred_fa))
print(f"Training time (s): {t1 - t0}")

Step 8: Compare Performance Before and After FA

In [None]:
# Gather the held-out accuracy of both models into one summary table
baseline_acc = accuracy_score(y_test, y_pred_baseline)
fa_acc = accuracy_score(y_test, y_pred_fa)
results = pd.DataFrame({
    "Model": ["Baseline (Original Features)", "FA-transformed Features"],
    "Accuracy": [baseline_acc, fa_acc],
})

# Persist the table as CSV (the chart below is what gets saved as PNG)
results.to_csv(os.path.join(FIG_DIR, 'results_table.csv'), index=False)

# Side-by-side bars: one per model
fig, ax = plt.subplots()
ax.bar(results["Model"], results["Accuracy"])
ax.set_ylabel("Accuracy")
ax.set_title("Model Accuracy Comparison: Original vs FA-Transformed Features")
fig.tight_layout()
fig.savefig(os.path.join(FIG_DIR, 'accuracy_comparison.png'))
plt.show()