<a href="https://colab.research.google.com/github/asmas-work/ML-Pipeline-with-PCA-Logistic-Regression/blob/main/PCA_Script_for_5_DataSets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PCA Applied to the Following 5 Datasets from Kaggle
### 1: [AI Dev Productivity](https://www.kaggle.com/datasets/atharvasoundankar/ai-developer-productivity-dataset),
### 2: [Personality Dataset](https://www.kaggle.com/datasets/rakeshkapilavai/extrovert-vs-introvert-behavior-data),
### 3: [Human Activity Recognition with Smartphones](https://www.kaggle.com/datasets/uciml/human-activity-recognition-with-smartphones?select=train.csv),
### 4: [Student Habits Performance](https://www.kaggle.com/datasets/jayaantanaath/student-habits-vs-academic-performance),
### 5: [Wine Quality Red](https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009)

In [None]:
import os, time, csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

def _read_delimited(file_path):
    """Load a delimited text file into a DataFrame, guessing the separator."""
    # Decode the sniffing sample as UTF-8 (the pandas default) and tolerate
    # undecodable bytes, so a stray character cannot abort delimiter detection.
    with open(file_path, newline='', encoding='utf-8', errors='replace') as f:
        sample = f.read(2048)
    try:
        delim = csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        # Sniffer could not decide: fall back to a plain comma-separated file.
        delim = ','
    df = pd.read_csv(file_path, sep=delim)
    df.columns = df.columns.str.strip()
    return df


def _id_like_columns(df, target_col):
    """Return the columns that look like row identifiers, never the target."""
    n_rows = len(df)
    by_name = [c for c in df.columns if c.lower().endswith('_id')]
    by_cardinality = [
        c for c in df.select_dtypes(include=['object']).columns
        if df[c].nunique() == n_rows
    ]
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return [c for c in dict.fromkeys(by_name + by_cardinality) if c != target_col]


def process_file(file_path, target_col=None, bins=3, variance_levels=None):
    """Benchmark logistic regression on PCA-reduced versions of one dataset.

    The file is loaded, ID-like columns are dropped, categorical features are
    one-hot encoded, and the data is split 80/20. Imputation and scaling are
    fitted on the training split only. For each requested variance level a PCA
    is fitted on the training data and a logistic regression is trained and
    scored on the projected data.

    Args:
        file_path: Path to a delimited text file (delimiter is sniffed).
        target_col: Name of the label column; defaults to the last column.
        bins: Number of quantile bins used to discretise a float target.
        variance_levels: Fractions of variance to retain; values >= 1.0 keep
            every component. Defaults to [1.00, 0.99, 0.95, 0.90, 0.85].

    Returns:
        A DataFrame with one row per variance level and the columns
        "Variance Retained", "Number of Components", "Time (s)", "Accuracy".

    Raises:
        KeyError: If ``target_col`` is not a column of the file.
        ValueError: If no row has a non-missing target value.
    """
    if variance_levels is None:
        variance_levels = [1.00, 0.99, 0.95, 0.90, 0.85]

    # 1. Load & sniff delimiter
    df = _read_delimited(file_path)

    # 2. Identify target column
    if target_col is None:
        target_col = df.columns[-1]
    if target_col not in df.columns:
        raise KeyError(f"Column '{target_col}' not found in {file_path}")

    # Rows without a label cannot be used for training or scoring; keeping
    # them would put NaN into y and make the classifier fail.
    df = df.dropna(subset=[target_col])
    if df.empty:
        raise ValueError(
            f"No rows with a non-missing '{target_col}' in {file_path}"
        )

    # 3./4. Prepare X & y; a float target is binned into quantiles so it can
    # be treated as a class label.
    y_raw = df[target_col]
    is_continuous = y_raw.dtype.kind == 'f'
    if is_continuous:
        # duplicates='drop' merges identical bin edges instead of raising
        # when the target has many tied values.
        y = pd.qcut(y_raw, q=bins, labels=False, duplicates='drop').to_numpy()
    else:
        y = y_raw.to_numpy()

    X = df.drop(columns=_id_like_columns(df, target_col) + [target_col])

    # 5. One-hot encode categoricals
    X = pd.get_dummies(X, drop_first=True)

    # 6. Train/test split (done before imputation so that no statistic of the
    #    test rows leaks into the preprocessing)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.20,
        random_state=42,
        stratify=None if is_continuous else y
    )

    # 7. Impute missing values using training-set means only
    imputer = SimpleImputer(strategy='mean').fit(X_train)
    X_train = imputer.transform(X_train)
    X_test = imputer.transform(X_test)

    # 8. Standardize
    scaler = StandardScaler().fit(X_train)
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)

    # 9. PCA + Logistic Regression
    records = []
    for var in variance_levels:
        # None keeps min(n_samples, n_features) components, which also works
        # when the training split has fewer rows than features.
        n_comp_param = None if var >= 1.0 else var
        pca = PCA(n_components=n_comp_param, svd_solver='full').fit(X_train_s)
        X_tr = pca.transform(X_train_s)
        X_te = pca.transform(X_test_s)

        clf = LogisticRegression(solver='lbfgs', max_iter=1000)
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        clf.fit(X_tr, y_train)
        elapsed = time.perf_counter() - start
        acc = clf.score(X_te, y_test)

        records.append((var, pca.n_components_, elapsed, acc))

    results = pd.DataFrame(
        records,
        columns=["Variance Retained", "Number of Components", "Time (s)", "Accuracy"]
    )
    return results

if __name__ == "__main__":
    # Datasets to benchmark, given as full or relative paths.
    files = [
        "/content/ai_dev_productivity.csv",
        "/content/personality_dataset.csv",
        "/content/human_activity_recognition_with_smartphones.csv",
        "/content/student_habits_performance.csv",
        "/content/winequality-red.csv"
    ]

    # Optional per-file target overrides. Keys must be the exact path strings
    # used in `files`; a file without an entry is processed with
    # target_col=None.
    target_map = {
        # "/content/student_habits_performance.csv": "exam_score",
        # "/content/ai_dev_productivity.csv":        "task_success",
    }

    for fp in files:
        print(f"\n=== Results for: {os.path.basename(fp)} ===")
        override = target_map.get(fp)
        results = process_file(fp, target_col=override)
        table = results.to_string(index=False)
        print(table)



=== Results for: winequality-red.csv ===




 Variance Retained  Number of Components  Time (s)  Accuracy
              1.00                    11  0.144193  0.590625
              0.99                    10  0.124519  0.600000
              0.95                     9  0.367768  0.603125
              0.90                     7  0.198867  0.575000
              0.85                     6  0.085734  0.578125

=== Results for: ai_dev_productivity.csv ===




 Variance Retained  Number of Components  Time (s)  Accuracy
              1.00                     8  0.151705      0.93
              0.99                     8  0.044499      0.93
              0.95                     6  0.015794      0.91
              0.90                     5  0.010782      0.80
              0.85                     5  0.010647      0.80

=== Results for: personality_dataset.csv ===




 Variance Retained  Number of Components  Time (s)  Accuracy
              1.00                     7  0.021113  0.905172
              0.99                     6  0.025299  0.905172
              0.95                     5  0.017509  0.917241
              0.90                     3  0.038222  0.917241
              0.85                     2  0.009388  0.917241

=== Results for: human_activity_recognition_with_smartphones.csv ===




 Variance Retained  Number of Components  Time (s)  Accuracy
              1.00                   562 11.171856  0.985044
              0.99                   179  4.658196  0.983005
              0.95                   102  5.545699  0.979606
              0.90                    63  9.421645  0.960571
              0.85                    41 11.812925  0.948334

=== Results for: student_habits_performance.csv ===
 Variance Retained  Number of Components  Time (s)  Accuracy
              1.00                    18  0.025555     0.810
              0.99                    18  0.023478     0.810
              0.95                    17  0.017087     0.805
              0.90                    15  0.009268     0.815
              0.85                    14  0.009040     0.790


