# In [5]:
# logistic_train_from_csv.py

# Loads dataset from DATA_PATH and runs the LogisticRegression pipeline + evaluation plots.

# Place the CSV in the same folder or update DATA_PATH accordingly, then run:

#   python logistic_train_from_csv.py



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

# ---------- CONFIG ----------
# Input CSV, resolved relative to the working directory unless given as an
# absolute path.
DATA_PATH = "project_risk_raw_dataset.csv"   # <--- your CSV file
# Fraction of rows held out for evaluation.
TEST_SIZE = 0.20
# Seed passed to both the train/test split and the classifier.
RANDOM_STATE = 42
TARGET_COL = 'Risk_Level'   # change if your target has a different name
# ----------------------------

# Load the dataset.
# The read is attempted directly rather than pre-checked with
# pd.io.common.file_exists: that helper is internal to pandas (not public API),
# and a check-then-open sequence can still fail between the two steps.
# A missing file is re-raised with the same actionable message as before.
try:
    PROJECTS = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Put the CSV there or change DATA_PATH."
    ) from err
print("Loaded dataset shape:", PROJECTS.shape)

# Fail early, listing the available columns, when the label column is absent.
if TARGET_COL not in PROJECTS.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataset. Available columns: {PROJECTS.columns.tolist()}")

# 1) Separate the label column from the predictor columns
targets = PROJECTS[TARGET_COL]
features = PROJECTS.drop(columns=[TARGET_COL])

# 2) Hold out a test set; stratifying on the label keeps the class
#    proportions the same in both partitions
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
    "stratify": targets,
}
(features_train, features_test,
 targets_train, targets_test) = train_test_split(features, targets, **split_options)

# 3) Build preprocessing pipeline
# Columns are routed by dtype. Any column whose dtype is in neither list
# (e.g. datetimes) is discarded by the ColumnTransformer (remainder='drop').
numeric_cols = features.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()
categorical_cols = features.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print("Numeric columns:", len(numeric_cols), "Categorical columns:", len(categorical_cols))

# numeric pipeline: median impute + standard scale
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing `sparse=False` raises
# TypeError on current releases. Try the new name first and fall back to the
# old one so the script runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# categorical pipeline: most frequent impute + one-hot encode
# (handle_unknown='ignore' encodes categories unseen during fit as all zeros)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
], remainder='drop')

# 4) Assemble the model: preprocessing followed by a logistic-regression classifier
classifier = LogisticRegression(random_state=RANDOM_STATE, max_iter=2000)
pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', classifier),
])

# 5) Train on the training partition
pipeline.fit(features_train, targets_train)

# 6) Score the held-out partition: hard labels first, then class probabilities
targets_pred = pipeline.predict(features_test)
y_score = pipeline.predict_proba(features_test)

# 7) Confusion matrix plot
# Every label present in the full dataset gets a row/column, whether or not it
# occurs in the test partition.
class_labels = np.unique(PROJECTS[TARGET_COL])
cm = confusion_matrix(targets_test, targets_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap="Blues")
# Replace the default title/axis labels on the axes the display drew on.
disp.ax_.set(
    title='LogisticRegression Confusion Matrix',
    xlabel='Predicted Project Risk Level',
    ylabel='True Project Risk Level',
)
disp.figure_.tight_layout()
plt.show()

# 8) Classification report heatmap
# `labels` is passed together with `target_names` so each display name is tied
# to its class even when a class is missing from the test partition; without
# it the names are matched against whatever labels happen to occur.
# Names are converted to str because target_names are display strings, and
# zero_division=0 matches the text report printed at the end of the script.
report_names = [str(label) for label in class_labels]
report = classification_report(
    targets_test,
    targets_pred,
    labels=class_labels,
    target_names=report_names,
    output_dict=True,
    zero_division=0,
)
# Keep only the per-class rows and the three score columns, selected by name
# rather than position so the summary rows (accuracy / averages) and the
# `support` column are excluded regardless of how many there are.
report_df = pd.DataFrame(report).transpose().loc[report_names, ['precision', 'recall', 'f1-score']]
plt.figure(figsize=(10, 5))
sns.heatmap(report_df, annot=True, cmap='Blues', fmt='.2f')
plt.title('LogisticRegression Classification Report')
plt.xlabel('Metrics')
plt.ylabel('Project Risk Level')
plt.tight_layout()
plt.show()

# 9) ROC curves (one-vs-rest)
# Use the fitted model's own class order: column i of predict_proba
# corresponds to pipeline.classes_[i], so binarizing against the same array
# guarantees that truth column i and score column i describe the same class.
classes_sorted = pipeline.classes_
y_test_bin = label_binarize(targets_test, classes=classes_sorted)
if y_test_bin.shape[1] == 1:
    # For a two-class target label_binarize returns a single column that marks
    # the SECOND class. Pairing it with y_score[:, 0] (probability of the first
    # class) would draw an inverted, mislabeled curve, so expand it into one
    # indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

plt.figure(figsize=(10, 10))
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{classes_sorted[i]} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('LogisticRegression ROC Curves')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

# 10) Console summary: model name, partition sizes, then the plain-text report
print("Model: LogisticRegression")
for caption, partition in (("Train shape:", features_train),
                           ("Test shape:", features_test)):
    print(caption, partition.shape)
print("\nClassification report (text):\n")
text_report = classification_report(targets_test, targets_pred, zero_division=0)
print(text_report)

# Output captured from the run above:
# FileNotFoundError: Dataset not found at 'project_risk_raw_dataset.csv'. Put the CSV there or change DATA_PATH.