<a href="https://colab.research.google.com/github/ahmedsyed00000-hash/Nick-and-Ahmed/blob/main/Master_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import logging
from datetime import datetime
import os

# -----------------------------
# Logging setup
# -----------------------------
# Send INFO-and-above records to both a local log file and the console.
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() silently does nothing whenever the root logger is
# already configured (hosted notebooks, or re-running this cell), and the
# FileHandler created here would be discarded while still holding the file open.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        logging.FileHandler("process_log.log"),
        logging.StreamHandler()
    ],
    force=True,
)

# -----------------------------
# Google Drive data path
# -----------------------------
# Raw activity log, plus the two reference ("golden") tables that the built
# outputs are validated against further down.
RAW_DATA_PATH = "/content/drive/MyDrive/Data tests/credit_card_process_activities.csv"
GOLDEN_STAGE_PATH = "/content/drive/MyDrive/Data tests/golden_stage_master.csv"
GOLDEN_APP_PATH = "/content/drive/MyDrive/Data tests/golden_application_master.csv"

# -----------------------------
# Load dataset
# -----------------------------
# Stop immediately (after logging) when the raw CSV is not where we expect it.
if os.path.exists(RAW_DATA_PATH):
    df = pd.read_csv(RAW_DATA_PATH)
else:
    logging.error(f"❌ Raw data not found at {RAW_DATA_PATH}")
    raise FileNotFoundError(f"Missing file: {RAW_DATA_PATH}")

logging.info(f"✅ Loaded raw dataset with {df.shape[0]} rows and {df.shape[1]} columns.")

# Convert timestamps safely: columns that are absent are skipped, and values
# that cannot be parsed become NaT instead of raising.
for col in ("Activity_Timestamp", "Stage_Start_Timestamp", "Stage_End_Timestamp"):
    if col not in df.columns:
        continue
    df[col] = pd.to_datetime(df[col], errors="coerce")

# -----------------------------
# Data Quality Checks
# -----------------------------
def run_data_quality_checks(df):
    """Run basic data-quality checks on the activity frame and log the outcome.

    Checks performed:
      * null counts per column (only columns with at least one null are listed);
      * fully duplicated rows;
      * rows whose ``Stage_Start_Timestamp`` is later than
        ``Stage_End_Timestamp`` (skipped when either column is absent; rows
        where either value is missing never count as invalid).

    A warning is logged when any check finds something, otherwise an info
    message is logged.

    Args:
        df: Activity-level DataFrame to inspect. It is not modified.

    Returns:
        dict: Always contains ``"nulls"`` (column name -> null count, possibly
        empty). ``"duplicates"`` and ``"invalid_timestamps"`` are present only
        when their count is greater than zero.
    """
    issues = {}

    null_counts = df.isnull().sum()
    issues["nulls"] = {c: int(v) for c, v in null_counts[null_counts > 0].items()}

    duplicates = int(df.duplicated().sum())
    if duplicates > 0:
        issues["duplicates"] = duplicates

    # Same presence guard as the timestamp conversion step: a frame without
    # the stage timestamp columns simply skips this check.
    if {"Stage_Start_Timestamp", "Stage_End_Timestamp"}.issubset(df.columns):
        invalid_count = int((df["Stage_Start_Timestamp"] > df["Stage_End_Timestamp"]).sum())
        if invalid_count > 0:
            issues["invalid_timestamps"] = invalid_count

    # "nulls" is always present (possibly empty), so the dict itself is always
    # truthy; look at the findings instead of the container.
    if any(issues.values()):
        logging.warning(f"⚠️ Data quality issues detected: {issues}")
    else:
        logging.info("✅ Data quality checks passed.")
    return issues

# Run the checks once on the loaded data; the returned dict of findings is
# kept in dq_issues.
dq_issues = run_data_quality_checks(df)

# -----------------------------
# ProcessMasterBuilder
# -----------------------------
class ProcessMasterBuilder:
    """Aggregate activity-level rows into stage-level and application-level tables.

    Both tables are built eagerly in ``__init__`` and exposed as the
    ``stage_master`` and ``application_master`` DataFrame attributes.

    NOTE(review): "first"/"last" values (product, channel, risk grade,
    decision, status) are taken by row position within each group, i.e. in
    the order the rows appear in ``activity_df``. This presumably relies on
    the input already being in chronological order — confirm, or sort upstream.
    """

    # Fixed column order of the outputs, so that an empty input still yields
    # frames (and CSV files) with the expected headers.
    _STAGE_COLUMNS = [
        'Application_ID', 'Stage', 'Stage_Start', 'Stage_End', 'TAT_Minutes',
        'Age_Days', 'Risk_Grade', 'UW_Decision', 'Stage_Status',
        'Performed_By', 'Issues_Count',
    ]
    _APP_COLUMNS = [
        'Application_ID', 'Product_Type', 'Channel', 'Application_Start',
        'Application_End', 'Total_TAT_Minutes', 'Age_Days', 'Final_Risk_Grade',
        'Final_UW_Decision', 'Application_Status', 'Performed_By',
        'Issues_Count',
    ]

    def __init__(self, activity_df, as_of=None):
        """Copy the input and build both master tables.

        Args:
            activity_df: Activity-level DataFrame. A copy is stored, so the
                caller's frame is never modified.
            as_of: Optional reference time used for the application-level
                ``Age_Days`` (anything ``pd.Timestamp`` accepts). When omitted,
                the current local time at build time is used, which makes
                ``Age_Days`` change from run to run; pass a fixed value for
                reproducible output.
        """
        self.activity_df = activity_df.copy()
        self.as_of = as_of
        self.application_master = pd.DataFrame()
        self.stage_master = pd.DataFrame()
        self.build_stage_master()
        self.build_application_master()

    @staticmethod
    def _join_performers(group):
        """Return the distinct non-null ``Performed_By`` values, comma-separated."""
        # str() so numeric performer ids do not make join() raise.
        return ", ".join(map(str, group['Performed_By'].dropna().unique()))

    def build_stage_master(self):
        """Build ``self.stage_master`` with one row per (Application_ID, Stage).

        A group that fails to build is logged and skipped (best effort), so
        one bad group does not abort the whole table.
        """
        has_uw_decision = 'UW_Decision' in self.activity_df.columns
        # Earliest activity per application, computed once instead of
        # re-filtering the whole frame for every (application, stage) group.
        if 'Activity_Timestamp' in self.activity_df.columns:
            app_starts = self.activity_df.groupby('Application_ID')['Activity_Timestamp'].min()
        else:
            app_starts = None

        stage_summary = []
        for (app_id, stage), group in self.activity_df.groupby(['Application_ID', 'Stage']):
            try:
                stage_start = group['Stage_Start_Timestamp'].min()
                stage_end = group['Stage_End_Timestamp'].max()
                if pd.notnull(stage_start) and pd.notnull(stage_end):
                    tat_minutes = (stage_end - stage_start).total_seconds() / 60
                else:
                    tat_minutes = None

                if app_starts is None:
                    # Keep the per-group "log and skip" behaviour for a missing column.
                    raise KeyError('Activity_Timestamp')
                app_start = app_starts.loc[app_id]
                # Whole days from the application's first activity to the end of this stage.
                if pd.notnull(stage_end) and pd.notnull(app_start):
                    age_days = (stage_end - app_start).days
                else:
                    age_days = None

                stage_summary.append({
                    'Application_ID': app_id,
                    'Stage': stage,
                    'Stage_Start': stage_start,
                    'Stage_End': stage_end,
                    'TAT_Minutes': tat_minutes,
                    'Age_Days': age_days,
                    'Risk_Grade': group['Risk_Grade'].iloc[-1],
                    'UW_Decision': group['UW_Decision'].iloc[-1] if has_uw_decision else None,
                    'Stage_Status': group['Status_After_Activity'].iloc[-1],
                    'Performed_By': self._join_performers(group),
                    'Issues_Count': 0,
                })
            except Exception as e:
                logging.error(f"Error building stage for App={app_id}, Stage={stage} | {e}")
        self.stage_master = pd.DataFrame(stage_summary, columns=self._STAGE_COLUMNS)
        logging.info(f"✅ Stage Master built with {len(self.stage_master)} rows.")

    def build_application_master(self):
        """Build ``self.application_master`` with one row per Application_ID.

        A group that fails to build is logged and skipped (best effort).
        """
        has_uw_decision = 'UW_Decision' in self.activity_df.columns
        # One reference instant for the whole build so every row ages against the same clock.
        if self.as_of is not None:
            reference = pd.Timestamp(self.as_of)
        else:
            reference = pd.Timestamp(datetime.now())

        app_summary = []
        for app_id, group in self.activity_df.groupby('Application_ID'):
            try:
                app_start = group['Activity_Timestamp'].min()
                app_end = group['Activity_Timestamp'].max()
                if pd.notnull(app_start) and pd.notnull(app_end):
                    total_tat_minutes = (app_end - app_start).total_seconds() / 60
                else:
                    total_tat_minutes = None
                total_age_days = (reference - app_start).days if pd.notnull(app_start) else None

                app_summary.append({
                    'Application_ID': app_id,
                    'Product_Type': group['Product_Type'].iloc[0],
                    'Channel': group['Channel'].iloc[0],
                    'Application_Start': app_start,
                    'Application_End': app_end,
                    'Total_TAT_Minutes': total_tat_minutes,
                    'Age_Days': total_age_days,
                    'Final_Risk_Grade': group['Risk_Grade'].iloc[-1],
                    'Final_UW_Decision': group['UW_Decision'].iloc[-1] if has_uw_decision else None,
                    'Application_Status': group['Status_After_Activity'].iloc[-1],
                    'Performed_By': self._join_performers(group),
                    'Issues_Count': 0,
                })
            except Exception as e:
                logging.error(f"Error building application master for App={app_id} | {e}")
        self.application_master = pd.DataFrame(app_summary, columns=self._APP_COLUMNS)
        logging.info(f"✅ Application Master built with {len(self.application_master)} rows.")

# -----------------------------
# Build Master Tables
# -----------------------------
# The constructor builds both tables immediately; the results are exposed as
# builder.stage_master and builder.application_master.
builder = ProcessMasterBuilder(df)

# -----------------------------
# Validation against Golden Data
# -----------------------------
def _diff_against_golden(actual, golden):
    """Return ``(matches, report)`` for one output table versus its golden copy."""
    try:
        # Default keep_shape=False: the result holds only the differing cells,
        # so an empty result really means "no differences".
        diff = actual.compare(golden)
    except ValueError as exc:
        # compare() refuses frames whose index or columns differ; that is a
        # validation failure in its own right, not a crash.
        return False, f"Cannot compare cell-by-cell: {exc}"
    return diff.empty, diff


def validate_outputs(stage_master, app_master, golden_stage_path, golden_app_path):
    """Compare the built master tables with the golden CSV files.

    Logs the outcome and, when anything differs, prints a report for both
    tables: either the differing cells or the reason the tables could not be
    compared cell by cell (different shape or labels).

    Args:
        stage_master: Stage-level DataFrame to validate.
        app_master: Application-level DataFrame to validate.
        golden_stage_path: Path of the golden stage CSV (``Stage_Start`` and
            ``Stage_End`` are parsed as dates).
        golden_app_path: Path of the golden application CSV
            (``Application_Start`` and ``Application_End`` are parsed as dates).

    Returns:
        ``True`` when both tables match, ``False`` when at least one differs,
        ``None`` when validation was skipped because a golden file is missing.
    """
    if not os.path.exists(golden_stage_path) or not os.path.exists(golden_app_path):
        logging.warning("⚠️ Golden datasets not found. Skipping validation.")
        return None

    golden_stage = pd.read_csv(golden_stage_path, parse_dates=["Stage_Start", "Stage_End"])
    golden_app = pd.read_csv(golden_app_path, parse_dates=["Application_Start", "Application_End"])

    stage_ok, diff_stage = _diff_against_golden(stage_master, golden_stage)
    app_ok, diff_app = _diff_against_golden(app_master, golden_app)

    if stage_ok and app_ok:
        logging.info("✅ Validation passed against golden sample.")
        return True

    logging.warning("⚠️ Validation differences detected.")
    print("\nStage Master differences:\n", diff_stage)
    print("\nApplication Master differences:\n", diff_app)
    return False

# Check the freshly built tables against the golden CSVs; the check is
# skipped with a warning when either golden file is missing.
validate_outputs(builder.stage_master, builder.application_master, GOLDEN_STAGE_PATH, GOLDEN_APP_PATH)

# -----------------------------
# Save outputs back to Google Drive
# -----------------------------
output_stage_path = "/content/drive/MyDrive/data sets/output_stage_master.csv"
output_app_path = "/content/drive/MyDrive/data sets/output_application_master.csv"

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directories exist before writing (a no-op when they already do).
os.makedirs(os.path.dirname(output_stage_path), exist_ok=True)
os.makedirs(os.path.dirname(output_app_path), exist_ok=True)

# index=False: the row index carries no information, so keep it out of the CSVs.
builder.stage_master.to_csv(output_stage_path, index=False)
builder.application_master.to_csv(output_app_path, index=False)

logging.info(f"✅ Saved Stage Master to {output_stage_path}")
logging.info(f"✅ Saved Application Master to {output_app_path}")

# -----------------------------
# Display small preview
# -----------------------------
# Show every column and allow wide rows so the previews are not elided.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Print the first ten rows of each master table under its own heading.
for _heading, _table in (
    ("\n✅ Final Stage Master (sample):", builder.stage_master),
    ("\n✅ Final Application Master (sample):", builder.application_master),
):
    print(_heading)
    print(_table.head(10))





Stage Master differences:
     Application_ID       Stage       Stage_Start       Stage_End        \
              self other  self other        self other      self other   
0              NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
1              NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
2              NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
3              NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
4              NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
..             ...   ...   ...   ...         ...   ...       ...   ...   
245            NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
246            NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
247            NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
248            NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   
249            NaN   NaN   NaN   NaN         NaT   NaT       NaT   NaT   

    TAT_M