In [2]:
!pip list

Package                            Version
---------------------------------- -------------------
absl-py                            1.4.0
accelerate                         1.3.0
aiohappyeyeballs                   2.6.1
aiohttp                            3.11.13
aiosignal                          1.3.2
alabaster                          1.0.0
albucore                           0.0.23
albumentations                     2.0.5
ale-py                             0.10.2
altair                             5.5.0
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.7.1
arviz                              0.20.0
astropy                            7.0.1
astropy-iers-data                  0.2025.3.10.0.29.26
astunparse                         1.6.3
atpublic                           4.1.0
attrs                              25.2.0
audioread          

In [6]:
from pathlib import Path

import pandas as pd

# Load the dataset. The file has no header row, and the missing-value marker
# is matched together with its leading space (" ?").
data = pd.read_csv("/content/adult.csv", header=None, na_values=" ?")

# Assign column names (raises if the file does not have exactly 15 columns)
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num", "marital_status",
    "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss",
    "hours_per_week", "native_country", "income"
]
data.columns = columns

# Define schema: numerical columns as int64, everything else as category
schema = {
    "age": "int64",  # Numerical
    "workclass": "category",  # Categorical
    "fnlwgt": "int64",  # Numerical
    "education": "category",  # Categorical
    "education_num": "int64",  # Numerical
    "marital_status": "category",  # Categorical
    "occupation": "category",  # Categorical
    "relationship": "category",  # Categorical
    "race": "category",  # Categorical
    "sex": "category",  # Categorical
    "capital_gain": "int64",  # Numerical
    "capital_loss": "int64",  # Numerical
    "hours_per_week": "int64",  # Numerical
    "native_country": "category",  # Categorical
    "income": "category"  # Target (Categorical)
}

# Apply schema: astype accepts the column -> dtype mapping directly, so no
# per-column loop (or category/non-category branching) is needed.
data = data.astype(schema)

# Save as Parquet. Create the target directory first so the write does not
# fail on a fresh runtime where it does not exist yet.
adult_parquet_path = Path("/content/datasets/adult.parquet")
adult_parquet_path.parent.mkdir(parents=True, exist_ok=True)
data.to_parquet(adult_parquet_path, index=False)

In [2]:
!pip install ydata_profiling



In [7]:
from ydata_profiling import ProfileReport

# Generate profile report.
# ydata-profiling exports HTML (or JSON) only; a ".pdf" name is not honoured,
# so use the extension that matches the file that actually gets written.
profile = ProfileReport(data, title="Adult Dataset Profile Report")
profile.to_file("datasets/adult_profile_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the "production" set.
train_test_data, prod_data = train_test_split(data, test_size=0.2, random_state=42)

# Split the remaining rows 75/25 into train and test.
train_data, test_data = train_test_split(train_test_data, test_size=0.25, random_state=42)

# Persist each split as Parquet (without the index), in train/test/prod order.
split_outputs = {
    "datasets/train_data.parquet": train_data,
    "datasets/test_data.parquet": test_data,
    "datasets/prod_data.parquet": prod_data,
}
for split_path, split_frame in split_outputs.items():
    split_frame.to_parquet(split_path, index=False)

In [13]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.21.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.0 (from mlflow)
  Downloading mlflow_skinny-2.21.0-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.0->mlflow)
  Downloading databricks_sdk-0.46.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.0->mlflow)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.0->mlflow)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 k

In [15]:
# Import necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier  # Placeholder model

# Step 1: Fetch the train/test splits (Parquet) from the pinned GitHub commit
train_data_url = "https://raw.githubusercontent.com/Mukesh-Khemani/MLOps_FinalProject/ce482a767db3df114d27a5aebe3f41ca317369a4/train_data.parquet"
test_data_url = "https://raw.githubusercontent.com/Mukesh-Khemani/MLOps_FinalProject/ce482a767db3df114d27a5aebe3f41ca317369a4/test_data.parquet"

train_data = pd.read_parquet(train_data_url)
test_data = pd.read_parquet(test_data_url)

# "income" is the target; every other column is a feature
X_train = train_data.drop(columns="income")
y_train = train_data["income"]
X_test = test_data.drop(columns="income")
y_test = test_data["income"]

# Step 2: Preprocessing
# Columns routed to the numeric branch
numeric_features = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Columns routed to the categorical branch
categorical_features = [
    "workclass",
    "education",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native_country",
]

# Numeric branch: median imputation followed by standardisation
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Step 3: Full ML pipeline = preprocessing + a placeholder classifier
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Show the pipeline structure
print("Pipeline Created:", pipeline, sep="\n")

# Optional: keep a text copy of the structure for reference
with open("pipeline_structure.txt", "w") as structure_file:
    structure_file.write(str(pipeline))

print("Pipeline structure saved to 'pipeline_structure.txt'")

Pipeline Created:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fnlwgt',
                                                   'education_num',
                                                   'capital_gain',
                                                   'capital_loss',
                                                   'hours_per_week']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   Simp

In [38]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import joblib


# Step 2: Define a function to evaluate and log experiments
def evaluate_and_log_experiment(model_name, pipeline, X_train, y_train, X_test, y_test,
                                params=None, pos_label=" >50K", n_splits=5, random_state=42):
    """Cross-validate, fit and test a pipeline inside one MLflow run.

    The run is named ``model_name``. Within it the function logs the given
    ``params`` (if any), the mean/std of shuffled k-fold cross-validation
    accuracy on the training data, the accuracy/precision/recall/F1 measured
    on the test data, and the fitted pipeline itself as a scikit-learn model
    under the artifact path ``"<model_name>_model"``. The same figures are
    also printed.

    Args:
        model_name: Name of the MLflow run and prefix of the model artifact path.
        pipeline: Estimator to evaluate. It is fitted in place on the training data.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target.
        params: Optional mapping of parameter names to values to log with the run.
        pos_label: Label treated as the positive class for precision/recall/F1.
        n_splits: Number of cross-validation folds.
        random_state: Seed for the fold shuffling.

    Returns:
        The accuracy on the test set.
    """
    with mlflow.start_run(run_name=model_name):
        # Log parameters in a single batch call
        if params:
            mlflow.log_params(params)

        # Evaluate with shuffled k-fold cross-validation on the training data
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring="accuracy")
        cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
        mlflow.log_metric("cv_accuracy_mean", cv_mean)
        mlflow.log_metric("cv_accuracy_std", cv_std)

        # Train the model on the full training set
        pipeline.fit(X_train, y_train)

        # Evaluate on the test set
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=pos_label)
        recall = recall_score(y_test, y_pred, pos_label=pos_label)
        f1 = f1_score(y_test, y_pred, pos_label=pos_label)

        # Log test set metrics
        mlflow.log_metrics({
            "test_accuracy": float(accuracy),
            "test_precision": float(precision),
            "test_recall": float(recall),
            "test_f1_score": float(f1),
        })

        # Log the fitted model
        mlflow.sklearn.log_model(pipeline, f"{model_name}_model")

        # Print results
        print(f"Experiment: {model_name}")
        print(f"Cross-Validation Accuracy: {cv_mean:.4f} ± {cv_std:.4f}")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test Precision: {precision:.4f}")
        print(f"Test Recall: {recall:.4f}")
        print(f"Test F1 Score: {f1:.4f}")

        return accuracy  # Return accuracy for comparison

from sklearn.base import clone

# Step 3: Run multiple experiments
# One classifier per experiment; every experiment uses the same preprocessing recipe.
classifiers = {
    "Baseline_RandomForest": RandomForestClassifier(random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Tuned_RandomForest": RandomForestClassifier(
        n_estimators=200, max_depth=15, min_samples_split=5, random_state=42
    ),
    "Gradient_Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
    "Decision_Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
}

# Each pipeline gets its own unfitted copy of the preprocessor so that fitting
# one experiment never touches the transformer state held by another.
experiments = {
    name: Pipeline(steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", classifier),
    ])
    for name, classifier in classifiers.items()
}

# Track metrics for all experiments.
# The loop variable is deliberately not called `pipeline`, so it does not
# overwrite the notebook-level `pipeline` defined in an earlier cell.
experiment_metrics = {}
for name, experiment_pipeline in experiments.items():
    # Pass the classifier's hyperparameters so they are recorded with the run
    classifier_params = experiment_pipeline.named_steps["classifier"].get_params()
    experiment_metrics[name] = evaluate_and_log_experiment(
        name, experiment_pipeline, X_train, y_train, X_test, y_test,
        params=classifier_params,
    )

# Step 4: Identify the best experiment
# NOTE(review): the winner is chosen by test accuracy, so the reported test
# score of the selected model is optimistic; consider selecting on CV accuracy.
best_experiment = max(experiment_metrics, key=experiment_metrics.get)
best_accuracy = experiment_metrics[best_experiment]
print(f"\nBest Experiment: {best_experiment} with Test Accuracy: {best_accuracy:.4f}")

# Step 5: Save the best model as a .pkl file
best_pipeline = experiments[best_experiment]
joblib.dump(best_pipeline, f"{best_experiment}_best_model.pkl")
print(f"Saved the best model as '{best_experiment}_best_model.pkl'")



Experiment: Baseline_RandomForest
Cross-Validation Accuracy: 0.8516 ± 0.0052
Test Accuracy: 0.8490
Test Precision: 0.7032
Test Recall: 0.6221
Test F1 Score: 0.6602




Experiment: Logistic_Regression
Cross-Validation Accuracy: 0.8509 ± 0.0048
Test Accuracy: 0.8451
Test Precision: 0.7002
Test Recall: 0.5993
Test F1 Score: 0.6458




Experiment: Tuned_RandomForest
Cross-Validation Accuracy: 0.8600 ± 0.0040
Test Accuracy: 0.8587
Test Precision: 0.7630
Test Recall: 0.5811
Test F1 Score: 0.6598




Experiment: Gradient_Boosting
Cross-Validation Accuracy: 0.8653 ± 0.0037
Test Accuracy: 0.8613
Test Precision: 0.7607
Test Recall: 0.6007
Test F1 Score: 0.6713




Experiment: Decision_Tree
Cross-Validation Accuracy: 0.8511 ± 0.0059
Test Accuracy: 0.8507
Test Precision: 0.7441
Test Recall: 0.5590
Test F1 Score: 0.6384

Best Experiment: Gradient_Boosting with Test Accuracy: 0.8613
Saved the best model as 'Gradient_Boosting_best_model.pkl'
