In [1]:
import sys

# Root folder of the assignment project. It is put on the module search path,
# presumably so that project-local packages can be imported further down.
PROJECT_ROOT = "C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/"

# Guarded append: re-running this cell must not stack duplicate entries on
# sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
# Prerequisite: MLflow. If it is missing, install it into the kernel's environment with: %pip install mlflow

In [5]:
import mlflow

# Local, file-based tracking store and the experiment that groups all runs of
# this notebook.
TRACKING_URI = "file:///C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/Model_Building/mlruns"
EXPERIMENT_NAME = "Customer Churn Prediction"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Switch on MLflow autologging so supported libraries record their training
# runs automatically (this is not a notebook-specific mode).
mlflow.autolog()

In [7]:
import os
import mlflow
import mlflow.sklearn
import pandas as pd
from Data_Transformation.connection import SnowflakeConnection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def get_all_features():
    """Retrieve the latest feature row of every customer from the feature store.

    The query ranks the rows of ``CUSTOMER_CHURN.PUBLIC.FEATURE_STORE_TABLE``
    per ``"customerID"`` by ``FEATURE_VERSION`` (newest first) and keeps only
    the top-ranked row of each customer.

    Returns:
        Whatever ``SnowflakeConnection.execute_query`` returns for the query
        (the rest of this notebook treats it as a pandas DataFrame). Because
        the outer ``SELECT *`` reads from the CTE, the result also carries the
        helper ranking column ``rn``.

    Raises:
        Any exception raised by ``execute_query`` is propagated unchanged; the
        connection is closed before it leaves this function.
    """
    query = """
    WITH LatestFeatures AS (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY "customerID" ORDER BY FEATURE_VERSION DESC) AS rn
        FROM CUSTOMER_CHURN.PUBLIC.FEATURE_STORE_TABLE
    )
    SELECT * FROM LatestFeatures WHERE rn = 1;
    """
    conn = SnowflakeConnection()
    try:
        return conn.execute_query(query)
    finally:
        # Always release the connection, even when the query fails; otherwise
        # every failed call would leave a connection open.
        conn.close()


# Folder that receives the files written by this notebook.
OUTPUT_DIR = "C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/Model_Building/"
# Create the folder up front (a no-op when it already exists) so that saving
# the results later does not depend on it having been created by hand.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the most recent feature row of every customer from the feature store.
df = get_all_features()

# The customer identifier is not used as a predictor; drop it when present.
if "customerID" in df.columns:
    df = df.drop(columns=["customerID"])

# The target column is mandatory: fail early with a clear message without it.
if "Churn" not in df.columns:
    raise KeyError("The target column 'Churn' is missing from the dataset.")

# Everything except the target is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# 80/20 hold-out split, stratified on the target, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics on the training part only, then apply the same
# transformation to both parts so no test information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate classifiers, keyed by the name used for the MLflow run and in the
# printed report.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}

# Select the experiment explicitly so this cell logs to it regardless of what
# was selected before.
mlflow.set_experiment("Customer Churn Prediction")

# One MLflow run per candidate: fit on the training split, score on the
# hold-out split, log metrics and model, and keep a row for the summary table.
results = []
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Hold-out scores, keyed by the metric name logged to MLflow.
        scores = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
        }
        accuracy, precision, recall, f1 = scores.values()

        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted estimator as a run artifact named after the model.
        mlflow.sklearn.log_model(model, name)
        print(f"{name} Model logged to MLflow")

        results.append([name, *scores.values()])

        # Same report as five separate prints: one item per line, with a blank
        # line after the last one.
        print(
            f"{name} Performance:",
            f"Accuracy: {accuracy:.4f}",
            f"Precision: {precision:.4f}",
            f"Recall: {recall:.4f}",
            f"F1-Score: {f1:.4f}\n",
            sep="\n",
        )

# Turn the collected per-model rows into a table and write it as CSV into the
# output folder (without the index column).
REPORT_COLUMNS = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]
results_df = pd.DataFrame(results, columns=REPORT_COLUMNS)

results_path = os.path.join(OUTPUT_DIR, "model_performance.csv")
results_df.to_csv(results_path, index=False)
print(f"Model performance results saved at '{results_path}'.")

2025/03/13 21:33:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Logistic Regression Model logged to MLflow
Logistic Regression Performance:
Accuracy: 0.8204
Precision: 0.6933
Recall: 0.5802
F1-Score: 0.6317





Random Forest Model logged to MLflow
Random Forest Performance:
Accuracy: 0.8013
Precision: 0.6667
Recall: 0.5027
F1-Score: 0.5732

Model performance results saved at 'C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/Model_Building/model_performance.csv'.


In [None]:
# Start the MLflow tracking UI from the notebook via a shell command.
# NOTE(review): this appears to run in the foreground, so the cell keeps
# executing until it is interrupted (it has no execution count) — consider
# running it from a separate terminal instead.
# NOTE(review): no --backend-store-uri is passed; presumably the UI then reads
# the default store of the current working directory — confirm that this is the
# same location as the tracking URI set at the top of the notebook.
!mlflow ui

In [9]:
import pickle
import mlflow.sklearn

# Artifact folder of the MLflow run whose model is exported.
# NOTE(review): the run ID is a hard-coded literal and is not derived from the
# metrics computed in this notebook — confirm it still points at the run that
# should count as "best".
logged_model = "C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/Model_Building/mlruns/621738802032390636/87b7aebe71af4ad291e41fa55e58f8f9/artifacts/model"
model = mlflow.sklearn.load_model(logged_model)

# Serialise the loaded estimator into a standalone pickle file.
model_path = "C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/Model_Building/best_model.pkl"
with open(model_path, "wb") as file:
    file.write(pickle.dumps(model))

print(f"Best model saved at: {model_path}")

Best model saved at: C:/Users/Subbu/OneDrive/Desktop/M.TechStuff/Sem2/2.DMML/Assignment/Model_Building/best_model.pkl


In [11]:
# Read the pickled estimator back from disk to check that the export
# round-trips. The file was written by this notebook; do not point this at
# pickle files from untrusted sources.
with open(model_path, "rb") as file:
    loaded_model = pickle.loads(file.read())

# Smoke test: the reloaded estimator must be able to score the test features
# prepared earlier in the notebook.
y_pred = loaded_model.predict(X_test)

print("Model successfully loaded and predictions made.")

Model successfully loaded and predictions made.
