# Model Seclection

In [4]:
# Import libraries
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load data
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv")
y_test = pd.read_csv("../data/processed/y_test.csv")

# Load selected features from Task 3
selected_features = joblib.load("../models/selected_features.pkl")

# Filter datasets
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

print("✅ Data prepared for Task 4 with selected features:", selected_features)


✅ Data prepared for Task 4 with selected features: ['tenure', 'InternetService_Fiber optic', 'OnlineSecurity_No internet service', 'OnlineBackup_No internet service', 'DeviceProtection_No internet service', 'TechSupport_No internet service', 'StreamingTV_No internet service', 'StreamingMovies_No internet service', 'Contract_Two year', 'PaymentMethod_Electronic check']


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

# Define candidate models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Evaluate models
results = {}

for name, model in models.items():
    model.fit(X_train_selected, y_train.values.ravel())
    y_pred = model.predict(X_test_selected)
    
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_pred)
    }

# Convert to DataFrame for easy comparison
results_df = pd.DataFrame(results).T
results_df.sort_values(by="ROC-AUC", ascending=False)


Unnamed: 0,Accuracy,Precision,Recall,F1-Score,ROC-AUC
Gradient Boosting,0.794579,0.644599,0.498652,0.56231,0.69986
Logistic Regression,0.798146,0.669231,0.469003,0.551506,0.692794
Random Forest,0.776034,0.603636,0.447439,0.513932,0.670858


In [6]:
import joblib

# Save best model (Gradient Boosting)
best_model = models["Gradient Boosting"]
joblib.dump(best_model, "../models/gradient_boosting_model.pkl")

print("✅ Gradient Boosting model saved successfully for Task 5.")


✅ Gradient Boosting model saved successfully for Task 5.


### 🧩 Task 4 Summary: Model Selection

I compared three models — **Logistic Regression**, **Random Forest**, and **Gradient Boosting** — using the top 10 features selected in Task 3.  
Performance was evaluated using Accuracy, Precision, Recall, F1-Score, and ROC-AUC.

| Model | Accuracy | Precision | Recall | F1-Score | ROC-AUC |
|--------|-----------|------------|----------|------------|-----------|
| Gradient Boosting | 0.7946 | 0.6446 | 0.4987 | 0.5623 | **0.6999** |
| Logistic Regression | 0.7981 | **0.6692** | 0.4690 | 0.5515 | 0.6928 |
| Random Forest | 0.7760 | 0.6036 | 0.4474 | 0.5139 | 0.6709 |

**Conclusion:** Gradient Boosting achieved the best balance between precision and recall, with the highest ROC-AUC score.  
Hence, it was chosen as the **best model** for further fine-tuning and evaluation in **Task 5**.
