In [6]:
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# --- Feature preprocessing ---
# Column groups are chosen by dtype: numeric columns vs. object (string) columns.
num_features = selector(dtype_include='number')
cat_features = selector(dtype_include='object')

# Numeric branch: fill missing values with the column median, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored rather than raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])

# Candidate classifiers, keyed by the display name used in the printed report
# and in the saved model's filename.
# The tree ensembles are randomized; seed them (same seed as the train/test
# split) so the reported accuracies and the selected best model are
# reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
}

# Load the dataset and separate the features from the 'Churn' target.
data = pd.read_csv('./data/customer_churn_org.csv')
X = data.drop(columns='Churn')
# Select the target by name rather than by position: taking the last column
# only works while 'Churn' happens to be last, and otherwise would silently
# train on the wrong target while leaving that column in X.
y = data['Churn']

# Hold out 30% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Track the best-scoring pipeline seen so far. Start below any achievable
# accuracy so that the first trained model is always recorded, even if it
# scores 0.0.
best_model = None
best_accuracy = float('-inf')
best_model_name = ''

# NOTE(review): the winner is chosen on the same test split that is used to
# report accuracy, so the best model's reported score is optimistically
# biased. Consider selecting via cross-validation on the training data.
# NOTE(review): every pipeline wraps the same `preprocessor` object, so it is
# presumably refit in place on each iteration. That is harmless here because
# it is always refit on the same X_train, but confirm before fitting any
# pipeline on different data.
for name, model in models.items():
    # Chain the shared preprocessing with the candidate classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    # Fit on the training split and score on the held-out split
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f'{name} Accuracy: {accuracy:.4f}')

    # Strict '>' keeps the earliest model on ties
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline
        best_model_name = name

# Persist the winning pipeline (preprocessing + classifier) as one artifact.
# Compare against None explicitly instead of relying on the truthiness of a
# Pipeline object.
if best_model is not None:
    model_filename = f'best_model_{best_model_name.replace(" ", "_")}.joblib'
    joblib.dump(best_model, model_filename)
    print(f'Best Model saved as: {model_filename}')
else:
    print('No model was trained.')


Logistic Regression Accuracy: 0.8121
Random Forest Accuracy: 0.7913
Gradient Boosting Accuracy: 0.8041
SVC Accuracy: 0.8064
KNN Accuracy: 0.7657
Best Model saved as: best_model_Logistic_Regression.joblib
