In [4]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Load the dataset
dataset = pd.read_csv("cleaned_df_luxottica_churn_updated_0108.csv", index_col=None)

# Create a copy of the dataset for transformation
dataset_transformed = pd.get_dummies(dataset, drop_first=True)

# Separate features and target variable
X = dataset_transformed.drop('Churn_Yes', axis=1)
y = dataset_transformed['Churn_Yes']

# Apply Min-Max scaling to ensure non-negative values
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)

# Initialize the SelectKBest with chi2
select_k_best = SelectKBest(score_func=chi2, k=10)
X_selected = select_k_best.fit_transform(X_scaled, y)

# Get the selected feature indices and names
selected_features_indices = select_k_best.get_support(indices=True)
selected_features = X.columns[selected_features_indices]
print("Selected Feature Indices:\n", selected_features_indices)
print("Selected Features:\n", selected_features)

# Create the final feature and target datasets with selected features
X_final = X[selected_features]
y_final = y

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=0)

# Apply SMOTE to the training data
smote = SMOTE(random_state=0)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize the features using MinMaxScaler
scaler = MinMaxScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Dictionary to store models
models = {
    "Random Forest": RandomForestClassifier(random_state=0),
    "Logistic Regression": LogisticRegression(random_state=0),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(random_state=0),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"\n{model_name}")
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test)

    # Evaluate the model
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # Display the results
    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", class_report)


Selected Feature Indices:
 [   0    3    5    6   10   11   13 3299 3300 3301]
Selected Features:
 Index(['Age', 'Customer_Support_Interactions', 'Customer_Satisfaction',
       'Purchase_Frequency', 'Lifetime_Value', 'Average_Order_Value',
       'Number_of_Product_Categories_Purchased',
       'Loyalty_Program_Participation_Inactive',
       'Engagement_with_Promotions_Low', 'Engagement_with_Promotions_Medium'],
      dtype='object')

Random Forest
Confusion Matrix:
 [[17143    44]
 [    0  6589]]
Accuracy: 0.9981493943472409
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17187
           1       0.99      1.00      1.00      6589

    accuracy                           1.00     23776
   macro avg       1.00      1.00      1.00     23776
weighted avg       1.00      1.00      1.00     23776


Logistic Regression
Confusion Matrix:
 [[17062   125]
 [    0  6589]]
Accuracy: 0.9947425975773889
Classification 

In [5]:
from sklearn.model_selection import cross_val_score

# Evaluate model using cross-validation
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_final, y_final, cv=5)
    print(f"{model_name} - Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")


Random Forest - Cross-Validation Accuracy: 0.9981 (+/- 0.0003)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression - Cross-Validation Accuracy: 0.9682 (+/- 0.0013)
Decision Tree - Cross-Validation Accuracy: 0.9965 (+/- 0.0003)
K-Nearest Neighbors - Cross-Validation Accuracy: 0.7520 (+/- 0.0007)
Support Vector Machine - Cross-Validation Accuracy: 0.7361 (+/- 0.0010)
Gradient Boosting - Cross-Validation Accuracy: 0.9980 (+/- 0.0003)
Naive Bayes - Cross-Validation Accuracy: 0.9836 (+/- 0.0004)


In [6]:
# Load the dataset
dataset = pd.read_csv("cleaned_df_luxottica_churn_updated_0108.csv", index_col=None)

# Create a copy of the dataset for transformation
dataset_transformed = pd.get_dummies(dataset, drop_first=True)

# Separate features and target variable
X = dataset_transformed.drop('Churn_Yes', axis=1)
y = dataset_transformed['Churn_Yes']

# Apply Min-Max scaling to ensure non-negative values
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X)

# Initialize the SelectKBest with chi2
select_k_best = SelectKBest(score_func=chi2, k=10)
X_selected = select_k_best.fit_transform(X_scaled, y)

# Get the selected feature indices and names
selected_features_indices = select_k_best.get_support(indices=True)
selected_features = X.columns[selected_features_indices]
print("Selected Feature Indices:\n", selected_features_indices)
print("Selected Features:\n", selected_features)

# Create the final feature and target datasets with selected features
X_final = X[selected_features]
y_final = y

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=0)

# Apply SMOTE to the training data
smote = SMOTE(random_state=0)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize the features using MinMaxScaler
scaler = MinMaxScaler()
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test = scaler.transform(X_test)

# Dictionary to store models
models = {
    "Random Forest": RandomForestClassifier(random_state=0),
    "Logistic Regression": LogisticRegression(random_state=0),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(random_state=0),
    "Naive Bayes": GaussianNB()
}

# Train, evaluate each model, and compare training and testing accuracy
for model_name, model in models.items():
    print(f"\n{model_name}")

    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on training data
    y_train_pred = model.predict(X_train_resampled)
    # Predict on testing data
    y_test_pred = model.predict(X_test)

    # Calculate accuracies
    train_accuracy = accuracy_score(y_train_resampled, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Print accuracies
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # Evaluate the model
    conf_matrix = confusion_matrix(y_test, y_test_pred)
    class_report = classification_report(y_test, y_test_pred)

    # Display the results
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", class_report)

Selected Feature Indices:
 [   0    3    5    6   10   11   13 3299 3300 3301]
Selected Features:
 Index(['Age', 'Customer_Support_Interactions', 'Customer_Satisfaction',
       'Purchase_Frequency', 'Lifetime_Value', 'Average_Order_Value',
       'Number_of_Product_Categories_Purchased',
       'Loyalty_Program_Participation_Inactive',
       'Engagement_with_Promotions_Low', 'Engagement_with_Promotions_Medium'],
      dtype='object')

Random Forest
Training Accuracy: 1.0000
Testing Accuracy: 0.9981
Confusion Matrix:
 [[17143    44]
 [    0  6589]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17187
           1       0.99      1.00      1.00      6589

    accuracy                           1.00     23776
   macro avg       1.00      1.00      1.00     23776
weighted avg       1.00      1.00      1.00     23776


Logistic Regression
Training Accuracy: 0.9965
Testing Accuracy: 0.9947
Confusion Matrix:
 [[