In [1]:
!pip install pycaret pandas
!pip install mlflow



In [2]:
from google.colab import files

# Upload the two CSVs that later cells read by name from the working directory:
# cleaned_churn_data2.csv (used to set up and train) and new_churn_data.csv (scored later).
uploaded = files.upload()  # Select cleaned_churn_data2.csv and new_churn_data.csv


In [3]:
import pandas as pd
from pycaret.classification import *

In [4]:
# Read both uploaded CSVs in one pass: first the cleaned churn data, then the new records
df, new_data = (
    pd.read_csv(csv_name)
    for csv_name in ('cleaned_churn_data2.csv', 'new_churn_data.csv')
)

In [11]:
# Columns shared by both datasets; 'Churn' is the prediction target
common_columns = [
    'tenure',
    'PhoneService',
    'Contract',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
    'Churn',
]
# Keep every row, but only the shared columns, in the order listed above
df_common = df.loc[:, common_columns]

In [12]:
# Initialise the PyCaret classification experiment on the reduced (common-column) frame.
# A fixed session_id is passed for repeatable runs; experiment logging is switched off.
clf = setup(
    data=df_common,
    target='Churn',
    session_id=123,
    log_experiment=False,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(7043, 7)"
4,Transformed data shape,"(7043, 7)"
5,Transformed train set shape,"(4930, 7)"
6,Transformed test set shape,"(2113, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


In [13]:
# Train and score the candidate classifiers, ranking the results by AUC;
# the top-ranked model is kept as best_model for saving and scoring below.
best_model = compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7933,0.839,0.5054,0.6401,0.5636,0.431,0.4368,0.546
ada,Ada Boost Classifier,0.7895,0.8389,0.5001,0.6316,0.5568,0.4215,0.4272,0.215
lr,Logistic Regression,0.7939,0.8336,0.5085,0.6433,0.5665,0.434,0.4401,0.101
lightgbm,Light Gradient Boosting Machine,0.7852,0.8279,0.5192,0.6127,0.5604,0.42,0.4234,0.464
qda,Quadratic Discriminant Analysis,0.7454,0.8229,0.7386,0.5142,0.606,0.4268,0.4424,0.031
ridge,Ridge Classifier,0.7901,0.8214,0.4389,0.6581,0.5248,0.3973,0.4116,0.043
lda,Linear Discriminant Analysis,0.7886,0.8214,0.4947,0.6316,0.5535,0.418,0.4241,0.055
xgboost,Extreme Gradient Boosting,0.7787,0.8154,0.5146,0.5969,0.5517,0.4061,0.4087,0.112
nb,Naive Bayes,0.7146,0.8075,0.76,0.4767,0.5855,0.385,0.4098,0.032
rf,Random Forest Classifier,0.7688,0.7963,0.4771,0.5784,0.5217,0.3715,0.3751,0.84


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [14]:
# Persist the winning transformation pipeline and model under the name 'best_model'
save_model(model=best_model, model_name='best_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['tenure', 'PhoneService',
                                              'Contract', 'PaymentMethod',
                                              'MonthlyCharges', 'TotalCharges'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',...
                                             criterion='friedman_mse', init=None,
                      

In [15]:
# Reload the persisted 'best_model' artifact so that scoring uses what was saved to disk
model = load_model(model_name='best_model')

Transformation Pipeline and Model Successfully Loaded


In [16]:
# Build the scoring frame from the same column list used for training, minus the
# 'Churn' target, so the training and scoring feature lists cannot drift apart.
feature_columns = [col for col in common_columns if col != 'Churn']
new_data_common = new_data[feature_columns]

In [28]:
# Score a feature frame with the loaded model and attach customer IDs
def predict_churn(df, original_data):
    """Predict churn for ``df`` and label each row with its customer ID.

    Args:
        df: Feature columns to score; passed to ``predict_model`` as ``data``.
        original_data: Frame holding a ``customerID`` column, which is copied
            onto the result (pandas aligns it by index label).

    Returns:
        The frame produced by ``predict_model`` with an added ``customerID``
        column.

    Uses the notebook-level ``model`` and prints the raw output columns and
    the first rows of the final frame for inspection.
    """
    scored = predict_model(model, data=df)
    # Show which columns predict_model produced, before customerID is added
    print(scored.columns)
    # assign() gives back a new frame with the IDs attached as the last column
    labelled = scored.assign(customerID=original_data['customerID'])
    # Preview the result so the output column names can be checked
    print(labelled.head())
    return labelled


In [29]:
# Use the function to predict churn on the new data
predictions = predict_churn(new_data_common, new_data)

# Preview the first rows with the notebook's rich display instead of printing
# the whole DataFrame as plain text
predictions.head()

Index(['tenure', 'PhoneService', 'Contract', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'prediction_label', 'prediction_score'],
      dtype='object')
   tenure  PhoneService  Contract  PaymentMethod  MonthlyCharges  \
0      22             1         0              2       97.400002   
1       8             0         1              1       77.300003   
2      28             1         0              0       28.250000   
3      62             1         0              2      101.699997   
4      10             0         0              1       51.150002   

   TotalCharges  prediction_label  prediction_score  customerID  
0    811.700012                 0            0.7290  9305-CKSKC  
1   1701.949951                 0            0.7871  1452-KNGVK  
2    250.899994                 0            0.8573  6723-OKKJM  
3   3106.560059                 0            0.7433  7832-POPKP  
4   3440.969971                 0            0.5780  6348-TACGU  
   tenure  PhoneService  Cont

In [23]:
# Optional export: write the scored rows to predictions.csv, leaving the index out of the file
predictions.to_csv(path_or_buf='predictions.csv', index=False)

In [24]:
# Read the exported file back to confirm what was written. The result gets its own
# name: binding it to `pd` would replace the pandas module alias with a DataFrame,
# so every later `pd.` call (including re-running this cell) would fail.
saved_predictions = pd.read_csv('predictions.csv')
saved_predictions.head()

Unnamed: 0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,prediction_label,prediction_score,customerID
0,22,1,0,2,97.4,811.7,0,0.729,9305-CKSKC
1,8,0,1,1,77.3,1701.95,0,0.7871,1452-KNGVK
2,28,1,0,0,28.25,250.9,0,0.8573,6723-OKKJM
3,62,1,0,2,101.7,3106.56,0,0.7433,7832-POPKP
4,10,0,0,1,51.15,3440.97,0,0.578,6348-TACGU
