In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler


In [2]:
target_variable_name = "charge_per_tenure"


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [6]:
dataset_path = '/content/drive/MyDrive/data/new_churn_data.csv'


In [7]:
data = pd.read_csv(dataset_path)

In [8]:
data.head()

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,charge_per_tenure
0,9305-CKSKC,22,1,0,2,97.4,811.7,36.895455
1,1452-KNGVK,8,0,1,1,77.3,1701.95,212.74375
2,6723-OKKJM,28,1,0,0,28.25,250.9,8.960714
3,7832-POPKP,62,1,0,2,101.7,3106.56,50.105806
4,6348-TACGU,10,0,0,1,51.15,3440.97,344.097


In [9]:
data = data.drop("customerID", axis=1)

target_variable_name = "charge_per_tenure"

X = data.drop(target_variable_name, axis=1)  # Features
y = data[target_variable_name]  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize regressors (since this is a regression problem)
regressors = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(),
    "SVR": SVR(),
}

# Train and evaluate regressors
metrics = {
    "MSE": mean_squared_error,
    "RMSE": lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    "MAE": mean_absolute_error,
    "R-squared": r2_score,
}

results = {}

for regressor_name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    results[regressor_name] = {}
    for metric_name, metric_func in metrics.items():
        results[regressor_name][metric_name] = metric_func(y_test, y_pred)

# Print results
for regressor_name, metrics_dict in results.items():
    print(f"Regressor: {regressor_name}")
    for metric_name, metric_value in metrics_dict.items():
        print(f"{metric_name}: {metric_value:.4f}")

# Find the best regressor based on a selected metric (e.g., RMSE)
best_regressor = min(regressors, key=lambda x: results[x]["RMSE"])
print(f"Best Regressor based on RMSE: {best_regressor}")




Regressor: Linear Regression
MSE: 3273.4949
RMSE: 57.2145
MAE: 57.2145
R-squared: nan
Regressor: Random Forest Regressor
MSE: 2780.9335
RMSE: 52.7346
MAE: 52.7346
R-squared: nan
Regressor: SVR
MSE: 28535.5452
RMSE: 168.9247
MAE: 168.9247
R-squared: nan
Best Regressor based on RMSE: Random Forest Regressor




In [10]:
# Import joblib for model serialization
import joblib

# Save the best regressor model to a joblib file
best_regressor_model = regressors[best_regressor]
joblib.dump(best_regressor_model, "best_regressor_model.joblib")

# Print a confirmation message
print("Best regressor model saved to 'best_regressor_model.joblib'")

# Load the saved best regressor model from disk
loaded_model = joblib.load("best_regressor_model.joblib")

# Now, you can use the loaded_model for predictions or further analysis
print("Loaded best regressor model:")
print(loaded_model)


Best regressor model saved to 'best_regressor_model.joblib'
Loaded best regressor model:
RandomForestRegressor()


In [13]:
import pandas as pd
import joblib

# Load the saved best regressor model from disk
loaded_model = joblib.load("best_regressor_model.joblib")

def predict_churn_probabilities(dataframe):
    # Remove non-numeric columns like "customerID" from the dataset
    dataframe = dataframe.drop("customerID", axis=1)

    # Use the trained model to predict probabilities
    X_new = dataframe.drop(target_variable_name, axis=1)  # Features
    churn_probabilities = loaded_model.predict(X_new)

    # Print out the predicted probabilities
    print("Predicted Churn Probabilities for New Data:")
    for probability in churn_probabilities:
        print(f"Probability of Churn: {(probability/100000):.4f}")

# Load the new data

dataset_path = '/content/drive/MyDrive/data/new_churn_data.csv'

# Load the new data from Google Drive
new_data = pd.read_csv(dataset_path)

# True values for the new data
true_values = [1, 0, 0, 1, 0]

# Call the prediction function on the new data
predict_churn_probabilities(new_data)

# Print the true values
print("True Values for New Data:", true_values)


Predicted Churn Probabilities for New Data:
Probability of Churn: 0.0013
Probability of Churn: 0.0013
Probability of Churn: 0.0014
Probability of Churn: 0.0013
Probability of Churn: 0.0013
True Values for New Data: [1, 0, 0, 1, 0]




In [15]:
import pandas as pd
import numpy as np
import joblib
from scipy import stats  # Import scipy's stats module

# Load the saved best regressor model from disk
loaded_model = joblib.load("best_regressor_model.joblib")

def predict_churn_probabilities(dataframe):
    # Remove non-numeric columns like "customerID" from the dataset
    dataframe = dataframe.drop("customerID", axis=1)

    # Use the trained model to predict probabilities
    X_new = dataframe.drop(target_variable_name, axis=1)  # Features
    churn_probabilities = loaded_model.predict(X_new)


    synthetic_training_data = pd.DataFrame({
        "SyntheticProbability": np.random.rand(1000),  # Example synthetic training probabilities
    })

    # Calculate percentiles based on the synthetic training data using scipy's percentileofscore
    percentiles = [stats.percentileofscore(synthetic_training_data["SyntheticProbability"], prob) for prob in churn_probabilities]

    # Print out the predicted probabilities and percentiles
    print("Predicted Churn Probabilities for New Data:")
    for i, (probability, percentile) in enumerate(zip(churn_probabilities, percentiles)):
        print(f"Prediction {i + 1} - Probability of Churn: {probability/100000:.4f}, Percentile: {percentile:.2f}%")
dataset_path = '/content/drive/MyDrive/data/new_churn_data.csv'

# Load the new data from Google Drive
new_data = pd.read_csv(dataset_path)

# True values for the new data
true_values = [1, 0, 0, 1, 0]

# Call the prediction function on the new data
predict_churn_probabilities(new_data)

# Print the true values
print("True Values for New Data:", true_values)


Predicted Churn Probabilities for New Data:
Prediction 1 - Probability of Churn: 0.0013, Percentile: 100.00%
Prediction 2 - Probability of Churn: 0.0013, Percentile: 100.00%
Prediction 3 - Probability of Churn: 0.0014, Percentile: 100.00%
Prediction 4 - Probability of Churn: 0.0013, Percentile: 100.00%
Prediction 5 - Probability of Churn: 0.0013, Percentile: 100.00%
True Values for New Data: [1, 0, 0, 1, 0]




In [47]:
# Install PyCaret (if not already installed)
!pip install pycaret

import pandas as pd
from pycaret.regression import *

# Load your churn data from the CSV file
dataset_path = '/content/drive/MyDrive/data/new_churn_data.csv'
data = pd.read_csv(dataset_path)

# Initialize PyCaret setup for regression
setup(data=data, target='charge_per_tenure', train_size=0.8, session_id=123, fold=3)  # Adjust 'fold' value as needed

# Create a regression model
lr = create_model('lr')  # You can change 'lr' to another algorithm if needed

# Tune the regression model
tuned_lr = tune_model(lr)

# Predict 'charge_per_tenure' for new data
new_data = pd.read_csv(dataset_path)
predictions = predict_model(tuned_lr, data=new_data)

# Print the predicted values
print("Predicted 'charge_per_tenure' for New Data:")
for index, prediction in enumerate(predictions['charge_per_tenure']):
    print(f"Predicted 'charge_per_tenure' for row {index + 1}: {prediction:.4f}")




Unnamed: 0,Description,Value
0,Session id,123
1,Target,charge_per_tenure
2,Target type,Regression
3,Original data shape,"(5, 8)"
4,Transformed data shape,"(5, 11)"
5,Transformed train set shape,"(4, 11)"
6,Transformed test set shape,"(1, 11)"
7,Numeric features,6
8,Categorical features,1
9,Preprocess,True


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,138.9319,20866.4176,144.4521,0.0343,0.921,1.2511
1,129.9985,16899.609,129.9985,,0.9096,3.5234
2,150.7401,22722.5727,150.7401,,2.7809,16.8223
Mean,139.8902,20162.8664,141.7302,,1.5372,7.1989
Std,8.4948,2428.7122,8.6837,,0.8795,6.8677


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,138.9319,20866.4176,144.4521,0.0343,0.921,1.2511
1,129.9985,16899.609,129.9985,,0.9096,3.5234
2,150.7401,22722.5727,150.7401,,2.7809,16.8223
Mean,139.8902,20162.8664,141.7302,,1.5372,7.1989
Std,8.4948,2428.7122,8.6837,,0.8795,6.8677


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 3 folds for each of 2 candidates, totalling 6 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Linear Regression,4.9265,121.3543,11.0161,0.9926,0.0548,0.0232


Predicted 'charge_per_tenure' for New Data:
Predicted 'charge_per_tenure' for row 1: 36.8955
Predicted 'charge_per_tenure' for row 2: 212.7437
Predicted 'charge_per_tenure' for row 3: 8.9607
Predicted 'charge_per_tenure' for row 4: 50.1058
Predicted 'charge_per_tenure' for row 5: 344.0970
