<img src="https://cdn.oreillystatic.com/images/sitewide-headers/oreilly_logo_mark_red.svg"/>&nbsp;&nbsp;<font size="16"><b>AI, ML and GenAI in the Lakehouse</b></font>
<img style="float: left; margin: 0px 15px 15px 0px; width:30%; height: auto;" src="https://i.imgur.com/pQvJTVf.jpeg"   />   


 
  
   Name:          chapter 05-7-Load Telco Customer Churn Dataset
 
   Author:    Bennie Haelen
   Date:      12-24-2024

   Purpose:   This notebook reads the Telco customer churn dataset from Kaggle and transforms the data into features
                 
      An outline of the different sections in this notebook:
        1 - Read the Delta table with the housing prices
        2 - Start the modeling phase
            2-1 - Perform a train/test split of the data
            2-2 - Investigate the Shape of the datasets
            2-3 - Convert our training Pandas Dataframe to Spark
            2-4 - Start the AutoML Regression
        3 - Study the results of the regression and make predictions
            3-1 - Retrieve the URI of the best model
            3-2 - Create the Test Features
            3-3 - Load the best model from the MLflow function
            3-4 - Use the model to make prediction
            3-5 - Combine predictions and actual
            3-6 - Create a plot comparing the actuals with the predictions
            3-7 - Create a joint plot of actual vs predicted

%md
#Handle Pre-Requisites

##Make sure that kaggle and kagglehub are installed

In [0]:
%pip install kaggle

In [0]:
%pip install kagglehub

##Make sure to run the notebook with our constants

In [0]:
import pandas as pd

from databricks import feature_store
from databricks.feature_store import FeatureStoreClient
from sklearn.model_selection import train_test_split

In [0]:
%run "../common/Constants"

In [0]:
# Name of the feature table
FEATURE_TABLE_NAME = "customer_churn"

# Kaggle dataset handle (pinned to version 4 of the dataset)
KAGGLE_FILE_LOCATION = "aadityabansalcodes/telecommunications-industry-customer-churn-dataset/versions/4"

# CSV file that is looked up inside the downloaded dataset directory
TELCO_LOCAL_FILE_NAME = "Telco_customer_churn.csv"

#Use KaggleHub to download the Kaggle Dataset
[Link to the dataset](https://www.kaggle.com/datasets/aadityabansalcodes/telecommunications-industry-customer-churn-dataset)

##Download the dataset to a local path

In [0]:
# kagglehub provides programmatic access to Kaggle datasets.
import kagglehub

# Fetch the Telco Customer Churn dataset published by 'aadityabansalcodes'.
# KAGGLE_FILE_LOCATION is the dataset handle; the value returned by
# 'dataset_download' is kept as the local location of the downloaded files.
local_path = kagglehub.dataset_download(KAGGLE_FILE_LOCATION)

# Show where the dataset files ended up.
print("Path to dataset files:", local_path)

%md
##Copy the local file to our DBFS datasets location

In [0]:
import os
import shutil

# Resolve the CSV file inside the downloaded dataset directory.
# 'local_path' is overwritten with the file path, so without this guard a
# second run of the cell would append the file name twice and the copy would fail.
if not local_path.endswith(f"/{TELCO_LOCAL_FILE_NAME}"):
    local_path = f"{local_path}/{TELCO_LOCAL_FILE_NAME}"

# Print the local path to verify correctness
print(f"The file has been downloaded to local path: {local_path}")

# Destination of the copy: the same file name under DBFS_DATASET_DIRECTORY
dbfs_path = f"{DBFS_DATASET_DIRECTORY}/{TELCO_LOCAL_FILE_NAME}"
print(f"The file will be copied to the dfbs location: {dbfs_path}")

# shutil.copy() raises FileNotFoundError when the target directory is missing,
# so make sure it exists first (no-op if it is already there).
os.makedirs(os.path.dirname(dbfs_path), exist_ok=True)

# Copy (not move) the file to the DBFS path; the local download stays in place.
shutil.copy(local_path, dbfs_path)

In [0]:
%fs
ls dbfs:/FileStore/datasets/

#Load the dataset

In [0]:
# The '/dbfs' prefix is the local-file-system view of DBFS; Spark expects the
# path without it, so drop the prefix when it is present.
dbfs_path = dbfs_path[len("/dbfs"):] if dbfs_path.startswith("/dbfs") else dbfs_path

# Show the path that will be handed to Spark
print(f"Adjusted DBFS path: {dbfs_path}")

# Load the CSV with Spark: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv(dbfs_path)

# Render the DataFrame in the notebook to verify successful loading
display(df)

In [0]:
# Normalize column names: every space becomes an underscore
# (e.g. "Tenure Months" -> "Tenure_Months").
df = df.toDF(*(name.replace(" ", "_") for name in df.columns))

In [0]:
%sql
-- Create the schema used for the feature tables; does nothing when it already exists.
-- NOTE(review): the catalog and schema are hard-coded here, while the Python cells
-- build table names from CATALOG_NAME / FEATURE_STORE_DB -- confirm they match.
CREATE SCHEMA IF NOT EXISTS book_ai_ml_lakehouse.feature_store_db;

In [0]:
# Fully qualified (Unity Catalog) feature table name: <catalog>.<schema>.<table>
feature_table_name = ".".join(
    str(part) for part in (CATALOG_NAME, FEATURE_STORE_DB, FEATURE_TABLE_NAME)
)

In [0]:
from databricks.feature_store import FeatureStoreClient
from pyspark.sql.functions import col

# Initialize Feature Store client
fs = FeatureStoreClient()

# Select the initial demographic features.
# Column names had their spaces replaced by underscores earlier in this
# notebook, so the raw "Senior Citizen" column is now "Senior_Citizen";
# selecting "SeniorCitizen" directly fails because no such column exists yet.
# The alias keeps "SeniorCitizen" as the feature name in the table.
initial_features = df.select(
    "CustomerID",
    "Gender",
    col("Senior_Citizen").alias("SeniorCitizen"),
    "Partner",
    "Dependents",
    "Tenure_Months",
    "Monthly_Charges",
    "Churn_Value",
)

# Write initial features to the Feature Store, keyed by CustomerID.
# NOTE(review): 'feature_table_name' (catalog.schema.table) is computed earlier
# in this notebook, but the table is registered under this unqualified literal
# instead -- confirm which name is intended.
fs.create_table(
    name="telco_churn_demographics",
    primary_keys="CustomerID",
    df=initial_features,
    description="Initial feature set for Telco Customer Churn demographics",
)


In [0]:
from pyspark.sql.functions import when, col, lit, mean

# Data Cleaning and Preprocessing
#
# Column names had their spaces replaced by underscores earlier in this
# notebook, so every reference below uses the underscore form
# ("Total_Charges", "Senior_Citizen", "Tenure_Months", "Monthly_Charges").

# Replace null Total_Charges values with 0.0
# NOTE(review): this only takes effect if Total_Charges was inferred as a
# numeric column -- confirm the inferred schema.
df = df.fillna({"Total_Charges": 0.0})

# Derive a "Yes"/"No" string flag from the senior-citizen indicator
# NOTE(review): assumes Senior_Citizen is encoded as 0/1; if the CSV stores
# "Yes"/"No" strings instead, every row ends up as "No" -- confirm.
df = df.withColumn("SeniorCitizen", when(col("Senior_Citizen") == 1, "Yes").otherwise("No"))

# Create new features for demonstration
# The +1 in the denominator keeps the division defined when tenure is 0 months.
df = df.withColumn("TotalChargesPerMonth", col("Total_Charges") / (col("Tenure_Months") + lit(1)))
# 1 for customers on a "Two year" contract, 0 for everyone else
df = df.withColumn("IsLongTermContract", when(col("Contract") == "Two year", lit(1)).otherwise(lit(0)))

# Create a DataFrame for features
feature_df = df.select(
    "customerID",
    "Tenure_Months",
    "Monthly_Charges",
    "TotalChargesPerMonth",
    "IsLongTermContract",
    "SeniorCitizen",
    "CLTV",
)
