%md
#Handle Pre-Requisites

##Make sure that kaggle and kagglehub are installed

In [0]:
%pip install kaggle

In [0]:
%pip install kagglehub

##Import the required libraries and run the notebook with our constants

In [0]:
import pandas as pd

from databricks import feature_store
from databricks.feature_store import FeatureStoreClient
from sklearn.model_selection import train_test_split

In [0]:
%run "../common/Constants"

In [0]:
# Dataset source: CSV file name inside the download, and the KaggleHub dataset handle
TELCO_LOCAL_FILE_NAME = "Telco_customer_churn.csv"
KAGGLE_FILE_LOCATION = "aadityabansalcodes/telecommunications-industry-customer-churn-dataset/versions/4"

# Unqualified feature table name
FEATURE_TABLE_NAME = "customer_churn"

#Use KaggleHub to download the Kaggle Dataset
[Link to the dataset](https://www.kaggle.com/datasets/aadityabansalcodes/telecommunications-industry-customer-churn-dataset)

##Download the dataset to a local path


In [0]:
# KaggleHub client used to fetch the dataset files.
import kagglehub  

# Download the dataset identified by KAGGLE_FILE_LOCATION and keep the value that
# 'dataset_download' returns; it is printed below as the path to the dataset files.
# NOTE(review): the identifier ends in "versions/4", so this presumably pins
# version 4 of the dataset rather than fetching the latest one - confirm against
# the KaggleHub documentation.
local_path = kagglehub.dataset_download(KAGGLE_FILE_LOCATION)

# Print the local file path where the dataset files have been downloaded.
print("Path to dataset files:", local_path)

##Copy the local file to our DBFS datasets location

In [0]:
import os
import shutil

# Build the path of the downloaded CSV in its own variable. Re-assigning
# `local_path` from itself would append the file name a second time whenever
# this cell is re-run.
local_file_path = os.path.join(local_path, TELCO_LOCAL_FILE_NAME)

# Print the local file path to verify correctness
print(f"The file has been downloaded to local path: {local_file_path}")

# Destination of the copy.
# NOTE(review): DBFS_DATASET_DIRECTORY is not defined in this notebook; it is
# presumably provided by the Constants notebook and expected to be a local
# ('/dbfs/...') path, since shutil works on local file system paths - confirm.
dbfs_path = f"{DBFS_DATASET_DIRECTORY}/{TELCO_LOCAL_FILE_NAME}"
print(f"The file will be copied to the DBFS location: {dbfs_path}")

# shutil.copy() does not create missing parent directories, so make sure the
# destination directory exists before copying.
os.makedirs(os.path.dirname(dbfs_path), exist_ok=True)

# Copy (not move) the CSV to the destination; the source file is left in place.
shutil.copy(local_file_path, dbfs_path)

In [0]:
%fs
ls dbfs:/FileStore/datasets/

#Load the dataset

In [0]:
# The copy step addressed the file through a local '/dbfs' path, but Spark needs
# the path without that prefix, so strip it when present.
DBFS_MOUNT_PREFIX = "/dbfs"
if dbfs_path.startswith(DBFS_MOUNT_PREFIX):
    dbfs_path = dbfs_path[len(DBFS_MOUNT_PREFIX):]

# Print the adjusted DBFS path to verify it has been modified correctly
print(f"Adjusted DBFS path: {dbfs_path}")

# Read the CSV with Spark: the first row holds the column names (header=True)
# and column types are inferred from the data (inferSchema=True).
df = spark.read.csv(dbfs_path, header=True, inferSchema=True)

# Show only the first 5 rows as a quick sanity check instead of rendering the
# whole DataFrame.
display(df.limit(5))

##Clean up the Column names (replace the spaces with underscores)

In [0]:
# Rename all columns in one projection: every space in a column name becomes an underscore.
df = df.toDF(*(name.replace(" ", "_") for name in df.columns))

##Do some basic pre-processing
Convert Total_Charges to numeric and handle errors

In [0]:
from pyspark.sql.functions import regexp_replace, col

# Remove every character that is not a digit or a dot, then cast the result to double.
total_charges_cleaned = regexp_replace(col("Total_Charges"), "[^0-9.]", "")
df = df.withColumn("Total_Charges", total_charges_cleaned.cast("double"))

In [0]:
# Create the feature-store schema from the same CATALOG_NAME / FEATURE_STORE_DB
# constants that the fully qualified feature table names are built from, so the
# schema that gets created is always the one the tables are written to.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{FEATURE_STORE_DB}")

#Start the Feature Store Work

##Select an initial set of features and write them to a Feature Table

In [0]:
from databricks.feature_store import FeatureStoreClient

# Feature Store Client
fs = FeatureStoreClient()

FEATURE_TABLE_NAME = "telco_churn_demographic_features"
# Define the fully qualified feature table name (Unity Catalog)
feature_table_name = f"{CATALOG_NAME}.{FEATURE_STORE_DB}.{FEATURE_TABLE_NAME}"
print(f"Feature table name: {feature_table_name}")

# Initial feature set: the customer key plus the demographic columns
demo_features = df.select(
    "CustomerID", "Gender", "Senior_Citizen", "Partner", "Dependents"
)

# create_table raises when the table already exists, which made this cell fail
# on every re-run. Only create the table the first time.
# NOTE(review): passing a catalog-qualified name to tableExists requires
# Spark 3.4+ - confirm the cluster runtime.
if not spark.catalog.tableExists(feature_table_name):
    fs.create_table(
        name=feature_table_name,
        primary_keys=["CustomerID"],
        schema=demo_features.schema,
        description="Demographic features of customers"
    )

# Write the demographic features using the fully qualified name
fs.write_table(
    name=feature_table_name,
    df=demo_features,
    mode="overwrite"
)

In [0]:
# Second feature table: the customer key plus the service-related columns
service_features = df.select(
    "CustomerID",
    "Phone_Service",
    "Multiple_Lines",
    "Internet_Service",
    "Online_Security",
    "Online_Backup",
    "Device_Protection",
    "Tech_Support",
    "Streaming_TV",
    "Streaming_Movies"
)

FEATURE_TABLE_NAME = "telco_churn_service_feature"
# Define the fully qualified feature table name (Unity Catalog)
feature_table_name = f"{CATALOG_NAME}.{FEATURE_STORE_DB}.{FEATURE_TABLE_NAME}"

# create_table raises when the table already exists, which made this cell fail
# on every re-run. Only create the table the first time.
# NOTE(review): passing a catalog-qualified name to tableExists requires
# Spark 3.4+ - confirm the cluster runtime.
if not spark.catalog.tableExists(feature_table_name):
    fs.create_table(
        name=feature_table_name,
        primary_keys=["CustomerID"],
        schema=service_features.schema,
        description="Service features of customers"
    )

# Write the service features using the fully qualified name
fs.write_table(
    name=feature_table_name,
    df=service_features,
    mode="overwrite"
)


In [0]:
# Target of the merge: the demographic feature table (fully qualified, Unity Catalog)
FEATURE_TABLE_NAME = "telco_churn_demographic_features"
feature_table_name = f"{CATALOG_NAME}.{FEATURE_STORE_DB}.{FEATURE_TABLE_NAME}"

# Tenure and charge columns, together with the customer key, to merge into that table
billing_columns = ["CustomerID", "Tenure_Months", "Monthly_Charges", "Total_Charges"]
new_features = df.select(*billing_columns)

fs.write_table(name=feature_table_name, df=new_features, mode="merge")

In [0]:
# Target of the merge: the demographic feature table (fully qualified, Unity Catalog)
FEATURE_TABLE_NAME = "telco_churn_demographic_features"
feature_table_name = f"{CATALOG_NAME}.{FEATURE_STORE_DB}.{FEATURE_TABLE_NAME}"

# Two hand-written customer rows, using the demographic column layout
demographic_columns = ["CustomerID", "Gender", "Senior_Citizen", "Partner", "Dependents"]
custom_rows = [
    ("12345", "Female", "0", "Yes", "No"),
    ("67890", "Male", "1", "No", "Yes"),
]
updated_rows = spark.createDataFrame(custom_rows, demographic_columns)

# Merge the custom rows into the feature table
fs.write_table(name=feature_table_name, df=updated_rows, mode="merge")

# Create a training set from the feature store

In [0]:
from databricks.feature_store import FeatureLookup

FEATURE_TABLE_NAME = "telco_churn_demographic_features"
# Define the fully qualified feature table name (Unity Catalog)
feature_table_name_demographic = f"{CATALOG_NAME}.{FEATURE_STORE_DB}.{FEATURE_TABLE_NAME}"

FEATURE_TABLE_NAME = "telco_churn_service_feature"
# Define the fully qualified feature table name (Unity Catalog)
feature_table_name_churn_service = f"{CATALOG_NAME}.{FEATURE_STORE_DB}.{FEATURE_TABLE_NAME}"

# Columns that the feature lookups will supply; they must not also be present
# in the input DataFrame.
overlapping_columns = [
    "Gender", "Senior_Citizen", "Partner", "Dependents",
    "Tenure_Months", "Monthly_Charges", "Total_Charges",
    "Phone_Service", "Multiple_Lines", "Internet_Service",
    "Online_Security", "Online_Backup", "Device_Protection",
    "Tech_Support", "Streaming_TV", "Streaming_Movies"
]

# Columns that are derived from the churn outcome itself. Every column left in
# the input DataFrame becomes a model feature, so keeping these would leak the
# "Churn_Value" label into training.
# NOTE(review): these names follow the IBM Telco churn CSV layout - confirm they
# match this dataset. DataFrame.drop ignores names that are not present, so the
# extra entries are harmless if a column is missing.
label_leakage_columns = ["Churn_Label", "Churn_Score", "Churn_Reason"]

df_cleaned = df.drop(*overlapping_columns, *label_leakage_columns)

# Create a training dataset: the remaining input columns plus the features
# looked up by CustomerID from both feature tables. CustomerID is only the join
# key and is excluded from the result.
training_set = fs.create_training_set(
    df_cleaned,
    feature_lookups=[
        FeatureLookup(
            table_name=feature_table_name_demographic,
            lookup_key="CustomerID"
        ),
        FeatureLookup(
            table_name=feature_table_name_churn_service,
            lookup_key="CustomerID"
        )
    ],
    label="Churn_Value",
    exclude_columns=["CustomerID"]
)

In [0]:
# Materialize the training set as a pandas DataFrame
training_df = training_set.load_df().toPandas()

# Separate the label from the features, then hold out 20% of the rows for testing
label_column = "Churn_Value"
y = training_df[label_column]
X = training_df.drop(columns=[label_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# One-hot encode the training features; drop_first removes one baseline
# category per categorical column.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test features WITHOUT drop_first and then project them onto the
# training columns. Applying drop_first independently to the test split can drop
# a different baseline category whenever a category is absent from that split,
# which silently mis-encodes those rows. Reindexing keeps exactly the training
# columns, fills columns unseen in the test split with 0 and discards categories
# the model was never trained on.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Ensure no missing values remain in either split
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Train the model (fixed seed for reproducibility)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from ydata_profiling import ProfileReport 

# Overall accuracy on the held-out split
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.2f}")

# Confusion matrix rendered as an annotated heatmap, labelled with the model's classes
conf_matrix = confusion_matrix(y_test, y_pred)
class_labels = clf.classes_
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_ylabel("True Label")
ax.set_xlabel("Predicted Label")
plt.show()

# Pair each training column with the importance the fitted model assigned to it,
# ordered from most to least important
feature_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': clf.feature_importances_}
)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ten most important features
top_features = feature_importances.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=top_features, ax=ax)
ax.set_title("Top 10 Feature Importances")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

# Profile the (encoded) training features and save the report as an HTML file
print("Generating Profiling Report for Training Data...")
report_file = "training_data_report.html"
profile = ProfileReport(X_train, title="Training Data Report", explorative=True)
profile.to_file(report_file)  # Writes the report to disk; it is not opened automatically

# SHAP Analysis
import shap

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Select the SHAP values of the class at index 1.
# Older SHAP releases return a list with one (n_samples, n_features) array per
# class, while newer releases return a single array shaped
# (n_samples, n_features, n_classes). Indexing the newer result with [1] would
# pick the second SAMPLE instead of the second class, so handle both layouts.
# NOTE(review): index 1 is presumably the "churned" class (Churn_Value == 1) -
# confirm against clf.classes_.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
else:
    class1_shap_values = shap_values[:, :, 1]

# Summary Plot
shap.summary_plot(class1_shap_values, X_test, plot_type="bar")

# Detailed force plot for the first prediction
shap.force_plot(explainer.expected_value[1], class1_shap_values[0, :], X_test.iloc[0, :])