In [0]:
# Load the raw customer-churn CSV from the volume; header=True takes column names from the first row.
# No schema or inferSchema is supplied here, so column types are whatever the CSV reader defaults to.
churn_csv_path = "/Volumes/workspace/default/customer_churn/customer_churn.csv"
data = spark.read.option("header", True).csv(churn_csv_path)
display(data)

#### Explore Data with Summary Stats

1. Basic Summary Stats

In [0]:
# Display summary statistics for every column using Spark's built-in DataFrame.summary()
display(data.summary())


2. dbutils approach

In [0]:
# Print the built-in help for the Databricks utilities object.
dbutils.help()

In [0]:
# Alternative to data.summary(): let the Databricks data utility profile the DataFrame.
dbutils.data.summarize(data)

#### Display

In [0]:
# List the column names before adding any derived columns.
data.columns

In [0]:
from pyspark.sql.functions import ceil, col

# Round tenure up to the next multiple of 5 so each customer lands in a 5-month bucket
# labelled by the bucket's upper bound (e.g. 1-5 -> 5, 6-10 -> 10).
tenure_in_fives = col("Tenure Months") / 5
data = data.withColumn("tenure_group", ceil(tenure_in_fives) * 5)

In [0]:
# Confirm that the derived "tenure_group" column is now part of the DataFrame.
data.columns

In [0]:
# Show the raw tenure next to its bucket to sanity-check the grouping.
display(data.select("Tenure Months", "tenure_group"))

In [0]:
# Render the full DataFrame (including tenure_group) in the Databricks table/visualization widget.
display(data)

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# Number of customers per payment method, most common first.
payment_method_counts = data.groupBy("Payment Method").count()
display(payment_method_counts.orderBy("count", ascending=False))

In [0]:
from pyspark.sql.functions import avg, col, expr

# Average total charges per payment method, highest first.
# try_cast returns NULL instead of failing for values that cannot be cast to double.
total_charges_as_double = expr("try_cast(`Total Charges` as double)")
avg_charges_by_method = (
    data.groupBy("Payment Method")
    .agg(avg(total_charges_as_double).alias("avg_total_charges"))
    .orderBy(col("avg_total_charges").desc())
)
display(avg_charges_by_method)

#### Pandas

In [0]:
# Convert the Spark DataFrame to a pandas DataFrame so it can be used with seaborn/matplotlib below.
data_pd=data.toPandas()
display(data_pd)

#### Correlation Heatmap

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [0]:
num_columns = ["tenure_group", "Total Charges", "Monthly Charges"]

# Coerce the selected columns to numeric first (unparseable values become NaN),
# then compute their pairwise correlation matrix.
numeric_view = data_pd[num_columns].apply(pd.to_numeric, errors='coerce')
corr = numeric_view.corr()

# Draw the annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title("Correlation Heatmap for Telco dataset")
plt.show()

#### Pairplot

In [0]:
# Keep the numeric feature columns plus the churn label that the plots below use as hue.
pairplot_columns = [*num_columns, 'Churn Value']
data_ppdf = data_pd[pairplot_columns]

In [0]:
# Convert every column to a numeric dtype; values that cannot be parsed become NaN (errors='coerce').
data_ppdf = data_ppdf.apply(pd.to_numeric, errors='coerce')

In [0]:
# Verify the dtype of each column after the numeric coercion.
display(data_ppdf.dtypes)

In [0]:
# Render the numeric frame in the Databricks table/visualization widget.
display(data_ppdf)

Databricks visualization. Run in Databricks to view.

In [0]:
# Pairwise plots of the columns in data_ppdf, colored by churn value, with KDE curves on the diagonal.
sns.pairplot(data_ppdf, hue='Churn Value', diag_kind='kde')
# y=1.02 places the title just above the top edge of the figure.
plt.suptitle('Pairplot for Telco Dataset', y=1.02)
plt.show()

#### Boxplot

In [0]:
# Compare the distribution of monthly charges between the two churn groups.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=data_ppdf,
    x='Churn Value',
    y='Monthly Charges',
    hue='Churn Value',  # same variable as x, used only to color the boxes
    palette=['skyblue', 'salmon'],
    legend=False,  # the x axis already identifies the groups
    ax=ax,
)
ax.set_title('Monthly Charges by Churn Status')
plt.show()

#### Save to Tables

In [0]:
# Replace spaces in column names with underscores before saving as a table
# (presumably because the table format rejects spaces in column names; confirm).
# Names without spaces are left unchanged by str.replace.
underscored_names = [name.replace(" ", "_") for name in data.columns]
data = data.toDF(*underscored_names)
data

In [0]:
# Persist the renamed DataFrame as the bronze table.
# mode("overwrite") + overwriteSchema keep this cell re-runnable: with the default save mode,
# saveAsTable raises an error when the table already exists (e.g. on Restart & Run All).
# NOTE(review): assumes a Delta table, where an overwrite adds a new version instead of
# erasing history, so version 0 remains the very first write — confirm.
table_name_bronzes = "telco_missing_bronzes"
(
    data.write
    .mode("overwrite")
    .option("overwriteSchema", True)
    .saveAsTable(table_name_bronzes)
)

#### Time-Travel with Delta

In [0]:
# Simulate a mistaken change: drop two columns and overwrite the table with the reduced schema.
to_drop_wrong = ["gender", "Senior_Citizen"]
data_dropped = data.drop(*to_drop_wrong)

# overwriteSchema lets the overwrite replace the table schema as well as its rows.
(
    data_dropped.write
    .mode("overwrite")
    .option("overwriteSchema", True)
    .saveAsTable(table_name_bronzes)
)

#### Reverting Changes by Version

In [0]:
%sql
-- List the table's history; each write to the table shows up as a version.
DESCRIBE HISTORY telco_missing_bronzes

In [0]:
# Print the current schema of the bronze table (after the overwrite that dropped columns).
# Uses `table_name_bronzes`; the previous `Table_name_bronzes` was undefined and raised NameError.
spark.table(table_name_bronzes).printSchema()

In [0]:
%sql
-- Show the columns and data types of the table as it currently stands.
DESCRIBE telco_missing_bronzes

#### Reverting Changes by Version

In [0]:
# Time travel by version: read the table as it was at version 0.
version_zero_reader = spark.read.option("versionAsOf", 0)
telco_df_v0 = version_zero_reader.table(table_name_bronzes)

# Print the schema of that snapshot so it can be compared with the current table schema.
telco_df_v0.printSchema()

#### Reverting Changes by Timestamp

In [0]:
# Extract the timestamp of the first version (can also be set manually).
# The table name comes from `table_name_bronzes` so this stays in sync with the cells that
# write and read the table, instead of repeating the literal name in an f-string
# that had no placeholders.
history_df = spark.sql(f"DESCRIBE HISTORY {table_name_bronzes}")
# Sorting ascending by version puts the earliest version first.
timestamp_v0 = history_df.orderBy("version").first().timestamp
print(timestamp_v0)

In [0]:
# Time travel by timestamp: read the table as of the first version's timestamp and print its schema.
timestamp_reader = spark.read.option("timestampAsOf", timestamp_v0)
telco_df_at_timestamp = timestamp_reader.table(table_name_bronzes)
telco_df_at_timestamp.printSchema()

#### Rewrite back

In [0]:
# Write the DataFrame back to the table, this time without removing any real column.
# NOTE(review): '' is a placeholder name that presumably matches no column, making the
# drop a no-op — confirm on the target runtime.
to_drop = ['']
data_dropped = data.drop(*to_drop)

# Overwrite rows and schema, as in the earlier overwrite of this table.
restore_writer = data_dropped.write.mode("overwrite").option("overwriteSchema", True)
restore_writer.saveAsTable(table_name_bronzes)

In [0]:
%sql
-- Review the table history again after writing the data back.
DESCRIBE HISTORY telco_missing_bronzes