In [0]:
# Load the bronze-layer customer churn data (stored as a Delta table)
bronze_path = "dbfs:/user/arundhuti/delta/customer_churn/bronze"
bronze_reader = spark.read.format("delta")
telco_df = bronze_reader.load(bronze_path)

Handling Null values

In [0]:
from pyspark.sql.functions import col, when

# Replace the literal string "null" with real NULL values.
#
# Only string-typed columns can contain the text "null", so every other column
# is passed through unchanged; this also avoids comparing numeric/boolean
# columns against a string literal.
#
# A single select() is used instead of one withColumn() per column: each
# withColumn() call adds another projection to the query plan, which the
# PySpark documentation warns against doing in a loop.
string_typed_cols = {name for name, dtype in telco_df.dtypes if dtype == "string"}

telco_df = telco_df.select([
    when(col(name) == "null", None).otherwise(col(name)).alias(name)
    if name in string_typed_cols
    else col(name)
    for name in telco_df.columns
])

In [0]:
# Convert column data types
# TotalCharges is cast to double.
total_charges_as_double = col("TotalCharges").cast("double")

# SeniorCitizen becomes a boolean flag: True where the value equals 1, and
# False for everything else (rows where the comparison is NULL also fall
# through to the otherwise() branch).
senior_citizen_flag = when(col("SeniorCitizen") == 1, True).otherwise(False)

telco_df = telco_df.withColumn("TotalCharges", total_charges_as_double)
telco_df = telco_df.withColumn("SeniorCitizen", senior_citizen_flag)


In [0]:
# Split the data into train (80%) and test (20%) sets; the fixed seed keeps
# the random assignment repeatable.
split_weights = [0.8, 0.2]
train_df, test_df = telco_df.randomSplit(split_weights, seed=42)

Transforming the data

In [0]:
from pyspark.sql.types import IntegerType, BooleanType, StringType, DoubleType
from pyspark.sql.functions import col, when, count
# Get a list of integer & boolean columns (schema taken from the training split)
int_cols = [
    field.name
    for field in train_df.schema.fields
    if field.dataType in (IntegerType(), BooleanType())
]

# Cast those columns to double in both splits.
# The projection is built once and applied with a single select() per
# DataFrame, instead of calling withColumn() once per column in a loop
# (each withColumn() call adds another projection to the query plan).
# Column order and names are preserved.
cols_to_cast = set(int_cols)
double_cast_projection = [
    col(name).cast("double").alias(name) if name in cols_to_cast else col(name)
    for name in train_df.columns
]

train_df = train_df.select(double_cast_projection)
test_df = test_df.select(double_cast_projection)


Find Numeric columns with Missing Values

In [0]:
from pyspark.sql.functions import count,when
# Numeric columns: every DoubleType field of the training split
num_cols = [field.name for field in train_df.schema.fields if field.dataType == DoubleType()]

# Build one aggregate per numeric column. when() yields a non-null value only
# for rows where the column is NULL, and count() counts only non-null values,
# so each aggregate is the number of missing entries in that column.
num_missing_values_logic = []
for name in num_cols:
    null_marker = when(col(name).isNull(), name)
    num_missing_values_logic.append(count(null_marker).alias(name))

# All counts are computed in a single pass and come back as one row
row_dict_num = train_df.select(num_missing_values_logic).first().asDict()
num_missing_cols = [name for name, n_missing in row_dict_num.items() if n_missing > 0]

print(f"Numeric columns with missing values: {num_missing_cols}")

In [0]:
# Identify IntegerType columns.
# NOTE: an earlier cell casts every IntegerType/BooleanType column of train_df
# to double, so this list is expected to be empty when the notebook is run
# top to bottom.
int_cols = [c.name for c in train_df.schema.fields if c.dataType == IntegerType()]

# Count missing values in integer columns.
# When there are no integer columns the Spark job is skipped entirely: this
# avoids aggregating over an empty projection and avoids calling .asDict() on
# the None that first() returns when no row comes back.
if int_cols:
    int_missing_values_logic = [
        count(when(col(column).isNull(), column)).alias(column) for column in int_cols
    ]
    row_dict_int = train_df.select(int_missing_values_logic).first().asDict()
else:
    int_missing_values_logic = []
    row_dict_int = {}

int_missing_cols = [column for column in row_dict_int if row_dict_int[column] > 0]

print(f"Integer columns with missing values: {int_missing_cols}")

In [0]:
# String columns: every StringType field of the training split
string_cols = [field.name for field in train_df.schema.fields if field.dataType == StringType()]

# Count missing values in string columns: one count() aggregate per column,
# each counting only the rows where that column is NULL.
string_missing_values_logic = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in string_cols
]

# Evaluate all aggregates at once and turn the single result row into a dict
string_null_counts_row = train_df.select(string_missing_values_logic).first()
row_dict_string = string_null_counts_row.asDict()

# Keep only the columns that have at least one missing value
string_missing_cols = list(filter(lambda name: row_dict_string[name] > 0, row_dict_string))

print(f"String columns with missing values: {string_missing_cols}")

In [0]:
%python
# Persist the train and test splits as Delta tables in the silver layer.
# Each write overwrites the existing data at its target path, with the
# "mergeSchema" option enabled.
silver_targets = [
    (train_df, "dbfs:/user/arundhuti/delta/customer_churn/silver/train_data"),
    (test_df, "dbfs:/user/arundhuti/delta/customer_churn/silver/test_data"),
]

for split_df, target_path in silver_targets:
    delta_writer = split_df.write.format("delta").mode("overwrite").option("mergeSchema", "true")
    delta_writer.save(target_path)