In [0]:
# Load the bronze-layer member churn Delta table and render it for inspection
bronze_path = "dbfs:/user/arundhuti/delta/member_churn/bronze"
member_df = spark.read.format("delta").load(bronze_path)
display(member_df)

In [0]:
from pyspark.sql.functions import col, when, count

# Count the null records of each field and show only the fields whose count is > 0.
# One aggregate query produces a single row holding every per-column null count.
null_counts = member_df.select([count(when(col(column).isNull(), column)).alias(column) for column in member_df.columns])

# Bring that single row to the driver exactly once. Calling collect() inside the
# comprehension would re-execute the whole aggregation once per column.
null_count_row = null_counts.collect()[0]
non_zero_null_counts = [column for column in null_counts.columns if null_count_row[column] > 0]
display(null_counts.select(non_zero_null_counts))

Handling Null values and data types

In [0]:
from pyspark.sql.functions import col, when

# Replacing string "null" with actual NULL values.
# Only string-typed columns can hold the literal text "null"; comparing other column
# types against a string forces an implicit cast that never matches (and may raise
# under ANSI SQL mode or for complex types), so those columns are passed through as-is.
string_columns = {name for name, dtype in member_df.dtypes if dtype == "string"}

# Build every output column up front and apply them in ONE select: calling
# withColumn in a loop adds a projection per column and can produce very large plans.
member_df = member_df.select([
    when(col(column) == "null", None).otherwise(col(column)).alias(column)
    if column in string_columns
    else col(column)
    for column in member_df.columns
])

member_df.printSchema()

Find Fields with Missing Values

In [0]:
# StringType is used below but no visible earlier cell imports it; import it here
# so the cell does not raise NameError on a fresh kernel.
from pyspark.sql.types import StringType

# Identify string columns
string_cols = [c.name for c in member_df.schema.fields if c.dataType == StringType()]
print(f"String columns: {string_cols}")

# Count missing (null) values in string columns with a single aggregate query
string_missing_values_logic = [count(when(col(column).isNull(), column)).alias(column) for column in string_cols]
row_dict_string = member_df.select(string_missing_values_logic).first().asDict()
# Keep only the columns that have at least one null
string_missing_cols = [column for column in row_dict_string if row_dict_string[column] > 0]

print(f"String columns with missing values: {string_missing_cols}")



In [0]:
from pyspark.sql.functions import count, when, col
from pyspark.sql.types import BooleanType

# Collect the names of all BooleanType columns in the current schema
bool_cols = [field.name for field in member_df.schema.fields if field.dataType == BooleanType()]
print(f"Boolean columns: {bool_cols}")

# One aggregate expression per boolean column: the number of rows where it is null
bool_missing_values_logic = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in bool_cols
]
# The aggregation yields a single row; turn it into {column name: null count}
row_dict_bool = member_df.select(bool_missing_values_logic).first().asDict()
bool_missing_cols = [name for name, n_missing in row_dict_bool.items() if n_missing > 0]

print(f"Boolean columns with missing values: {bool_missing_cols}")

In [0]:
%python
from pyspark.sql.functions import mean, col, when, lit
from pyspark.sql.types import IntegerType

# Identify numeric columns for IntegerType
int_cols = [c.name for c in member_df.schema.fields if c.dataType == IntegerType()]

# Count missing (null) values in the integer columns with a single aggregate query
int_missing_values_logic = [count(when(col(column).isNull(), column)).alias(column) for column in int_cols]
row_dict_int = member_df.select(int_missing_values_logic).first().asDict() if int_cols else {}
int_missing_cols = [column for column in row_dict_int if row_dict_int[column] > 0]

print(f"Integer columns with missing values: {int_missing_cols}")

# Compute the mean of every affected column in one pass instead of one Spark job per column
if int_missing_cols:
    mean_row = member_df.select([mean(col(column)).alias(column) for column in int_missing_cols]).first().asDict()
else:
    mean_row = {}

# The mean is None when a column contains only nulls; such a column cannot be
# mean-filled, so it is left untouched and reported instead of crashing fillna.
# int() truncates toward zero, matching the previous double -> integer cast.
int_fill_values = {column: int(mean_value) for column, mean_value in mean_row.items() if mean_value is not None}
int_unfillable_cols = [column for column in int_missing_cols if column not in int_fill_values]
if int_unfillable_cols:
    print(f"Integer columns with no non-null values (left unfilled): {int_unfillable_cols}")

# Fill the nulls in place with a single fillna call. The columns keep their name,
# integer type and position, so no temporary "_filled" columns, drop or rename is needed.
if int_fill_values:
    member_df = member_df.fillna(int_fill_values)

display(member_df)

Adjusting Data type

In [0]:
# Persist the cleaned member_df as the silver Delta table, overwriting what is stored at this path
silver_path = "dbfs:/user/arundhuti/delta/member_churn/silver/"
silver_writer = member_df.write.format("delta").mode("overwrite").option("mergeSchema", "true")
silver_writer.save(silver_path)

In [0]:
# splitting the data into train and test
# train_df,test_df = member_df.randomSplit([0.8,0.2],seed=42)

In [0]:
# %python
# # write the trained and test data into the location '/dbfs/user/arundhuti/delta/customer_churn/silver/train_data/' from the train_df DataFrame
# train_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save("dbfs:/user/arundhuti/delta/member_churn/silver/train_data")
# test_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save("dbfs:/user/arundhuti/delta/member_churn/silver/test_data")