## Exploratory Data Analysis (EDA) of the Generated CDR Data

In [1]:
import sys
sys.path.append('/home/jovyan/work/work/scripts')
from pyspark.sql import functions as F, types as T
from pyspark.sql.types import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Initialize Spark
from spark_init import init_spark

# Build the notebook-wide SparkSession through the project's init_spark helper;
# the string passed in is used as the Spark application name.
APP_NAME = "EDA of the generated CDR AT - Complete Pipeline"
spark = init_spark(APP_NAME)
print("✅ SparkSession initialized")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 00:43:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✅ SparkSession initialized (App: EDA of the generated CDR AT - Complete Pipeline, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083
✅ SparkSession initialized


In [None]:
# ---- 2. Load Data ----
# Input locations. The CDR path is a glob, so every file matching cdr_*.parquet
# is read into a single DataFrame.
CUSTOMERS_PATH = "/mnt/generated_at_cdr/customers.parquet"
CDR_PATH = "/mnt/generated_at_cdr/cdr_*.parquet"

# Both reads are lazy: nothing is scanned until an action runs in a later cell.
customers = spark.read.format("parquet").load(CUSTOMERS_PATH)
cdr = spark.read.format("parquet").load(CDR_PATH)


In [None]:
# ---- 3. Quick Schema & Row Counts ----
# Schemas are printed first, then the row counts, for both datasets.
datasets = {"Customers": customers, "CDRs": cdr}

for frame in datasets.values():
    frame.printSchema()

for label, frame in datasets.items():
    # count() is an action and triggers a full scan of the dataset.
    print(f"{label}: {frame.count():,}")


In [None]:
# ---- 4. Data Preview ----
# Show the first rows of each dataset without truncating long cell values.
PREVIEW_ROWS = 30
for frame in (customers, cdr):
    frame.show(PREVIEW_ROWS, truncate=False)


In [None]:
# ---- 5. Anonymization Audit ----
print("Sample anonymized fields:")

# Identifier columns to eyeball in each dataset.
audit_targets = [
    (customers, ["customer_id", "phone_number"]),
    (cdr, ["customer_id", "phone_number", "b_number"]),
]
for frame, id_columns in audit_targets:
    frame.select(*id_columns).show(30, truncate=False)

# Manual review step: inspect the samples above for possible de-anonymization
# leaks -- there should be no integer-only values or other obvious raw formats.


In [None]:
# ---- 6. Data Quality: NULL/Missing/Uniqueness ----
# Import retained so the bare names stay available to any cell that relies on them.
from pyspark.sql.functions import col, count, isnan, countDistinct

# Null count per column (customers), computed in ONE aggregation pass.
# count() only counts non-null inputs and when() without otherwise() yields NULL
# for non-matching rows, so count(when(is_null, 1)) is the number of NULLs.
# (The previous per-column filter().count() loop launched one Spark job per column.)
null_counts = customers.select(
    [F.count(F.when(F.col(colname).isNull(), 1)).alias(colname) for colname in customers.columns]
).first()

for colname, nulls in zip(customers.columns, null_counts):
    print(f"{colname}: {nulls} NULLs")

# Distinct key counts; compare against the customer row count to judge uniqueness.
# distinct().count() is kept (rather than countDistinct) so a NULL key still
# counts as one distinct value, exactly as before.
print("Unique customer_id in customers:", customers.select("customer_id").distinct().count())
print("Unique phone_number in customers:", customers.select("phone_number").distinct().count())


In [None]:
# ---- 7. Customer Distribution by Segments ----
# Row counts per categorical segment.
for segment in ("customer_type", "service_type"):
    customers.groupBy(segment).count().show()

# Ten most populated wilayas.
wilaya_counts = customers.groupBy("wilaya_name").count()
wilaya_counts.orderBy(F.desc("count")).show(10)

# Pie chart of the service-type split; the aggregate is small enough to collect
# into pandas for plotting.
df_cust = customers.groupBy("service_type").count().toPandas()
ax = df_cust.plot.pie(
    y="count",
    labels=df_cust["service_type"],
    legend=False,
    autopct="%.1f%%",
)
ax.set_title("Customer Distribution by Service Type")
ax.set_ylabel("")  # drop the default "count" label pandas puts on the y-axis
plt.show()


In [None]:
# ---- 8. CDR Distribution: Types, Service, Wilaya ----
# Record counts per single dimension.
for dimension in ("cdr_type", "service_type"):
    cdr.groupBy(dimension).count().show()

# Ten wilayas with the most CDRs.
cdr_by_wilaya = cdr.groupBy("wilaya_name").count()
cdr_by_wilaya.orderBy(F.desc("count")).show(10)

# Most frequent (cdr_type, service_type) combinations.
type_service_counts = cdr.groupBy("cdr_type", "service_type").count()
type_service_counts.orderBy(F.desc("count")).show(12)


In [None]:
# ---- 9. Joinability Test & CDR Coverage ----
# Lazy left join kept under its original name for any later cell that uses it.
# Note: both inputs carry columns with the same name (e.g. wilaya_name), so
# un-qualified references to those columns on `joined` are ambiguous.
joined = cdr.join(customers, "customer_id", "left")

# % of CDRs with a matching customer.
# A left-semi join keeps each CDR row at most once when its customer_id exists
# in `customers`. Compared with filtering the left join on "wilaya_name is not null":
#   * it does not reference a column present on both sides (ambiguous reference),
#   * duplicate customer_ids in `customers` cannot inflate the matched count,
#   * CDRs with a NULL customer_id are correctly treated as unmatched.
total_cdrs = cdr.count()
matched_cdrs = cdr.join(customers.select("customer_id"), "customer_id", "left_semi").count()

# Guard the empty-dataset case instead of raising ZeroDivisionError.
matched = matched_cdrs / total_cdrs if total_cdrs else 0.0
print(f"CDRs with matching customer: {matched*100:.2f}%")
# Should be ~100% unless simulating churned/ghost customers.


In [None]:
# ---- 10. Churn/Activation Patterns ----
# Overall active vs. inactive split.
customers.groupBy("is_active").count().show()

# Same split broken down by service type, sorted by the grouping keys.
churn_keys = ["service_type", "is_active"]
customers.groupBy(*churn_keys).count().orderBy(*churn_keys).show()

# Activation date range.
# These two imports repeat names already imported in the first cell.
import pyspark.sql.functions as F
from datetime import datetime

# activation_date is divided by 1000 before from_unixtime (which expects seconds),
# so it is presumably stored as epoch milliseconds -- TODO confirm against the schema.
# from_unixtime returns a formatted string column, not a timestamp.
activation_seconds = (F.col("activation_date") / 1000).cast("long")
customers = customers.withColumn("activation_date_dt", F.from_unixtime(activation_seconds))
customers.agg(F.min("activation_date_dt"), F.max("activation_date_dt")).show()


In [None]:
# ---- 11. Business Logic Checks ----
# Offer / price / bandwidth combinations, most common first, to check alignment.
offer_keys = ["offer_name", "offer_price", "bandwidth_mbps"]
customers.groupBy(*offer_keys).count().orderBy(F.desc("count")).show(12)

# Out-of-range / implausible values: each entry is (dataset, SQL predicate).
implausible_checks = [
    (customers, "offer_price < 100 or offer_price > 10000"),
    (customers, "bandwidth_mbps > 2000"),
    (cdr, "duration_seconds > 4*3600"),  # records longer than 4 hours
]
for frame, predicate in implausible_checks:
    frame.filter(predicate).show()


In [None]:
# ---- 12. Basic Usage Stats ----
# Per-CDR-type volume, mean duration, mean cost and summed cost.
usage_metrics = [
    F.count("*").alias("count"),
    F.mean("duration_seconds").alias("avg_duration_sec"),
    F.mean("cost_da").alias("avg_cost_da"),
    F.sum("cost_da").alias("total_revenue_da"),
]
cdr.groupBy("cdr_type").agg(*usage_metrics).show()


In [None]:
# ---- 13. Outage/Plan Change Events ----
# Outages: most affected (wilaya, outage type) pairs.
outage_events = cdr.where("cdr_type = 'OUTAGE'")
outage_events.groupBy("wilaya_name", "outage_type").count().orderBy(F.desc("count")).show(10)

# Plan changes: counts per (old offer, new offer) transition.
plan_changes = cdr.where("cdr_type = 'PLAN_CHANGE'")
plan_changes.groupBy("old_offer", "new_offer").count().show(10)

# Recharges: counts per payment method.
recharges = cdr.where("cdr_type = 'RECHARGE'")
recharges.groupBy("payment_method").count().show()


In [None]:
# ---- 14. International & Special Event Patterns ----
# Top wilayas for each voice call category of interest.
voice_filters = (
    "cdr_type = 'VOICE' and call_type = 'INTERNATIONAL'",
    "cdr_type = 'VOICE' and call_type = 'FAVORI'",
)
for predicate in voice_filters:
    cdr.filter(predicate).groupBy("wilaya_name").count().orderBy(F.desc("count")).show(10)

# CDR volume per day of the month, across all CDR types.
day_of_month = F.dayofmonth("timestamp").alias("day")
cdr.groupBy(day_of_month).count().orderBy("day").show()


In [None]:
# ---- 15. Data Usage Patterns (by hour, day, wilaya, service) ----
# DATA records only, tagged with the hour of their timestamp.
# cdr_data is reused by the anomaly checks in a later cell.
cdr_data = cdr.filter("cdr_type = 'DATA'").withColumn("hour", F.hour("timestamp"))

# Mean data volume per hour, defined once and used for both the table and the plot.
hourly_avg = (
    cdr_data
    .groupBy("hour")
    .agg(F.mean("data_volume_mb").alias("avg_data_mb"))
    .orderBy("hour")
)
hourly_avg.show(24)

# Plot hourly average
df_hour = hourly_avg.toPandas()
ax = plt.gca()
ax.plot(df_hour["hour"], df_hour["avg_data_mb"])
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Avg Data Usage (MB)")
ax.set_title("Hourly Avg Data Usage")
plt.show()


In [None]:
# ---- 16. Data Quality Red Flags: Anomaly Checks ----
# Rows with a negative duration or a negative cost.
negative_values = cdr.where("duration_seconds < 0 or cost_da < 0")
negative_values.show()

# DATA rows with a negative volume (cdr_data comes from the data-usage cell).
negative_volume = cdr_data.where("data_volume_mb < 0")
negative_volume.show()

# Number of cdr_id values that occur more than once.
cdr_id_counts = cdr.groupBy("cdr_id").count()
dup_cdr = cdr_id_counts.where("count > 1").count()
print(f"Duplicate CDR IDs: {dup_cdr}")


In [None]:
# ---- 17. Save Cleaned Data if Needed ----
# Keep a CDR row only when it has a customer_id and its cost is not negative;
# rows whose cost is NULL are kept.
keep_condition = "customer_id is not null and (cost_da is null or cost_da >= 0)"
clean_cdr = cdr.filter(keep_condition)

# Overwrite mode replaces any existing output at this path.
clean_cdr.write.parquet("/mnt/cleaned/cdr_cleaned.parquet", mode="overwrite")
