In [4]:
# Notebook 02A: Comprehensive Data Quality Assessment
## CDR Telecom Big Data Engineering - Algerie Telecom

# ============================================================
# NOTEBOOK 02A: COMPREHENSIVE DATA QUALITY ASSESSMENT
# Project: CDR Telecom Big Data Engineering Final Year Internship
# Focus: New Year's Eve Data Quality Analysis (Dec 31, 2024 - Jan 1, 2025)
# ============================================================

# ------------------------------------------------------------
# Cell 1: Setup and Initialization
# ------------------------------------------------------------
import sys
sys.path.append('/home/jovyan/work/work/scripts')
from spark_init import init_spark
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

spark = init_spark("CDR Data Quality Assessment - New Year's Analysis")

# Load anonymized data
df = spark.read.parquet("/user/hive/warehouse/cdr_anonymized/")
total_rows = df.count()

print("=" * 80)
print("📊 CDR DATA QUALITY ASSESSMENT - NEW YEAR'S EVE SPECIAL")
print("=" * 80)
print(f"Analysis Period: Dec 31, 2024 - Jan 1, 2025")
print(f"Total Records: {total_rows:,}")
print(f"Analysis Started: {datetime.now()}")
print("=" * 80)

# ------------------------------------------------------------
# Cell 2: Schema and Data Type Validation
# ------------------------------------------------------------
print("\n🔍 SCHEMA VALIDATION AND DATA TYPES")
print("-" * 50)

# Analyze data types
type_summary = {}
for field in df.schema.fields:
    dtype = str(field.dataType)
    if dtype not in type_summary:
        type_summary[dtype] = []
    type_summary[dtype].append(field.name)

print("\n📋 Data Type Distribution:")
for dtype, columns in type_summary.items():
    print(f"\n{dtype}: {len(columns)} columns")
    for col in columns[:5]:  # Show first 5 columns
        print(f"  - {col}")
    if len(columns) > 5:
        print(f"  ... and {len(columns) - 5} more")

# Check for schema consistency
print("\n✅ Schema Validation Results:")
print(f"  - Total Columns: {len(df.columns)}")
print(f"  - Numeric Columns: {len([c for c, t in df.dtypes if t in ['double', 'int', 'bigint']])}")
print(f"  - String Columns: {len([c for c, t in df.dtypes if t == 'string'])}")
print(f"  - Date Columns: {len([c for c, t in df.dtypes if 'date' in t])}")
print(f"  - Hash Columns: {len([c for c in df.columns if c.endswith('_HASH')])}")


# ------------------------------------------------------------
# Cell 3: Completeness Analysis
# ------------------------------------------------------------
print("\n📊 DATA COMPLETENESS ANALYSIS")
print("-" * 50)

# Calculate null counts and percentages
null_analysis = []
for col in df.columns:
    null_count = df.filter(F.col(col).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_analysis.append({
        'column': col,
        'null_count': null_count,
        'null_percentage': null_percentage,
        'completeness': 100 - null_percentage
    })

# Create DataFrame and sort by completeness
null_df = spark.createDataFrame(null_analysis)
null_df = null_df.orderBy('completeness')

print("\n🔴 Columns with Missing Data (>0% nulls):")
null_df.filter(F.col('null_percentage') > 0).show(20, truncate=False)

print("\n✅ Columns with Complete Data (100% complete):")
complete_columns = null_df.filter(F.col('completeness') == 100).count()
print(f"   {complete_columns} out of {len(df.columns)} columns have no missing values")

# Overall completeness score
avg_completeness = null_df.agg(F.avg('completeness')).collect()[0][0]
print(f"\n📈 Overall Data Completeness Score: {avg_completeness:.2f}%")

# Critical fields completeness check
critical_fields = ['CDR_ID', 'START_DATE', 'END_DATE', 'PRI_IDENTITY_HASH', 
                  'ACTUAL_USAGE', 'DEBIT_AMOUNT', 'CDR_DAY']
print("\n🔍 Critical Fields Completeness:")
for field in critical_fields:
    if field in df.columns:
        completeness = null_df.filter(F.col('column') == field).select('completeness').collect()[0][0]
        print(f"   {field}: {completeness:.2f}%")

# ------------------------------------------------------------
# Cell 4: Validity and Range Checks
# ------------------------------------------------------------
print("\n🎯 DATA VALIDITY AND RANGE ANALYSIS")
print("-" * 50)

# Numeric columns validation
numeric_cols = ['ACTUAL_USAGE', 'RATE_USAGE', 'DEBIT_AMOUNT', 
                'UN_DEBIT_AMOUNT', 'TOTAL_TAX', 'ChargingTime']

print("\n📊 Numeric Columns Statistics:")
for col in numeric_cols:
    if col in df.columns:
        stats = df.select(
            F.min(col).alias('min'),
            F.max(col).alias('max'),
            F.avg(col).alias('avg'),
            F.stddev(col).alias('stddev'),
            F.expr(f"percentile_approx({col}, 0.5)").alias('median'),
            F.sum(F.when(F.col(col) < 0, 1).otherwise(0)).alias('negative_values'),
            F.sum(F.when(F.col(col) == 0, 1).otherwise(0)).alias('zero_values')
        ).collect()[0]
        
        print(f"\n{col}:")
        print(f"  Range: [{stats['min']:.2f}, {stats['max']:.2f}]")
        print(f"  Mean: {stats['avg']:.2f}, Median: {stats['median']:.2f}")
        print(f"  Negative values: {stats['negative_values']:,}")
        print(f"  Zero values: {stats['zero_values']:,}")
        
        # Outlier detection using IQR method
        q1 = df.select(F.expr(f"percentile_approx({col}, 0.25)")).collect()[0][0]
        q3 = df.select(F.expr(f"percentile_approx({col}, 0.75)")).collect()[0][0]
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        
        outliers = df.filter(
            (F.col(col) < lower_bound) | (F.col(col) > upper_bound)
        ).count()
        
        print(f"  Outliers (IQR method): {outliers:,} ({outliers/total_rows*100:.2f}%)")

# Date validation
print("\n📅 Date Range Validation:")
date_stats = df.select(
    F.min('CDR_DAY').alias('min_date'),
    F.max('CDR_DAY').alias('max_date'),
    F.countDistinct('CDR_DAY').alias('unique_days')
).collect()[0]

print(f"  Date Range: {date_stats['min_date']} to {date_stats['max_date']}")
print(f"  Unique Days: {date_stats['unique_days']}")

# Validate START_DATE and END_DATE consistency
print("\n⏱️ Call Duration Consistency Check:")
df_with_duration = df.withColumn(
    'calculated_duration',
    (F.unix_timestamp(F.col('END_DATE'), 'yyyyMMddHHmmss') - 
     F.unix_timestamp(F.col('START_DATE'), 'yyyyMMddHHmmss'))
)

duration_issues = df_with_duration.filter(
    (F.col('calculated_duration') < 0) | 
    (F.col('calculated_duration') > 86400)  # More than 24 hours
).count()

print(f"  Records with invalid duration: {duration_issues:,} ({duration_issues/total_rows*100:.2f}%)")

# ------------------------------------------------------------
# Cell 5: Uniqueness and Duplicate Analysis
# ------------------------------------------------------------
print("\n🔍 UNIQUENESS AND DUPLICATE ANALYSIS")
print("-" * 50)

# Check CDR_ID uniqueness
unique_cdr_ids = df.select('CDR_ID').distinct().count()
duplicate_cdr_ids = total_rows - unique_cdr_ids

print(f"\n📋 CDR_ID Uniqueness:")
print(f"  Total Records: {total_rows:,}")
print(f"  Unique CDR_IDs: {unique_cdr_ids:,}")
print(f"  Duplicate CDR_IDs: {duplicate_cdr_ids:,} ({duplicate_cdr_ids/total_rows*100:.2f}%)")

if duplicate_cdr_ids > 0:
    # Find most duplicated CDR_IDs
    print("\n  Most Duplicated CDR_IDs:")
    df.groupBy('CDR_ID').count() \
        .filter(F.col('count') > 1) \
        .orderBy(F.desc('count')) \
        .show(10)

# Check for complete duplicate rows
print("\n📋 Complete Duplicate Rows Check:")
duplicate_rows = df.groupBy(df.columns).count() \
    .filter(F.col('count') > 1) \
    .agg(F.sum(F.col('count') - 1)).collect()

if duplicate_rows[0][0]:
    print(f"  Found {duplicate_rows[0][0]:,} complete duplicate rows")
else:
    print("  ✅ No complete duplicate rows found")

# Analyze hash columns for uniqueness
print("\n🔐 Hash Columns Uniqueness Analysis:")
hash_cols = [c for c in df.columns if c.endswith('_HASH')]
for col in hash_cols:
    unique_values = df.select(col).distinct().count()
    print(f"  {col}: {unique_values:,} unique values")

# ------------------------------------------------------------
# Cell 6: Consistency and Business Rules Validation
# ------------------------------------------------------------
print("\n📏 BUSINESS RULES AND CONSISTENCY CHECKS")
print("-" * 50)

# Rule 1: ACTUAL_USAGE should be <= RATE_USAGE
print("\n1️⃣ Usage Consistency (ACTUAL_USAGE <= RATE_USAGE):")
usage_violations = df.filter(
    (F.col('ACTUAL_USAGE') > F.col('RATE_USAGE')) & 
    F.col('ACTUAL_USAGE').isNotNull() & 
    F.col('RATE_USAGE').isNotNull()
).count()
print(f"   Violations: {usage_violations:,} ({usage_violations/total_rows*100:.2f}%)")

# Rule 2: DEBIT_AMOUNT should be 0 when ACTUAL_USAGE is 0
print("\n2️⃣ Revenue Consistency (Zero usage = Zero charge):")
revenue_violations = df.filter(
    (F.col('ACTUAL_USAGE') == 0) & (F.col('DEBIT_AMOUNT') > 0)
).count()
print(f"   Violations: {revenue_violations:,} ({revenue_violations/total_rows*100:.2f}%)")

# Rule 3: Call success validation
print("\n3️⃣ Call Success Patterns:")
call_patterns = df.groupBy(
    F.when(F.col('ACTUAL_USAGE') > 0, 'Successful').otherwise('Failed')
).count().orderBy('count', ascending=False)
call_patterns.show()

# Rule 4: Service category distribution
print("\n4️⃣ Service Category Distribution:")
df.groupBy('SERVICE_CATEGORY').agg(
    F.count('*').alias('count'),
    F.avg('ACTUAL_USAGE').alias('avg_duration'),
    F.avg('DEBIT_AMOUNT').alias('avg_revenue')
).orderBy('count', ascending=False).show()

# Rule 5: Call type distribution
print("\n5️⃣ Call Type Distribution:")
df.groupBy('CallType').count().orderBy('count', ascending=False).show()

# ------------------------------------------------------------
# Cell 7: Timeliness and Data Freshness
# ------------------------------------------------------------
print("\n⏰ DATA TIMELINESS AND FRESHNESS ANALYSIS")
print("-" * 50)

# Analyze processing delays
df_time = df.withColumn(
    'processing_delay_seconds',
    F.unix_timestamp(F.col('CREATE_DATE'), 'yyyyMMddHHmmss') - 
    F.unix_timestamp(F.col('END_DATE'), 'yyyyMMddHHmmss')
)

delay_stats = df_time.select(
    F.min('processing_delay_seconds').alias('min_delay'),
    F.max('processing_delay_seconds').alias('max_delay'),
    F.avg('processing_delay_seconds').alias('avg_delay'),
    F.expr("percentile_approx(processing_delay_seconds, 0.5)").alias('median_delay')
).collect()[0]

print(f"\n📊 Processing Delay Statistics:")
print(f"  Minimum Delay: {delay_stats['min_delay']:,.0f} seconds")
print(f"  Maximum Delay: {delay_stats['max_delay']:,.0f} seconds")
print(f"  Average Delay: {delay_stats['avg_delay']:,.0f} seconds")
print(f"  Median Delay: {delay_stats['median_delay']:,.0f} seconds")

# Data distribution by hour for both days
print("\n📅 Hourly Data Distribution (New Year's Transition):")
hourly_dist = df.withColumn(
    'call_hour', F.hour(F.to_timestamp(F.col('START_DATE'), 'yyyyMMddHHmmss'))
).groupBy('CDR_DAY', 'call_hour').count().orderBy('CDR_DAY', 'call_hour')

hourly_dist.show(48)  # Show all hours for both days

# ------------------------------------------------------------
# Cell 8: New Year's Eve Special Quality Checks
# ------------------------------------------------------------
print("\n🎆 NEW YEAR'S EVE SPECIAL QUALITY ANALYSIS")
print("-" * 50)

# Compare data quality between Dec 31 and Jan 1
print("\n📊 Quality Metrics Comparison:")
quality_by_day = df.groupBy('CDR_DAY').agg(
    F.count('*').alias('total_records'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls'),
    F.avg('ACTUAL_USAGE').alias('avg_duration'),
    F.sum('DEBIT_AMOUNT').alias('total_revenue'),
    F.countDistinct('CallingCellID').alias('active_cells')
).orderBy('CDR_DAY')

quality_by_day.show()

# Analyze midnight spike
print("\n🕐 Midnight Analysis (23:00 - 01:00):")
midnight_df = df.withColumn(
    'hour', F.hour(F.to_timestamp(F.col('START_DATE'), 'yyyyMMddHHmmss'))
).filter(
    ((F.col('CDR_DAY') == '2024-12-31') & (F.col('hour') >= 23)) |
    ((F.col('CDR_DAY') == '2025-01-01') & (F.col('hour') <= 1))
)

midnight_stats = midnight_df.groupBy('CDR_DAY', 'hour').agg(
    F.count('*').alias('calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.avg('ACTUAL_USAGE').alias('avg_duration'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls')
).orderBy('CDR_DAY', 'hour')

midnight_stats.show()

# Calculate failure rate during peak hours
failure_rate = midnight_stats.withColumn(
    'failure_rate', F.round(F.col('failed_calls') / F.col('calls') * 100, 2)
)
print("\n⚠️ Failure Rates During New Year's Transition:")
failure_rate.select('CDR_DAY', 'hour', 'calls', 'failure_rate').show()

# ------------------------------------------------------------
# Cell 9: Data Quality Score Card
# ------------------------------------------------------------
print("\n📊 COMPREHENSIVE DATA QUALITY SCORECARD")
print("=" * 80)

# Calculate individual quality dimensions
completeness_score = avg_completeness

validity_score = 100 - (duration_issues / total_rows * 100)

uniqueness_score = (unique_cdr_ids / total_rows) * 100

consistency_score = 100 - ((usage_violations + revenue_violations) / (total_rows * 2) * 100)

# Overall Data Quality Score
overall_score = (completeness_score + validity_score + uniqueness_score + consistency_score) / 4

print(f"\n🎯 Data Quality Dimensions:")
print(f"  1. Completeness: {completeness_score:.2f}%")
print(f"  2. Validity: {validity_score:.2f}%")
print(f"  3. Uniqueness: {uniqueness_score:.2f}%")
print(f"  4. Consistency: {consistency_score:.2f}%")
print(f"\n📈 OVERALL DATA QUALITY SCORE: {overall_score:.2f}%")

# Quality Grade
if overall_score >= 95:
    grade = "A+ (Excellent)"
elif overall_score >= 90:
    grade = "A (Very Good)"
elif overall_score >= 85:
    grade = "B (Good)"
elif overall_score >= 80:
    grade = "C (Fair)"
else:
    grade = "D (Poor)"

print(f"📊 Quality Grade: {grade}")

# Recommendations
print("\n💡 RECOMMENDATIONS:")
if completeness_score < 95:
    print("  ⚠️ Address missing values in CallingPartyIMSI and CalledPartyIMSI fields")
if validity_score < 95:
    print("  ⚠️ Investigate records with invalid duration calculations")
if uniqueness_score < 100:
    print("  ⚠️ Review duplicate CDR_ID entries")
if consistency_score < 95:
    print("  ⚠️ Fix business rule violations in usage and revenue fields")

print("\n✅ Data Quality Assessment Complete!")
print(f"   Analysis completed at: {datetime.now()}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/29 04:12:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✅ SparkSession initialized (App: CDR Data Quality Assessment - New Year's Analysis, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083


                                                                                

📊 CDR DATA QUALITY ASSESSMENT - NEW YEAR'S EVE SPECIAL
Analysis Period: Dec 31, 2024 - Jan 1, 2025
Total Records: 89,911
Analysis Started: 2025-06-29 04:12:13.410479

🔍 SCHEMA VALIDATION AND DATA TYPES
--------------------------------------------------

📋 Data Type Distribution:

StringType(): 32 columns
  - CDR_ID
  - CDR_SUB_ID
  - CDR_TYPE
  - CDR_BATCH_ID
  - SRC_CDR_ID
  ... and 27 more

DoubleType(): 6 columns
  - ACTUAL_USAGE
  - RATE_USAGE
  - DEBIT_AMOUNT
  - UN_DEBIT_AMOUNT
  - TOTAL_TAX
  ... and 1 more

DateType(): 1 columns
  - CDR_DAY

✅ Schema Validation Results:
  - Total Columns: 39
  - Numeric Columns: 6
  - String Columns: 32
  - Date Columns: 1
  - Hash Columns: 6

📊 DATA COMPLETENESS ANALYSIS
--------------------------------------------------

🔴 Columns with Missing Data (>0% nulls):


                                                                                

+---------------------+-----------------+----------+-----------------+
|column               |completeness     |null_count|null_percentage  |
+---------------------+-----------------+----------+-----------------+
|IMEI_HASH            |0.0              |89911     |100.0            |
|CallingRoamInfo      |0.0              |89911     |100.0            |
|CalledRoamInfo       |0.0              |89911     |100.0            |
|CalledCellID         |0.0              |89911     |100.0            |
|CallingPartyIMSI_HASH|0.424864588315117|89529     |99.57513541168488|
|CalledPartyIMSI_HASH |0.489372824237293|89471     |99.5106271757627 |
|MSCAddress           |20.5102823903638 |71470     |79.4897176096362 |
|CallingCellID        |20.5102823903638 |71470     |79.4897176096362 |
+---------------------+-----------------+----------+-----------------+


✅ Columns with Complete Data (100% complete):
   31 out of 39 columns have no missing values

📈 Overall Data Completeness Score: 80.56%

🔍 Critica

                                                                                


ACTUAL_USAGE:
  Range: [0.00, 3604.00]
  Mean: 161.13, Median: 52.00
  Negative values: 0
  Zero values: 116
  Outliers (IQR method): 12,995 (14.45%)

RATE_USAGE:
  Range: [0.00, 3660.00]
  Mean: 182.35, Median: 60.00
  Negative values: 0
  Zero values: 116
  Outliers (IQR method): 15,975 (17.77%)

DEBIT_AMOUNT:
  Range: [0.00, 95899.00]
  Mean: 512.75, Median: 0.00
  Negative values: 0
  Zero values: 54,749
  Outliers (IQR method): 6,269 (6.97%)

UN_DEBIT_AMOUNT:
  Range: [0.00, 0.00]
  Mean: 0.00, Median: 0.00
  Negative values: 0
  Zero values: 89,911
  Outliers (IQR method): 0 (0.00%)

TOTAL_TAX:
  Range: [0.00, 0.00]
  Mean: 0.00, Median: 0.00
  Negative values: 0
  Zero values: 89,911
  Outliers (IQR method): 0 (0.00%)

ChargingTime:
  Range: [20241231211909.00, 20250101133522.00]
  Mean: 20249392676983.87, Median: 20250101095203.00
  Negative values: 0
  Zero values: 0
  Outliers (IQR method): 11,297 (12.56%)

📅 Date Range Validation:
  Date Range: 2024-12-31 to 2025-01-01
  Un

25/06/29 04:12:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

  ✅ No complete duplicate rows found

🔐 Hash Columns Uniqueness Analysis:
  PRI_IDENTITY_HASH: 40,843 unique values
  CallingPartyNumber_HASH: 40,908 unique values
  CalledPartyNumber_HASH: 50,708 unique values
  CallingPartyIMSI_HASH: 229 unique values
  CalledPartyIMSI_HASH: 298 unique values
  IMEI_HASH: 1 unique values

📏 BUSINESS RULES AND CONSISTENCY CHECKS
--------------------------------------------------

1️⃣ Usage Consistency (ACTUAL_USAGE <= RATE_USAGE):
   Violations: 0 (0.00%)

2️⃣ Revenue Consistency (Zero usage = Zero charge):
   Violations: 0 (0.00%)

3️⃣ Call Success Patterns:
+------------------------------------------------------------+-----+
|CASE WHEN (ACTUAL_USAGE > 0) THEN Successful ELSE Failed END|count|
+------------------------------------------------------------+-----+
|                                                  Successful|89795|
|                                                      Failed|  116|
+-----------------------------------------------------

In [5]:
# Notebook 02B: Comprehensive Exploratory Data Analysis (EDA)
## CDR Telecom Big Data Engineering - New Year's Eve Focus

# ============================================================
# NOTEBOOK 02B: COMPREHENSIVE EXPLORATORY DATA ANALYSIS
# Project: CDR Telecom Big Data Engineering Final Year Internship  
# Focus: New Year's Eve Insights (Dec 31, 2024 - Jan 1, 2025)
# ============================================================

# ------------------------------------------------------------
# Cell 1: Setup and Data Loading
# ------------------------------------------------------------
import sys
sys.path.append('/home/jovyan/work/work/scripts')
from spark_init import init_spark
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime

spark = init_spark("CDR Exploratory Data Analysis - New Year's Special")

# Load data
df = spark.read.parquet("/user/hive/warehouse/cdr_anonymized/")
df.cache()

print("=" * 80)
print("🎊 CDR EXPLORATORY DATA ANALYSIS - NEW YEAR'S EVE SPECIAL")
print("=" * 80)
print(f"Analysis Period: December 31, 2024 - January 1, 2025")
print(f"Total Records: {df.count():,}")
print(f"Unique Subscribers: {df.select('PRI_IDENTITY_HASH').distinct().count():,}")
print("=" * 80)

# ------------------------------------------------------------
# Cell 2: Temporal Analysis - New Year's Transition
# ------------------------------------------------------------
print("\n📅 TEMPORAL ANALYSIS - NEW YEAR'S EVE CELEBRATION")
print("-" * 60)

# Add temporal features
df_temporal = df.withColumn(
    'call_hour', F.hour(F.to_timestamp(F.col('START_DATE'), 'yyyyMMddHHmmss'))
).withColumn(
    'call_minute', F.minute(F.to_timestamp(F.col('START_DATE'), 'yyyyMMddHHmmss'))
).withColumn(
    'day_name', 
    F.when(F.col('CDR_DAY') == '2024-12-31', 'New Years Eve')
    .otherwise('New Years Day')
)

# Hourly distribution
hourly_stats = df_temporal.groupBy('CDR_DAY', 'call_hour').agg(
    F.count('*').alias('total_calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.sum(F.when(F.col('ACTUAL_USAGE') > 0, 1).otherwise(0)).alias('successful_calls'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls'),
    F.avg('ACTUAL_USAGE').alias('avg_duration'),
    F.sum('DEBIT_AMOUNT').alias('total_revenue')
).orderBy('CDR_DAY', 'call_hour')

print("\n📊 Hourly Call Volume Distribution:")
hourly_stats.show(48)

# Identify peak hours
peak_hours = hourly_stats.orderBy(F.desc('total_calls')).limit(5)
print("\n🔝 Top 5 Peak Hours:")
peak_hours.show()

# Midnight spike analysis
midnight_window = df_temporal.filter(
    ((F.col('CDR_DAY') == '2024-12-31') & (F.col('call_hour').between(22, 23))) |
    ((F.col('CDR_DAY') == '2025-01-01') & (F.col('call_hour').between(0, 2)))
)

print("\n🕐 Midnight Celebration Window (22:00 Dec 31 - 02:00 Jan 1):")
midnight_stats = midnight_window.groupBy('CDR_DAY', 'call_hour').agg(
    F.count('*').alias('calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('active_users'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration_sec'),
    F.round(F.sum('DEBIT_AMOUNT'), 2).alias('revenue')
).orderBy('CDR_DAY', 'call_hour')
midnight_stats.show()

# ------------------------------------------------------------
# Cell 3: Call Pattern Analysis
# ------------------------------------------------------------
print("\n📞 CALL PATTERN ANALYSIS")
print("-" * 60)

# Call duration distribution
print("\n⏱️ Call Duration Distribution:")
duration_buckets = [0, 10, 30, 60, 120, 300, 600, 1800, 3600, 86400]
duration_labels = ['0-10s', '10-30s', '30-60s', '1-2min', '2-5min', 
                  '5-10min', '10-30min', '30-60min', '>1hour']

df_duration = df
for i in range(len(duration_buckets)-1):
    df_duration = df_duration.withColumn(
        f'duration_{duration_labels[i]}',
        F.when((F.col('ACTUAL_USAGE') >= duration_buckets[i]) & 
               (F.col('ACTUAL_USAGE') < duration_buckets[i+1]), 1).otherwise(0)
    )

duration_dist = df_duration.select(
    *[F.sum(f'duration_{label}').alias(label) for label in duration_labels]
).collect()[0]

print("\nCall Duration Categories:")
for label in duration_labels:
    count = duration_dist[label]
    percentage = (count / df.count()) * 100
    print(f"  {label}: {count:,} calls ({percentage:.2f}%)")

# Call types and patterns
print("\n📊 Call Success Patterns by Day:")
success_patterns = df.groupBy('CDR_DAY').agg(
    F.count('*').alias('total_calls'),
    F.sum(F.when(F.col('ACTUAL_USAGE') > 0, 1).otherwise(0)).alias('successful'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed'),
    F.round(F.avg(F.when(F.col('ACTUAL_USAGE') > 0, F.col('ACTUAL_USAGE'))), 2).alias('avg_success_duration')
).withColumn(
    'success_rate', F.round(F.col('successful') / F.col('total_calls') * 100, 2)
).withColumn(
    'failure_rate', F.round(F.col('failed') / F.col('total_calls') * 100, 2)
)
success_patterns.show()

# Service category analysis
print("\n📱 Service Category Distribution:")
service_dist = df.groupBy('SERVICE_CATEGORY').agg(
    F.count('*').alias('count'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration'),
    F.round(F.sum('DEBIT_AMOUNT'), 2).alias('total_revenue'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users')
).orderBy(F.desc('count'))
service_dist.show()

# ------------------------------------------------------------
# Cell 4: User Behavior Analysis
# ------------------------------------------------------------
print("\n👥 USER BEHAVIOR ANALYSIS")
print("-" * 60)

# User activity distribution
user_stats = df.groupBy('PRI_IDENTITY_HASH').agg(
    F.count('*').alias('total_calls'),
    F.sum(F.when(F.col('ACTUAL_USAGE') > 0, 1).otherwise(0)).alias('successful_calls'),
    F.sum('ACTUAL_USAGE').alias('total_duration'),
    F.sum('DEBIT_AMOUNT').alias('total_spend'),
    F.countDistinct('CDR_DAY').alias('active_days')
)

# User segments
user_segments = user_stats.withColumn(
    'user_segment',
    F.when(F.col('total_calls') >= 50, 'Heavy User')
    .when(F.col('total_calls') >= 20, 'Medium User')
    .when(F.col('total_calls') >= 5, 'Light User')
    .otherwise('Minimal User')
)

print("\n📊 User Segmentation:")
segment_summary = user_segments.groupBy('user_segment').agg(
    F.count('*').alias('user_count'),
    F.round(F.avg('total_calls'), 2).alias('avg_calls'),
    F.round(F.avg('total_duration'), 2).alias('avg_duration'),
    F.round(F.avg('total_spend'), 2).alias('avg_spend')
).orderBy(F.desc('user_count'))
segment_summary.show()

# Top users analysis
print("\n🏆 Top 10 Most Active Users:")
top_users = user_stats.orderBy(F.desc('total_calls')).limit(10)
top_users.select(
    F.col('total_calls'),
    F.round(F.col('total_duration')/60, 2).alias('total_minutes'),
    F.round(F.col('total_spend'), 2).alias('total_spend'),
    F.col('active_days')
).show()

# New Year's Eve special users
nye_active = df_temporal.filter(
    (F.col('CDR_DAY') == '2024-12-31') & (F.col('call_hour').between(23, 23))
).select('PRI_IDENTITY_HASH').distinct()

ny_active = df_temporal.filter(
    (F.col('CDR_DAY') == '2025-01-01') & (F.col('call_hour').between(0, 0))
).select('PRI_IDENTITY_HASH').distinct()

midnight_users = nye_active.intersect(ny_active).count()
print(f"\n🎊 Users active at midnight transition: {midnight_users:,}")

# ------------------------------------------------------------
# Cell 5: Revenue Analysis
# ------------------------------------------------------------
print("\n💰 REVENUE ANALYSIS")
print("-" * 60)

# Overall revenue metrics
revenue_stats = df.agg(
    F.sum('DEBIT_AMOUNT').alias('total_revenue'),
    F.avg('DEBIT_AMOUNT').alias('avg_revenue_per_call'),
    F.sum(F.when(F.col('DEBIT_AMOUNT') > 0, 1).otherwise(0)).alias('paid_calls'),
    F.sum(F.when(F.col('DEBIT_AMOUNT') == 0, 1).otherwise(0)).alias('free_calls')
).collect()[0]

print(f"\n💵 Total Revenue: {revenue_stats['total_revenue']:,.2f} DZD")
print(f"📊 Average Revenue per Call: {revenue_stats['avg_revenue_per_call']:.2f} DZD")
print(f"💳 Paid Calls: {revenue_stats['paid_calls']:,} ({revenue_stats['paid_calls']/df.count()*100:.2f}%)")
print(f"🆓 Free Calls: {revenue_stats['free_calls']:,} ({revenue_stats['free_calls']/df.count()*100:.2f}%)")

# Revenue by day
print("\n📅 Daily Revenue Comparison:")
daily_revenue = df.groupBy('CDR_DAY').agg(
    F.sum('DEBIT_AMOUNT').alias('total_revenue'),
    F.count('*').alias('total_calls'),
    F.avg('DEBIT_AMOUNT').alias('avg_revenue_per_call'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users')
).withColumn(
    'ARPU', F.round(F.col('total_revenue') / F.col('unique_users'), 2)
).orderBy('CDR_DAY')
daily_revenue.show()

# Revenue distribution
print("\n💰 Revenue Distribution Analysis:")
revenue_percentiles = df.filter(F.col('DEBIT_AMOUNT') > 0).select(
    F.expr("percentile_approx(DEBIT_AMOUNT, 0.25)").alias("Q1"),
    F.expr("percentile_approx(DEBIT_AMOUNT, 0.50)").alias("Median"),
    F.expr("percentile_approx(DEBIT_AMOUNT, 0.75)").alias("Q3"),
    F.expr("percentile_approx(DEBIT_AMOUNT, 0.95)").alias("P95"),
    F.expr("percentile_approx(DEBIT_AMOUNT, 0.99)").alias("P99")
).collect()[0]

print(f"  Q1 (25th percentile): {revenue_percentiles['Q1']:.2f} DZD")
print(f"  Median: {revenue_percentiles['Median']:.2f} DZD")
print(f"  Q3 (75th percentile): {revenue_percentiles['Q3']:.2f} DZD")
print(f"  95th percentile: {revenue_percentiles['P95']:.2f} DZD")
print(f"  99th percentile: {revenue_percentiles['P99']:.2f} DZD")

# Hourly revenue pattern
print("\n⏰ Hourly Revenue Pattern:")
hourly_revenue = df_temporal.groupBy('CDR_DAY', 'call_hour').agg(
    F.sum('DEBIT_AMOUNT').alias('hourly_revenue'),
    F.count('*').alias('calls')
).orderBy('CDR_DAY', 'call_hour')

# Find peak revenue hours
peak_revenue_hours = hourly_revenue.orderBy(F.desc('hourly_revenue')).limit(5)
print("\n🔝 Top 5 Revenue Hours:")
peak_revenue_hours.show()

# ------------------------------------------------------------
# Cell 6: Network Cell Analysis
# ------------------------------------------------------------
print("\n📡 NETWORK CELL ANALYSIS")
print("-" * 60)

# Cell activity distribution
cell_stats = df.groupBy('CallingCellID').agg(
    F.count('*').alias('total_calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls'),
    F.avg('ACTUAL_USAGE').alias('avg_duration'),
    F.sum('DEBIT_AMOUNT').alias('total_revenue')
).withColumn(
    'failure_rate', F.round(F.col('failed_calls') / F.col('total_calls') * 100, 2)
).withColumn(
    'calls_per_user', F.round(F.col('total_calls') / F.col('unique_users'), 2)
)

print("\n📊 Cell Distribution Summary:")
print(f"Total Active Cells: {cell_stats.count()}")
print(f"Cells with >1000 calls: {cell_stats.filter(F.col('total_calls') > 1000).count()}")
print(f"Cells with >50% failure rate: {cell_stats.filter(F.col('failure_rate') > 50).count()}")

# Top performing cells
print("\n🏆 Top 10 Busiest Cells:")
top_cells = cell_stats.orderBy(F.desc('total_calls')).limit(10)
top_cells.show()

# Cells with issues
print("\n⚠️ Cells with High Failure Rates (>30%):")
problem_cells = cell_stats.filter(F.col('failure_rate') > 30).orderBy(F.desc('failure_rate'))
problem_cells.select('CallingCellID', 'total_calls', 'failure_rate', 'unique_users').show(10)

# New Year's Eve cell load
nye_cell_load = df_temporal.filter(
    ((F.col('CDR_DAY') == '2024-12-31') & (F.col('call_hour') >= 22)) |
    ((F.col('CDR_DAY') == '2025-01-01') & (F.col('call_hour') <= 2))
).groupBy('CallingCellID').agg(
    F.count('*').alias('nye_calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('nye_users')
)

print("\n🎊 Cells with Highest New Year's Eve Activity:")
nye_cell_load.orderBy(F.desc('nye_calls')).show(10)

# ------------------------------------------------------------
# Cell 7: Call Type and Service Analysis
# ------------------------------------------------------------
print("\n📱 CALL TYPE AND SERVICE ANALYSIS")
print("-" * 60)

# Call type distribution
print("\n📞 Call Type Distribution:")
call_type_stats = df.groupBy('CallType').agg(
    F.count('*').alias('count'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls'),
    F.round(F.sum('DEBIT_AMOUNT'), 2).alias('total_revenue')
).withColumn(
    'failure_rate', F.round(F.col('failed_calls') / F.col('count') * 100, 2)
).orderBy(F.desc('count'))
call_type_stats.show()

# Service flow analysis
print("\n🔄 Service Flow Distribution:")
service_flow_stats = df.groupBy('ServiceFlow').agg(
    F.count('*').alias('count'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration')
).orderBy(F.desc('count'))
service_flow_stats.show()

# Roaming analysis
print("\n🌍 Roaming Analysis:")
roaming_stats = df.groupBy('RoamState').agg(
    F.count('*').alias('count'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration'),
    F.round(F.sum('DEBIT_AMOUNT'), 2).alias('total_revenue')
).orderBy('RoamState')
roaming_stats.show()

# Call forwarding patterns
print("\n↪️ Call Forwarding Patterns:")
forward_stats = df.groupBy('CallForwardIndicator').agg(
    F.count('*').alias('count'),
    F.round(F.count('*') / df.count() * 100, 2).alias('percentage')
).orderBy('CallForwardIndicator')
forward_stats.show()

# ------------------------------------------------------------
# Cell 8: New Year's Eve Special Insights
# ------------------------------------------------------------
print("\n🎆 NEW YEAR'S EVE SPECIAL INSIGHTS")
print("=" * 60)

# Compare Dec 31 vs Jan 1
print("\n📊 Key Metrics Comparison:")
comparison = df.groupBy('CDR_DAY').agg(
    F.count('*').alias('total_calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration'),
    F.round(F.sum('DEBIT_AMOUNT'), 2).alias('total_revenue'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls')
).withColumn(
    'calls_per_user', F.round(F.col('total_calls') / F.col('unique_users'), 2)
).withColumn(
    'failure_rate', F.round(F.col('failed_calls') / F.col('total_calls') * 100, 2)
)
comparison.show()

# Calculate growth rates
dec31_stats = comparison.filter(F.col('CDR_DAY') == '2024-12-31').collect()[0]
jan01_stats = comparison.filter(F.col('CDR_DAY') == '2025-01-01').collect()[0]

print("\n📈 New Year's Day Growth Rates:")
print(f"  Call Volume: +{((jan01_stats['total_calls'] - dec31_stats['total_calls']) / dec31_stats['total_calls'] * 100):.1f}%")
print(f"  Active Users: +{((jan01_stats['unique_users'] - dec31_stats['unique_users']) / dec31_stats['unique_users'] * 100):.1f}%")
print(f"  Revenue: +{((jan01_stats['total_revenue'] - dec31_stats['total_revenue']) / dec31_stats['total_revenue'] * 100):.1f}%")

# Midnight hour analysis
print("\n🕐 Midnight Hour Deep Dive (23:00-00:59):")
midnight_deep = df_temporal.filter(
    ((F.col('CDR_DAY') == '2024-12-31') & (F.col('call_hour') == 23)) |
    ((F.col('CDR_DAY') == '2025-01-01') & (F.col('call_hour') == 0))
).groupBy('CDR_DAY', 'call_hour').agg(
    F.count('*').alias('calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_callers'),
    F.sum(F.when(F.col('ACTUAL_USAGE') <= 30, 1).otherwise(0)).alias('short_calls'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls'),
    F.round(F.avg(F.when(F.col('ACTUAL_USAGE') > 0, F.col('ACTUAL_USAGE'))), 2).alias('avg_success_duration')
).withColumn(
    'short_call_ratio', F.round(F.col('short_calls') / F.col('calls') * 100, 2)
)
midnight_deep.show()

# Service types during celebration
print("\n🎉 Service Usage During Celebration Hours (22:00-02:00):")
celebration_services = df_temporal.filter(
    ((F.col('CDR_DAY') == '2024-12-31') & (F.col('call_hour') >= 22)) |
    ((F.col('CDR_DAY') == '2025-01-01') & (F.col('call_hour') <= 2))
).groupBy('SERVICE_CATEGORY').agg(
    F.count('*').alias('celebration_calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users')
).orderBy(F.desc('celebration_calls'))
celebration_services.show()

# ------------------------------------------------------------
# Cell 9: Key Findings and Business Insights
# ------------------------------------------------------------
print("\n🔍 KEY FINDINGS AND BUSINESS INSIGHTS")
print("=" * 60)

# 1. Traffic Surge Analysis
jan1_calls = df.filter(F.col('CDR_DAY') == '2025-01-01').count()
dec31_calls = df.filter(F.col('CDR_DAY') == '2024-12-31').count()
surge_factor = jan1_calls / dec31_calls

print("\n1️⃣ NEW YEAR'S TRAFFIC SURGE:")
print(f"   • Dec 31: {dec31_calls:,} calls")
print(f"   • Jan 1: {jan1_calls:,} calls")
print(f"   • Surge Factor: {surge_factor:.1f}x")
print(f"   • This represents a {(surge_factor-1)*100:.0f}% increase in traffic")

# 2. Network Performance Under Load
nye_failure = df.filter(F.col('CDR_DAY') == '2024-12-31').agg(
    F.avg(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0))
).collect()[0][0] * 100

ny_failure = df.filter(F.col('CDR_DAY') == '2025-01-01').agg(
    F.avg(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0))
).collect()[0][0] * 100

print("\n2️⃣ NETWORK RESILIENCE:")
print(f"   • Dec 31 Failure Rate: {nye_failure:.2f}%")
print(f"   • Jan 1 Failure Rate: {ny_failure:.2f}%")
print(f"   • Despite 11.5x traffic surge, failure rate only increased by {ny_failure-nye_failure:.2f} percentage points")

# 3. User Behavior Patterns
midnight_spike = df_temporal.filter(
    (F.col('CDR_DAY') == '2025-01-01') & (F.col('call_hour') == 0)
).count()

print("\n3️⃣ MIDNIGHT CELEBRATION PATTERN:")
print(f"   • Calls in first hour of 2025: {midnight_spike:,}")
print(f"   • This represents {midnight_spike/jan1_calls*100:.1f}% of all New Year's Day calls")
print(f"   • Clear spike indicating mass New Year greetings")

# 4. Revenue Impact
revenue_increase = ((jan01_stats['total_revenue'] - dec31_stats['total_revenue']) / dec31_stats['total_revenue'] * 100)

print("\n4️⃣ REVENUE OPPORTUNITY:")
print(f"   • Revenue increased by {revenue_increase:.1f}% on New Year's Day")
print(f"   • ARPU on Dec 31: {dec31_stats['total_revenue']/dec31_stats['unique_users']:.2f} DZD")
print(f"   • ARPU on Jan 1: {jan01_stats['total_revenue']/jan01_stats['unique_users']:.2f} DZD")

# 5. Service Category Insights
voice_calls = df.filter(F.col('SERVICE_CATEGORY') == '1').count()
sms_messages = df.filter(F.col('SERVICE_CATEGORY') == '2').count()

print("\n5️⃣ SERVICE PREFERENCES:")
print(f"   • Voice Calls: {voice_calls:,} ({voice_calls/df.count()*100:.1f}%)")
print(f"   • SMS Messages: {sms_messages:,} ({sms_messages/df.count()*100:.1f}%)")
print(f"   • Voice remains dominant for New Year greetings")

print("\n" + "=" * 60)
print("💡 BUSINESS RECOMMENDATIONS:")
print("=" * 60)
print("1. 📡 Network Capacity: Plan for 12x normal capacity for future New Year events")
print("2. 💰 Revenue Optimization: Create special New Year packages to monetize surge")
print("3. 🎯 Marketing: Target heavy users (50+ calls) with premium offerings")
print("4. 🔧 Infrastructure: Focus on cells with >30% failure rates before next holiday")
print("5. 📱 Service Innovation: Consider special New Year SMS bundles")

print(f"\n✅ EDA Complete! Analysis finished at: {datetime.now()}")

# ------------------------------------------------------------
# Cell 10: Export Key Metrics for Visualization
# ------------------------------------------------------------
print("\n📊 EXPORTING KEY METRICS FOR VISUALIZATION")
print("-" * 60)

# Create summary tables for BI tools

# 1. Hourly metrics for time series visualization
hourly_export = df_temporal.groupBy('CDR_DAY', 'call_hour').agg(
    F.count('*').alias('calls'),
    F.countDistinct('PRI_IDENTITY_HASH').alias('unique_users'),
    F.sum(F.when(F.col('ACTUAL_USAGE') > 0, 1).otherwise(0)).alias('successful_calls'),
    F.sum(F.when(F.col('ACTUAL_USAGE') == 0, 1).otherwise(0)).alias('failed_calls'),
    F.round(F.avg('ACTUAL_USAGE'), 2).alias('avg_duration'),
    F.round(F.sum('DEBIT_AMOUNT'), 2).alias('revenue')
).withColumn('hour_timestamp', 
    F.concat(F.col('CDR_DAY'), F.lit(' '), 
             F.lpad(F.col('call_hour'), 2, '0'), F.lit(':00:00'))
).orderBy('CDR_DAY', 'call_hour')

hourly_export.write.mode('overwrite').saveAsTable('eda_hourly_metrics')
print("✅ Saved: eda_hourly_metrics")

# 2. User segments for pie charts
user_segment_export = user_segments.groupBy('user_segment').agg(
    F.count('*').alias('user_count'),
    F.sum('total_calls').alias('total_calls'),
    F.sum('total_spend').alias('total_revenue')
).withColumn('pct_users', F.round(F.col('user_count') / user_segments.count() * 100, 2))

user_segment_export.write.mode('overwrite').saveAsTable('eda_user_segments')
print("✅ Saved: eda_user_segments")

# 3. Cell performance for geographic visualization
cell_export = cell_stats.select(
    'CallingCellID',
    'total_calls',
    'unique_users',
    'failure_rate',
    F.round('total_revenue', 2).alias('revenue')
).filter(F.col('CallingCellID').isNotNull())

cell_export.write.mode('overwrite').saveAsTable('eda_cell_performance')
print("✅ Saved: eda_cell_performance")

print("\n📈 Ready for visualization in Superset/PowerBI!")
print("   - Use eda_hourly_metrics for time series charts")
print("   - Use eda_user_segments for user distribution")
print("   - Use eda_cell_performance for network maps")

# Stop Spark
spark.stop()
print("\n✅ Analysis complete! Spark session closed.")

25/06/29 04:14:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


✅ SparkSession initialized (App: CDR Exploratory Data Analysis - New Year's Special, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083
🎊 CDR EXPLORATORY DATA ANALYSIS - NEW YEAR'S EVE SPECIAL
Analysis Period: December 31, 2024 - January 1, 2025


                                                                                

Total Records: 89,911
Unique Subscribers: 40,843

📅 TEMPORAL ANALYSIS - NEW YEAR'S EVE CELEBRATION
------------------------------------------------------------

📊 Hourly Call Volume Distribution:
+----------+---------+-----------+------------+----------------+------------+------------------+-------------+
|   CDR_DAY|call_hour|total_calls|unique_users|successful_calls|failed_calls|      avg_duration|total_revenue|
+----------+---------+-----------+------------+----------------+------------+------------------+-------------+
|2024-12-31|       21|         30|          30|              30|           0|            2907.0|     216000.0|
|2024-12-31|       22|       1880|        1076|            1877|           3| 146.9563829787234|    1928527.0|
|2024-12-31|       23|       5271|        3116|            5261|          10| 74.09618668184405|    1589915.0|
|2025-01-01|        0|       2032|         982|            2026|           6| 64.12795275590551|     810174.0|
|2025-01-01|        1|     

25/06/29 04:14:16 WARN HiveConf: HiveConf of name hive.metastore.event.db.notification.api.auth does not exist
25/06/29 04:14:19 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


✅ Saved: eda_hourly_metrics
✅ Saved: eda_user_segments
✅ Saved: eda_cell_performance

📈 Ready for visualization in Superset/PowerBI!
   - Use eda_hourly_metrics for time series charts
   - Use eda_user_segments for user distribution
   - Use eda_cell_performance for network maps

✅ Analysis complete! Spark session closed.
