# Notebook 02: Complete Hive Hourly Table & Feature Engineering
## CDR Telecom - New Year's Eve (Transition from 31 December to 01 January) Analysis


# Focus: Creating hourly aggregations and features for Dec 31 - Jan 1


# Cell 1: Setup and Configuration


In [2]:
import sys
sys.path.append('/home/jovyan/work/work/scripts')
from spark_init import init_spark
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window
from datetime import datetime

spark = init_spark("CDR Hourly Tables & Feature Engineering")

# Configuration
DATABASE_NAME = "algerie_telecom_cdr"
MAIN_TABLE = "cdr_anonymized"
HOURLY_TABLE = "cdr_hourly_aggregated"
HOURLY_FEATURES = "cdr_hourly_features"
MINUTE_LEVEL = "cdr_minute_aggregated"  # For midnight analysis
HOURLY_USER_BEHAVIOR = "cdr_hourly_user_behavior"

spark.sql(f"USE {DATABASE_NAME}")
print(f"✅ Using database: {DATABASE_NAME}")
print(f"📊 Creating advanced hourly analytics tables")


# ------------------------------------------------------------
# Cell 2: Create Comprehensive Hourly Table
# ------------------------------------------------------------
print("\n⏰ CREATING COMPREHENSIVE HOURLY AGGREGATION TABLE")
print("-" * 60)

# Load main data with temporal parsing
df = spark.table(MAIN_TABLE)

# Add detailed temporal features
df_temporal = df.withColumn(
    "call_timestamp", F.to_timestamp(F.col("START_DATE"), "yyyyMMddHHmmss")
).withColumn(
    "call_hour", F.hour("call_timestamp")
).withColumn(
    "call_minute", F.minute("call_timestamp")
).withColumn(
    "call_date", F.to_date("call_timestamp")
).withColumn(
    "hour_key", F.concat_ws("_", F.col("CDR_DAY"), F.lpad(F.col("call_hour"), 2, "0"))
)

# Create comprehensive hourly aggregation
hourly_agg = df_temporal.groupBy("CDR_DAY", "call_hour", "hour_key").agg(
    # Volume metrics
    F.count("*").alias("total_calls"),
    F.countDistinct("PRI_IDENTITY_HASH").alias("unique_users"),
    F.countDistinct("CallingCellID").alias("active_cells"),
    F.countDistinct("SESSION_ID").alias("unique_sessions"),
    
    # Success metrics
    F.sum(F.when(F.col("ACTUAL_USAGE") > 0, 1).otherwise(0)).alias("successful_calls"),
    F.sum(F.when(F.col("ACTUAL_USAGE") == 0, 1).otherwise(0)).alias("failed_calls"),
    
    # Duration metrics (in seconds)
    F.sum("ACTUAL_USAGE").alias("total_duration_seconds"),
    F.avg("ACTUAL_USAGE").alias("avg_duration"),
    F.stddev("ACTUAL_USAGE").alias("stddev_duration"),
    F.min("ACTUAL_USAGE").alias("min_duration"),
    F.max("ACTUAL_USAGE").alias("max_duration"),
    F.expr("percentile_approx(ACTUAL_USAGE, 0.5)").alias("median_duration"),
    F.expr("percentile_approx(ACTUAL_USAGE, 0.95)").alias("p95_duration"),
    
    # Duration categories
    F.sum(F.when(F.col("ACTUAL_USAGE") <= 30, 1).otherwise(0)).alias("short_calls_30s"),
    F.sum(F.when((F.col("ACTUAL_USAGE") > 30) & (F.col("ACTUAL_USAGE") <= 120), 1).otherwise(0)).alias("medium_calls_2min"),
    F.sum(F.when((F.col("ACTUAL_USAGE") > 120) & (F.col("ACTUAL_USAGE") <= 300), 1).otherwise(0)).alias("normal_calls_5min"),
    F.sum(F.when(F.col("ACTUAL_USAGE") > 300, 1).otherwise(0)).alias("long_calls_over5min"),
    
    # Revenue metrics
    F.sum("DEBIT_AMOUNT").alias("total_revenue"),
    F.avg("DEBIT_AMOUNT").alias("avg_revenue_per_call"),
    F.sum(F.when(F.col("DEBIT_AMOUNT") > 0, 1).otherwise(0)).alias("paid_calls"),
    F.sum(F.when(F.col("DEBIT_AMOUNT") == 0, 1).otherwise(0)).alias("free_calls"),
    F.max("DEBIT_AMOUNT").alias("max_single_charge"),
    
    # Service breakdown
    F.sum(F.when(F.col("SERVICE_CATEGORY") == "1", 1).otherwise(0)).alias("voice_calls"),
    F.sum(F.when(F.col("SERVICE_CATEGORY") == "2", 1).otherwise(0)).alias("sms_count"),
    F.sum(F.when(F.col("SERVICE_CATEGORY") == "3", 1).otherwise(0)).alias("data_sessions"),
    
    # Call types
    F.sum(F.when(F.col("CallType") == "0", 1).otherwise(0)).alias("local_calls"),
    F.sum(F.when(F.col("CallType") == "1", 1).otherwise(0)).alias("national_calls"),
    F.sum(F.when(F.col("CallType") == "2", 1).otherwise(0)).alias("international_calls"),
    
    # Network indicators
    F.sum(F.when(F.col("RoamState") == "1", 1).otherwise(0)).alias("roaming_calls"),
    F.sum(F.when(F.col("CallForwardIndicator") == "1", 1).otherwise(0)).alias("forwarded_calls"),
    
    # Time markers
    F.min("call_timestamp").alias("first_call_time"),
    F.max("call_timestamp").alias("last_call_time")
).withColumn(
    "success_rate", F.round(F.col("successful_calls") / F.col("total_calls") * 100, 2)
).withColumn(
    "failure_rate", F.round(F.col("failed_calls") / F.col("total_calls") * 100, 2)
).withColumn(
    "avg_calls_per_user", F.round(F.col("total_calls") / F.col("unique_users"), 2)
).withColumn(
    "hourly_arpu", F.round(F.col("total_revenue") / F.col("unique_users"), 2)
).withColumn(
    "paid_call_ratio", F.round(F.col("paid_calls") / F.col("total_calls") * 100, 2)
).orderBy("CDR_DAY", "call_hour")

# Save the hourly table
hourly_agg.write.mode("overwrite").saveAsTable(HOURLY_TABLE)
print(f"✅ Created hourly aggregation table: {HOURLY_TABLE}")

# Show sample
print("\n📊 Sample Hourly Data:")
hourly_agg.show(5, truncate=False)

# ------------------------------------------------------------
# Cell 3: Create Minute-Level Table for Midnight Analysis
# ------------------------------------------------------------
print("\n🕐 CREATING MINUTE-LEVEL TABLE FOR MIDNIGHT ANALYSIS")
print("-" * 60)

# Focus on celebration window: Dec 31 22:00 - Jan 1 02:00
midnight_window = df_temporal.filter(
    ((F.col("CDR_DAY") == "2024-12-31") & (F.col("call_hour") >= 22)) |
    ((F.col("CDR_DAY") == "2025-01-01") & (F.col("call_hour") <= 2))
).withColumn(
    "minute_key", 
    F.concat_ws("_", 
        F.col("CDR_DAY"), 
        F.lpad(F.col("call_hour"), 2, "0"),
        F.lpad(F.col("call_minute"), 2, "0")
    )
)

# Minute-level aggregation
minute_agg = midnight_window.groupBy(
    "CDR_DAY", "call_hour", "call_minute", "minute_key"
).agg(
    F.count("*").alias("calls_per_minute"),
    F.countDistinct("PRI_IDENTITY_HASH").alias("unique_callers"),
    F.sum(F.when(F.col("ACTUAL_USAGE") > 0, 1).otherwise(0)).alias("successful_calls"),
    F.sum(F.when(F.col("ACTUAL_USAGE") == 0, 1).otherwise(0)).alias("failed_calls"),
    F.avg("ACTUAL_USAGE").alias("avg_duration"),
    F.sum("DEBIT_AMOUNT").alias("minute_revenue")
).withColumn(
    "timestamp", 
    F.to_timestamp(
        F.concat(F.col("CDR_DAY"), F.lit(" "), 
                F.lpad(F.col("call_hour"), 2, "0"), F.lit(":"),
                F.lpad(F.col("call_minute"), 2, "0"), F.lit(":00"))
    )
).orderBy("timestamp")

minute_agg.write.mode("overwrite").saveAsTable(MINUTE_LEVEL)
print(f"✅ Created minute-level table: {MINUTE_LEVEL}")

# Find the exact midnight spike
print("\n🎊 Midnight Spike Analysis:")
midnight_spike = minute_agg.filter(
    (F.col("call_hour") == 0) & (F.col("call_minute") < 5)
).orderBy("call_minute")
midnight_spike.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/29 04:18:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✅ SparkSession initialized (App: CDR Hourly Tables & Feature Engineering, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083


25/06/29 04:18:36 WARN HiveConf: HiveConf of name hive.metastore.event.db.notification.api.auth does not exist


✅ Using database: algerie_telecom_cdr
📊 Creating advanced hourly analytics tables

⏰ CREATING COMPREHENSIVE HOURLY AGGREGATION TABLE
------------------------------------------------------------


25/06/29 04:18:37 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/06/29 04:18:44 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


✅ Created hourly aggregation table: cdr_hourly_aggregated

📊 Sample Hourly Data:


                                                                                

+----------+---------+-------------+-----------+------------+------------+---------------+----------------+------------+----------------------+-----------------+------------------+------------+------------+---------------+------------+---------------+-----------------+-----------------+-------------------+-------------+--------------------+----------+----------+-----------------+-----------+---------+-------------+-----------+--------------+-------------------+-------------+---------------+-------------------+-------------------+------------+------------+------------------+-----------+---------------+
|CDR_DAY   |call_hour|hour_key     |total_calls|unique_users|active_cells|unique_sessions|successful_calls|failed_calls|total_duration_seconds|avg_duration     |stddev_duration   |min_duration|max_duration|median_duration|p95_duration|short_calls_30s|medium_calls_2min|normal_calls_5min|long_calls_over5min|total_revenue|avg_revenue_per_call|paid_calls|free_calls|max_single_charge|voice_cal

AnalysisException: [DATATYPE_MISMATCH.BINARY_OP_DIFF_TYPES] Cannot resolve "(((call_hour >= 0) AND (call_hour <= 5)) OR call_hour)" due to data type mismatch: the left and right operands of the binary operator have incompatible types ("BOOLEAN" and "INT").;
'Project [CDR_DAY#2663, call_hour#2664, hour_key#2665, total_calls#2666L, unique_users#2667L, active_cells#2668L, unique_sessions#2669L, successful_calls#2670L, failed_calls#2671L, total_duration_seconds#2672, avg_duration#2673, stddev_duration#2674, min_duration#2675, max_duration#2676, median_duration#2677, p95_duration#2678, short_calls_30s#2679L, medium_calls_2min#2680L, normal_calls_5min#2681L, long_calls_over5min#2682L, total_revenue#2683, avg_revenue_per_call#2684, paid_calls#2685L, free_calls#2686L, ... 18 more fields]
+- Project [CDR_DAY#2663, call_hour#2664, hour_key#2665, total_calls#2666L, unique_users#2667L, active_cells#2668L, unique_sessions#2669L, successful_calls#2670L, failed_calls#2671L, total_duration_seconds#2672, avg_duration#2673, stddev_duration#2674, min_duration#2675, max_duration#2676, median_duration#2677, p95_duration#2678, short_calls_30s#2679L, medium_calls_2min#2680L, normal_calls_5min#2681L, long_calls_over5min#2682L, total_revenue#2683, avg_revenue_per_call#2684, paid_calls#2685L, free_calls#2686L, ... 17 more fields]
   +- SubqueryAlias spark_catalog.algerie_telecom_cdr.cdr_hourly_aggregated
      +- Relation spark_catalog.algerie_telecom_cdr.cdr_hourly_aggregated[CDR_DAY#2663,call_hour#2664,hour_key#2665,total_calls#2666L,unique_users#2667L,active_cells#2668L,unique_sessions#2669L,successful_calls#2670L,failed_calls#2671L,total_duration_seconds#2672,avg_duration#2673,stddev_duration#2674,min_duration#2675,max_duration#2676,median_duration#2677,p95_duration#2678,short_calls_30s#2679L,medium_calls_2min#2680L,normal_calls_5min#2681L,long_calls_over5min#2682L,total_revenue#2683,avg_revenue_per_call#2684,paid_calls#2685L,free_calls#2686L,... 16 more fields] parquet


### Advanced Feature Engineering

In [3]:

print("\n🔧 ADVANCED FEATURE ENGINEERING FOR 2-DAY ANALYSIS")
print("-" * 60)

# Load hourly data for feature engineering
hourly_df = spark.table(HOURLY_TABLE)

# 1. Time-based features (FIXED: Added proper parentheses)
hourly_features = hourly_df.withColumn(
    "is_peak_hour", 
    F.when((F.col("call_hour").between(9, 11)) | (F.col("call_hour").between(18, 22)), 1).otherwise(0)
).withColumn(
    "is_night_hour",
    F.when((F.col("call_hour").between(0, 5)) | (F.col("call_hour") >= 22), 1).otherwise(0)
).withColumn(
    "is_business_hour",
    F.when(F.col("call_hour").between(8, 17), 1).otherwise(0)
).withColumn(
    "hour_of_week",
    F.when(F.col("CDR_DAY") == "2024-12-31", F.col("call_hour"))
     .otherwise(F.col("call_hour") + 24)  # Continue counting for Jan 1
)

# 2. Relative metrics (compare to daily average)
daily_avg_window = Window.partitionBy("CDR_DAY")

hourly_features = hourly_features.withColumn(
    "daily_avg_calls", F.avg("total_calls").over(daily_avg_window)
).withColumn(
    "daily_avg_revenue", F.avg("total_revenue").over(daily_avg_window)
).withColumn(
    "calls_vs_daily_avg", 
    F.round((F.col("total_calls") - F.col("daily_avg_calls")) / F.col("daily_avg_calls") * 100, 2)
).withColumn(
    "revenue_vs_daily_avg",
    F.round((F.col("total_revenue") - F.col("daily_avg_revenue")) / F.col("daily_avg_revenue") * 100, 2)
)

# 3. Spike detection using z-scores
hourly_features = hourly_features.withColumn(
    "daily_stddev_calls", F.stddev("total_calls").over(daily_avg_window)
).withColumn(
    "call_volume_zscore",
    F.when(F.col("daily_stddev_calls") > 0,
        (F.col("total_calls") - F.col("daily_avg_calls")) / F.col("daily_stddev_calls")
    ).otherwise(0)
).withColumn(
    "is_spike_hour",
    F.when(F.col("call_volume_zscore") > 2, "Major Spike")
     .when(F.col("call_volume_zscore") > 1, "Minor Spike")
     .when(F.col("call_volume_zscore") < -1, "Low Activity")
     .otherwise("Normal")
)

# 4. Network stress indicators
hourly_features = hourly_features.withColumn(
    "network_stress_score",
    (F.col("failure_rate") * 0.4 + 
     (100 - F.col("success_rate")) * 0.3 +
     F.when(F.col("total_calls") > F.col("daily_avg_calls") * 2, 30).otherwise(0))
).withColumn(
    "network_stress_level",
    F.when(F.col("network_stress_score") > 60, "Critical")
     .when(F.col("network_stress_score") > 40, "High")
     .when(F.col("network_stress_score") > 20, "Medium")
     .otherwise("Low")
)

# 5. User behavior features
hourly_features = hourly_features.withColumn(
    "user_concentration",
    F.round(F.col("unique_users") / F.col("total_calls") * 100, 2)
).withColumn(
    "avg_user_activity",
    F.round(F.col("total_calls") / F.col("unique_users"), 2)
).withColumn(
    "revenue_concentration",
    F.when(F.col("paid_calls") > 0,
        F.round(F.col("total_revenue") / F.col("paid_calls"), 2)
    ).otherwise(0)
)

# 6. Service mix features
hourly_features = hourly_features.withColumn(
    "voice_dominance",
    F.when(F.col("total_calls") > 0,
        F.round(F.col("voice_calls") / F.col("total_calls") * 100, 2)
    ).otherwise(0)
).withColumn(
    "sms_ratio",
    F.when(F.col("total_calls") > 0,
        F.round(F.col("sms_count") / F.col("total_calls") * 100, 2)
    ).otherwise(0)
)

# 7. New Year specific features
hourly_features = hourly_features.withColumn(
    "is_celebration_hour",
    F.when(
        ((F.col("CDR_DAY") == "2024-12-31") & (F.col("call_hour") >= 22)) |
        ((F.col("CDR_DAY") == "2025-01-01") & (F.col("call_hour") <= 2)), 1
    ).otherwise(0)
).withColumn(
    "hours_from_midnight",
    F.when(F.col("CDR_DAY") == "2024-12-31", F.col("call_hour") - 24)
     .otherwise(F.col("call_hour"))
)

# Save enriched hourly features
hourly_features.write.mode("overwrite").saveAsTable(HOURLY_FEATURES)
print(f"✅ Created hourly features table: {HOURLY_FEATURES}")

# Show feature summary
print("\n📊 Feature Engineering Summary:")
feature_cols = ["hour_key", "total_calls", "success_rate", "calls_vs_daily_avg", 
                "is_spike_hour", "network_stress_level", "is_celebration_hour"]
hourly_features.select(*feature_cols).filter(
    F.col("is_celebration_hour") == 1
).show()

# Additional validation
print("\n✅ Feature validation:")
print(f"   Total hours processed: {hourly_features.count()}")
print(f"   Celebration hours: {hourly_features.filter(F.col('is_celebration_hour') == 1).count()}")
print(f"   Spike hours detected: {hourly_features.filter(F.col('is_spike_hour') != 'Normal').count()}")

# ------------------------------------------------------------
# Cell 5: User Behavior Patterns by Hour
# ------------------------------------------------------------
print("\n👥 HOURLY USER BEHAVIOR ANALYSIS")
print("-" * 60)

# Create user-hour level aggregations
user_hourly = df_temporal.groupBy("PRI_IDENTITY_HASH", "CDR_DAY", "call_hour").agg(
    F.count("*").alias("user_hourly_calls"),
    F.sum(F.when(F.col("ACTUAL_USAGE") > 0, 1).otherwise(0)).alias("successful_calls"),
    F.sum("ACTUAL_USAGE").alias("total_duration"),
    F.sum("DEBIT_AMOUNT").alias("total_spend"),
    F.countDistinct("CallingCellID").alias("cells_used")
)

# Identify user patterns
user_patterns = user_hourly.groupBy("PRI_IDENTITY_HASH").agg(
    F.count("*").alias("active_hours"),
    F.sum("user_hourly_calls").alias("total_calls"),
    F.max("user_hourly_calls").alias("max_hourly_calls"),
    F.collect_list("call_hour").alias("active_hour_list"),
    F.sum(F.when(F.col("call_hour").between(22, 23), F.col("user_hourly_calls")).otherwise(0)).alias("late_night_calls"),
    F.sum(F.when(F.col("call_hour").between(0, 2), F.col("user_hourly_calls")).otherwise(0)).alias("early_morning_calls")
).withColumn(
    "is_midnight_caller",
    F.when((F.col("late_night_calls") > 0) & (F.col("early_morning_calls") > 0), 1).otherwise(0)
).withColumn(
    "user_type",
    F.when(F.col("active_hours") >= 20, "Always Active")
     .when(F.col("active_hours") >= 10, "Highly Active")
     .when(F.col("active_hours") >= 5, "Moderately Active")
     .otherwise("Low Activity")
)

# Aggregate user patterns by hour
hourly_user_behavior = user_patterns.join(
    user_hourly, on="PRI_IDENTITY_HASH"
).groupBy("CDR_DAY", "call_hour", "user_type").agg(
    F.countDistinct("PRI_IDENTITY_HASH").alias("users_by_type"),
    F.sum("user_hourly_calls").alias("calls_by_type")
).pivot("user_type").sum("users_by_type")

hourly_user_behavior.write.mode("overwrite").saveAsTable(HOURLY_USER_BEHAVIOR)
print(f"✅ Created hourly user behavior table: {HOURLY_USER_BEHAVIOR}")


🔧 ADVANCED FEATURE ENGINEERING FOR 2-DAY ANALYSIS
------------------------------------------------------------
✅ Created hourly features table: cdr_hourly_features

📊 Feature Engineering Summary:
+-------------+-----------+------------+------------------+-------------+--------------------+-------------------+
|     hour_key|total_calls|success_rate|calls_vs_daily_avg|is_spike_hour|network_stress_level|is_celebration_hour|
+-------------+-----------+------------+------------------+-------------+--------------------+-------------------+
|2024-12-31_22|       1880|       99.84|            -21.46|       Normal|                 Low|                  1|
|2024-12-31_23|       5271|       99.81|            120.21|  Minor Spike|              Medium|                  1|
|2025-01-01_00|       2032|        99.7|            -65.61|       Normal|                 Low|                  1|
|2025-01-01_01|        881|       100.0|            -85.09|       Normal|                 Low|                  1

AttributeError: 'DataFrame' object has no attribute 'pivot'

In [4]:
# ------------------------------------------------------------
# Cell 5: User Behavior Patterns by Hour (FIXED)
# ------------------------------------------------------------
print("\n👥 HOURLY USER BEHAVIOR ANALYSIS")
print("-" * 60)

# Create user-hour level aggregations
user_hourly = df_temporal.groupBy("PRI_IDENTITY_HASH", "CDR_DAY", "call_hour").agg(
    F.count("*").alias("user_hourly_calls"),
    F.sum(F.when(F.col("ACTUAL_USAGE") > 0, 1).otherwise(0)).alias("successful_calls"),
    F.sum("ACTUAL_USAGE").alias("total_duration"),
    F.sum("DEBIT_AMOUNT").alias("total_spend"),
    F.countDistinct("CallingCellID").alias("cells_used")
)

# Identify user patterns
user_patterns = user_hourly.groupBy("PRI_IDENTITY_HASH").agg(
    F.count("*").alias("active_hours"),
    F.sum("user_hourly_calls").alias("total_calls"),
    F.max("user_hourly_calls").alias("max_hourly_calls"),
    F.collect_list("call_hour").alias("active_hour_list"),
    F.sum(F.when(F.col("call_hour").between(22, 23), F.col("user_hourly_calls")).otherwise(0)).alias("late_night_calls"),
    F.sum(F.when(F.col("call_hour").between(0, 2), F.col("user_hourly_calls")).otherwise(0)).alias("early_morning_calls")
).withColumn(
    "is_midnight_caller",
    F.when((F.col("late_night_calls") > 0) & (F.col("early_morning_calls") > 0), 1).otherwise(0)
).withColumn(
    "user_type",
    F.when(F.col("active_hours") >= 20, "Always Active")
     .when(F.col("active_hours") >= 10, "Highly Active")
     .when(F.col("active_hours") >= 5, "Moderately Active")
     .otherwise("Low Activity")
)

# Method 1: Simpler approach without pivot
hourly_user_behavior = user_patterns.join(
    user_hourly, on="PRI_IDENTITY_HASH"
).groupBy("CDR_DAY", "call_hour", "user_type").agg(
    F.countDistinct("PRI_IDENTITY_HASH").alias("users_count"),
    F.sum("user_hourly_calls").alias("total_calls")
).orderBy("CDR_DAY", "call_hour", "user_type")

# Save the table
hourly_user_behavior.write.mode("overwrite").saveAsTable(HOURLY_USER_BEHAVIOR)
print(f"✅ Created hourly user behavior table: {HOURLY_USER_BEHAVIOR}")

# Show sample of user behavior patterns
print("\n📊 User Behavior Summary:")
user_type_summary = user_patterns.groupBy("user_type").agg(
    F.count("*").alias("user_count"),
    F.avg("active_hours").alias("avg_active_hours"),
    F.sum("total_calls").alias("total_calls_by_type"),
    F.sum("is_midnight_caller").alias("midnight_callers")
).orderBy("user_count", ascending=False)
user_type_summary.show()

# Alternative: Create a pivot table separately if needed
print("\n📊 Creating User Activity Pivot Table:")
user_pivot = user_hourly.join(
    user_patterns.select("PRI_IDENTITY_HASH", "user_type"), 
    on="PRI_IDENTITY_HASH"
).groupBy("CDR_DAY", "call_hour").pivot("user_type", 
    ["Always Active", "Highly Active", "Moderately Active", "Low Activity"]
).agg(
    F.countDistinct("PRI_IDENTITY_HASH").alias("users")
).fillna(0).orderBy("CDR_DAY", "call_hour")

# Save pivot table separately
user_pivot.write.mode("overwrite").saveAsTable("cdr_user_activity_pivot")
print("✅ Created user activity pivot table: cdr_user_activity_pivot")

# Show pivot sample
print("\nUser Distribution by Hour and Activity Level:")
user_pivot.show(10)


👥 HOURLY USER BEHAVIOR ANALYSIS
------------------------------------------------------------


Py4JJavaError: An error occurred while calling o827.saveAsTable.
: java.lang.NullPointerException
	at org.apache.spark.util.io.ChunkedByteBuffer.$anonfun$getChunks$1(ChunkedByteBuffer.scala:181)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.util.io.ChunkedByteBuffer.getChunks(ChunkedByteBuffer.scala:181)
	at org.apache.spark.util.io.ChunkedByteBufferInputStream.<init>(ChunkedByteBuffer.scala:278)
	at org.apache.spark.util.io.ChunkedByteBuffer.toInputStream(ChunkedByteBuffer.scala:174)
	at org.apache.spark.sql.execution.SparkPlan.decodeUnsafeRows(SparkPlan.scala:409)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollectIterator$2(SparkPlan.scala:457)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at org.apache.spark.sql.execution.joins.HashedRelation$.apply(HashedRelation.scala:152)
	at org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode.transform(HashedRelation.scala:1162)
	at org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode.transform(HashedRelation.scala:1150)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:151)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$2(SQLExecution.scala:224)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:219)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [5]:
# ------------------------------------------------------------
# Cell 5B: Complex User Behavior (Optional - Run if needed)
# ------------------------------------------------------------
print("\n👥 DETAILED USER BEHAVIOR PATTERNS")
print("-" * 60)

# Cache the temporal data first
df_temporal.cache()
print("✅ Cached temporal data")

# Create user profiles with error handling
try:
    # User activity by time period
    user_time_patterns = df_temporal.withColumn(
        "time_period",
        F.when(F.col("call_hour").between(6, 11), "Morning")
         .when(F.col("call_hour").between(12, 17), "Afternoon")
         .when(F.col("call_hour").between(18, 21), "Evening")
         .when(F.col("call_hour").between(22, 23), "Late Night")
         .when(F.col("call_hour").between(0, 5), "Early Morning")
         .otherwise("Night")
    ).groupBy("PRI_IDENTITY_HASH", "time_period").agg(
        F.count("*").alias("calls_in_period")
    )
    
    # Pivot to get calls by time period for each user
    user_time_pivot = user_time_patterns.groupBy("PRI_IDENTITY_HASH").pivot("time_period").sum("calls_in_period").fillna(0)
    
    # Save the result
    user_time_pivot.write.mode("overwrite").saveAsTable("cdr_user_time_patterns")
    print("✅ Created user time patterns table")
    
    # Show sample
    print("\nSample User Time Patterns:")
    user_time_pivot.show(5)
    
except Exception as e:
    print(f"⚠️ Error creating complex user patterns: {str(e)}")
    print("Continuing with simplified analysis...")

# Unpersist to free memory
df_temporal.unpersist()


👥 DETAILED USER BEHAVIOR PATTERNS
------------------------------------------------------------
✅ Cached temporal data


                                                                                

✅ Created user time patterns table

Sample User Time Patterns:
+--------------------+---------+-------------+-------+----------+-------+
|   PRI_IDENTITY_HASH|Afternoon|Early Morning|Evening|Late Night|Morning|
+--------------------+---------+-------------+-------+----------+-------+
|17f61fc2ffe59ef59...|        0|            0|      0|         0|      2|
|8b415d740322d385d...|        0|            1|      0|         0|      4|
|7d7c539fecce2c6dc...|        1|            1|      0|         0|      2|
|dbb0c0db7d216b1ca...|        0|            0|      0|         0|      1|
|cc21d91eec7bb8488...|        0|            0|      0|         0|      1|
+--------------------+---------+-------------+-------+----------+-------+
only showing top 5 rows



DataFrame[CDR_ID: string, CDR_SUB_ID: string, CDR_TYPE: string, CDR_BATCH_ID: string, SRC_CDR_ID: string, START_DATE: string, END_DATE: string, CREATE_DATE: string, CUST_LOCAL_START_DATE: string, CUST_LOCAL_END_DATE: string, OBJ_ID: string, ACTUAL_USAGE: double, RATE_USAGE: double, SERVICE_UNIT_TYPE: string, SERVICE_CATEGORY: string, USAGE_SERVICE_TYPE: string, STD_EVT_TYPE_ID: string, SESSION_ID: string, DEBIT_AMOUNT: double, UN_DEBIT_AMOUNT: double, TOTAL_TAX: double, ServiceFlow: string, CallForwardIndicator: string, ChargingTime: double, CallType: string, RoamState: string, CallingRoamInfo: string, CalledRoamInfo: string, CallingCellID: string, CalledCellID: string, MSCAddress: string, BrandID: string, PRI_IDENTITY_HASH: string, CallingPartyNumber_HASH: string, CalledPartyNumber_HASH: string, CallingPartyIMSI_HASH: string, CalledPartyIMSI_HASH: string, IMEI_HASH: string, CDR_DAY: date, call_timestamp: timestamp, call_hour: int, call_minute: int, call_date: date, hour_key: string]

# 6. Create Views for Analysis and Visualization

In [8]:

print("\n🔍 CREATING ANALYTICAL VIEWS")
print("-" * 60)

# View 1: Hourly Performance Dashboard
spark.sql(f"""
CREATE OR REPLACE VIEW v_hourly_performance AS
SELECT 
    hour_key,
    CDR_DAY,
    call_hour,
    total_calls,
    unique_users,
    success_rate,
    failure_rate,
    ROUND(total_revenue, 2) as revenue,
    ROUND(hourly_arpu, 2) as arpu,
    is_spike_hour,
    network_stress_level,
    CASE 
        WHEN is_celebration_hour = 1 THEN 'Celebration Hour'
        WHEN is_night_hour = 1 THEN 'Night Hour'
        WHEN is_peak_hour = 1 THEN 'Peak Hour'
        WHEN is_business_hour = 1 THEN 'Business Hour'
        ELSE 'Off-Peak'
    END as hour_category
FROM {HOURLY_FEATURES}
ORDER BY CDR_DAY, call_hour
""")
print("✅ Created view: v_hourly_performance")

# View 2: Midnight Transition Analysis
spark.sql(f"""
CREATE OR REPLACE VIEW v_midnight_transition AS
SELECT 
    call_hour,                  
    call_minute,                
    timestamp,
    calls_per_minute,
    unique_callers,
    successful_calls,
    failed_calls,
    ROUND(minute_revenue, 2) as revenue,
    ROUND(failed_calls * 100.0 / calls_per_minute, 2) as failure_rate
FROM {MINUTE_LEVEL}
WHERE (call_hour = 23 AND call_minute >= 50) 
   OR (call_hour = 0 AND call_minute <= 10)
ORDER BY timestamp
""")
print("✅ Created view: v_midnight_transition")

# View 3: Network Stress Hours
spark.sql(f"""
CREATE OR REPLACE VIEW v_network_stress_hours AS
SELECT 
    hour_key,
    CDR_DAY,
    call_hour,
    total_calls,
    failure_rate,
    network_stress_score,
    network_stress_level,
    unique_users,
    active_cells
FROM {HOURLY_FEATURES}
WHERE network_stress_level IN ('High', 'Critical')
ORDER BY network_stress_score DESC
""")
print("✅ Created view: v_network_stress_hours")

# View 4: Service Pattern by Hour
spark.sql(f"""
CREATE OR REPLACE VIEW v_hourly_service_mix AS
SELECT 
    CDR_DAY,
    call_hour,
    voice_calls,
    sms_count,
    data_sessions,
    voice_dominance,
    sms_ratio,
    total_calls
FROM {HOURLY_FEATURES}
ORDER BY CDR_DAY, call_hour
""")
print("✅ Created view: v_hourly_service_mix")



🔍 CREATING ANALYTICAL VIEWS
------------------------------------------------------------
✅ Created view: v_hourly_performance
✅ Created view: v_midnight_transition
✅ Created view: v_network_stress_hours
✅ Created view: v_hourly_service_mix


In [9]:
# ------------------------------------------------------------
# Cell 7: Key Insights from Hourly Analysis
# ------------------------------------------------------------
print("\n📊 KEY INSIGHTS FROM HOURLY ANALYSIS")
print("=" * 60)

# 1. Identify peak hours
peak_hours_df = spark.sql(f"""
SELECT hour_key, total_calls, unique_users, total_revenue, is_spike_hour
FROM {HOURLY_FEATURES}
WHERE is_spike_hour IN ('Major Spike', 'Minor Spike')
ORDER BY total_calls DESC
LIMIT 5
""")
print("\n1️⃣ TOP 5 PEAK HOURS:")
peak_hours_df.show()

# 2. Midnight surge analysis
midnight_surge = spark.sql(f"""
SELECT 
    call_hour,
    SUM(calls_per_minute) as total_calls,
    MAX(calls_per_minute) as max_calls_per_minute,
    AVG(failure_rate) as avg_failure_rate
FROM v_midnight_transition
GROUP BY call_hour
ORDER BY call_hour
""")
print("\n2️⃣ MIDNIGHT SURGE PATTERN:")
midnight_surge.show()

# 3. Network stress periods
stress_periods = spark.sql(f"""
SELECT COUNT(*) as stressed_hours, network_stress_level
FROM v_network_stress_hours
GROUP BY network_stress_level
""")
print("\n3️⃣ NETWORK STRESS SUMMARY:")
stress_periods.show()

# 4. Service evolution through the day
service_evolution = spark.sql(f"""
SELECT 
    CDR_DAY,
    CASE 
        WHEN call_hour BETWEEN 0 AND 5 THEN 'Night'
        WHEN call_hour BETWEEN 6 AND 11 THEN 'Morning'
        WHEN call_hour BETWEEN 12 AND 17 THEN 'Afternoon'
        ELSE 'Evening'
    END as period,
    SUM(voice_calls) as voice,
    SUM(sms_count) as sms,
    AVG(voice_dominance) as avg_voice_dominance
FROM v_hourly_service_mix
GROUP BY CDR_DAY, period
ORDER BY CDR_DAY, period
""")
print("\n4️⃣ SERVICE USAGE BY TIME PERIOD:")
service_evolution.show()

print("\n✅ Hourly analysis complete!")
print("📊 Tables created:")
print(f"   - {HOURLY_TABLE}: Main hourly aggregations")
print(f"   - {HOURLY_FEATURES}: Enriched hourly features")
print(f"   - {MINUTE_LEVEL}: Minute-level for midnight analysis")
print(f"   - {HOURLY_USER_BEHAVIOR}: User patterns by hour")
print("\n🔍 Views available for BI:")
print("   - v_hourly_performance")
print("   - v_midnight_transition")
print("   - v_network_stress_hours")
print("   - v_hourly_service_mix")

# ------------------------------------------------------------
# Cell 8: Export for Trend Analysis
# ------------------------------------------------------------
print("\n📈 PREPARING DATA FOR TREND ANALYSIS")
print("-" * 60)

# Create a consolidated dataset for trend analysis
trend_data = spark.sql(f"""
SELECT 
    h.*,
    CASE 
        WHEN h.CDR_DAY = '2024-12-31' AND h.call_hour >= 18 THEN 'Pre-Celebration'
        WHEN h.CDR_DAY = '2024-12-31' AND h.call_hour >= 22 THEN 'Late NYE'
        WHEN h.CDR_DAY = '2025-01-01' AND h.call_hour <= 2 THEN 'Early NY'
        WHEN h.CDR_DAY = '2025-01-01' AND h.call_hour BETWEEN 3 AND 5 THEN 'Post-Celebration'
        WHEN h.CDR_DAY = '2025-01-01' AND h.call_hour >= 6 THEN 'New Year Day'
        ELSE 'Regular'
    END as celebration_phase,
    LAG(h.total_calls, 1) OVER (ORDER BY h.hour_of_week) as prev_hour_calls,
    LAG(h.total_revenue, 1) OVER (ORDER BY h.hour_of_week) as prev_hour_revenue,
    LEAD(h.total_calls, 1) OVER (ORDER BY h.hour_of_week) as next_hour_calls
FROM {HOURLY_FEATURES} h
""")

# Calculate hour-over-hour growth
trend_analysis = trend_data.withColumn(
    "hour_over_hour_growth",
    F.when(F.col("prev_hour_calls") > 0,
        F.round((F.col("total_calls") - F.col("prev_hour_calls")) / F.col("prev_hour_calls") * 100, 2)
    ).otherwise(None)
).withColumn(
    "revenue_growth",
    F.when(F.col("prev_hour_revenue") > 0,
        F.round((F.col("total_revenue") - F.col("prev_hour_revenue")) / F.col("prev_hour_revenue") * 100, 2)
    ).otherwise(None)
)

trend_analysis.write.mode("overwrite").saveAsTable("cdr_hourly_trends")
print("✅ Created trend analysis table: cdr_hourly_trends")

# Show celebration phase summary
print("\n🎊 CELEBRATION PHASE ANALYSIS:")
celebration_summary = trend_analysis.groupBy("celebration_phase").agg(
    F.count("*").alias("hours"),
    F.sum("total_calls").alias("total_calls"),
    F.avg("success_rate").alias("avg_success_rate"),
    F.sum("total_revenue").alias("total_revenue"),
    F.avg("network_stress_score").alias("avg_stress_score")
).orderBy("celebration_phase")
celebration_summary.show()

print("\n✅ Ready for advanced trend analysis!")
print("💡 Next: Run trend detection algorithms on hourly patterns")


📊 KEY INSIGHTS FROM HOURLY ANALYSIS

1️⃣ TOP 5 PEAK HOURS:
+-------------+-----------+------------+-------------+-------------+
|     hour_key|total_calls|unique_users|total_revenue|is_spike_hour|
+-------------+-----------+------------+-------------+-------------+
|2025-01-01_10|      18125|       11951|    9718766.0|  Minor Spike|
|2025-01-01_09|      16393|       10888|    8206586.0|  Minor Spike|
|2025-01-01_11|      14625|       10071|    6803847.0|  Minor Spike|
|2024-12-31_23|       5271|        3116|    1589915.0|  Minor Spike|
+-------------+-----------+------------+-------------+-------------+


2️⃣ MIDNIGHT SURGE PATTERN:
+---------+-----------+--------------------+----------------+
|call_hour|total_calls|max_calls_per_minute|avg_failure_rate|
+---------+-----------+--------------------+----------------+
|        0|        577|                  60|        0.178182|
|       23|        785|                  93|        0.365000|
+---------+-----------+--------------------+----

25/06/29 04:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/29 04:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/29 04:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/29 04:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/29 04:57:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


✅ Created trend analysis table: cdr_hourly_trends

🎊 CELEBRATION PHASE ANALYSIS:
+-----------------+-----+-----------+-----------------+-------------+-------------------+
|celebration_phase|hours|total_calls| avg_success_rate|total_revenue|   avg_stress_score|
+-----------------+-----+-----------+-----------------+-------------+-------------------+
|         Early NY|    3|       3556|99.89999999999999|    1943679.0|0.06999999999999972|
|     New Year Day|    8|      76931|         99.80125|  3.9189624E7|          11.389125|
| Post-Celebration|    3|       2243|            99.82|    1233960.0|0.12600000000000064|
|  Pre-Celebration|    3|       7181|99.88333333333333|    3734442.0| 10.081666666666665|
+-----------------+-----+-----------+-----------------+-------------+-------------------+


✅ Ready for advanced trend analysis!
💡 Next: Run trend detection algorithms on hourly patterns


In [10]:
# ----------------------------------------------------------------------------------
# 10. Cleanup
# ----------------------------------------------------------------------------------
spark.stop()
print("\n✅ Anonymization pipeline completed successfully!")
print("✅ Spark session closed.")



✅ Anonymization pipeline completed successfully!
✅ Spark session closed.
