# CDR Telecom - Creating Compelling New Year's Eve Dashboards

# ============================================================
# NOTEBOOK 04: VISUALIZATION & DASHBOARD SETUP
# Project: CDR Telecom Big Data Engineering Final Year Internship
# Focus: Creating visualizations for Superset & PowerBI
# ============================================================


In [1]:

# ------------------------------------------------------------
# Cell 1: Setup and Prepare Visualization Datasets
# ------------------------------------------------------------
import sys
sys.path.append('/home/jovyan/work/work/scripts')
from spark_init import init_spark
from pyspark.sql import functions as F
import json
from datetime import datetime

spark = init_spark("CDR Visualization Setup")

DATABASE_NAME = "algerie_telecom_cdr"
spark.sql(f"USE {DATABASE_NAME}")

print("=" * 80)
print("📊 CDR VISUALIZATION & DASHBOARD SETUP")
print("=" * 80)
print("Creating optimized datasets for Superset and PowerBI")
print("=" * 80)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/29 05:27:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✅ SparkSession initialized (App: CDR Visualization Setup, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083
📊 CDR VISUALIZATION & DASHBOARD SETUP
Creating optimized datasets for Superset and PowerBI


25/06/29 05:27:36 WARN HiveConf: HiveConf of name hive.metastore.event.db.notification.api.auth does not exist


In [5]:
# ------------------------------------------------------------
# Cell 2: Create Master Time Series Dataset (FIXED)
# ------------------------------------------------------------
print("\n📈 CREATING MASTER TIME SERIES DATASET")
print("-" * 60)

master_timeseries = spark.sql("""
SELECT 
    -- Time dimensions
    h.hour_key,
    h.CDR_DAY,
    h.call_hour,
    CONCAT(h.CDR_DAY, ' ', LPAD(h.call_hour, 2, '0'), ':00:00') AS timestamp,
    h.hour_of_week,

    -- On compare is_celebration_hour à 1
    CASE WHEN h.is_celebration_hour = 1 THEN 'Celebration' ELSE 'Normal' END AS celebration_phase,

    -- Volume metrics
    h.total_calls,
    h.unique_users,
    h.successful_calls,
    h.failed_calls,
    h.active_cells,

    -- Performance metrics
    h.success_rate,
    h.failure_rate,
    h.avg_duration,
    h.median_duration,

    -- Revenue metrics
    h.total_revenue,
    h.hourly_arpu,
    h.paid_calls,
    h.free_calls,
    h.paid_call_ratio,

    -- Service breakdown
    h.voice_calls,
    h.sms_count,
    h.data_sessions,

    -- Trend indicator
    t.hour_over_hour_growth,

    -- Daily-normal comparisons
    h.calls_vs_daily_avg,
    h.is_spike_hour,

    -- Network health
    h.network_stress_score,
    h.network_stress_level,

    -- Anomaly indicators
    a.anomaly_type,
    a.total_anomaly_score,
    CASE 
        WHEN a.anomaly_type = 'Critical Anomaly' THEN 3
        WHEN a.anomaly_type = 'Major Anomaly'    THEN 2
        WHEN a.anomaly_type = 'Minor Anomaly'    THEN 1
        ELSE 0
    END AS anomaly_severity

FROM cdr_hourly_features h
LEFT JOIN cdr_hourly_trends   t ON h.hour_key = t.hour_key
LEFT JOIN cdr_hourly_anomalies a ON h.hour_key = a.hour_key

ORDER BY h.hour_of_week
""")

master_timeseries.write.mode("overwrite").saveAsTable("viz_master_timeseries")
print("✅ Created master time series dataset: viz_master_timeseries")

# Show summary
print("\n📊 Dataset Summary:")
print(f"Total Hours: {master_timeseries.count()}")
print(f"Columns: {len(master_timeseries.columns)}")
master_timeseries.show(5, truncate=False)



📈 CREATING MASTER TIME SERIES DATASET
------------------------------------------------------------


25/06/29 05:32:09 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


✅ Created master time series dataset: viz_master_timeseries

📊 Dataset Summary:
Total Hours: 17
Columns: 31
+-------------+----------+---------+-------------------+------------+-----------------+-----------+------------+----------------+------------+------------+------------+------------+-----------------+---------------+-------------+-----------+----------+----------+---------------+-----------+---------+-------------+---------------------+------------------+-------------+--------------------+--------------------+-------------+-------------------+----------------+
|hour_key     |CDR_DAY   |call_hour|timestamp          |hour_of_week|celebration_phase|total_calls|unique_users|successful_calls|failed_calls|active_cells|success_rate|failure_rate|avg_duration     |median_duration|total_revenue|hourly_arpu|paid_calls|free_calls|paid_call_ratio|voice_calls|sms_count|data_sessions|hour_over_hour_growth|calls_vs_daily_avg|is_spike_hour|network_stress_score|network_stress_level|anomaly_type |to

In [6]:
# ------------------------------------------------------------
# Cell 3: Create KPI Summary Tables (UPDATED)
# ------------------------------------------------------------
print("\n📊 CREATING KPI SUMMARY TABLES")
print("-" * 60)

# Overall KPIs
overall_kpis = spark.sql("""
SELECT 
    COUNT(DISTINCT CDR_DAY)                AS days_analyzed,
    SUM(total_calls)                       AS total_calls,
    COUNT(DISTINCT unique_users)           AS total_unique_users,
    ROUND(AVG(success_rate), 2)            AS avg_success_rate,
    ROUND(SUM(total_revenue), 2)           AS total_revenue,
    MAX(total_calls)                       AS peak_hourly_calls,
    ROUND(AVG(network_stress_score), 2)    AS avg_network_stress
FROM viz_master_timeseries
""")
overall_kpis.write.mode("overwrite").saveAsTable("viz_overall_kpis")
print("✅ Created overall KPIs table: viz_overall_kpis")

# Daily comparison KPIs
daily_kpis = spark.sql("""
SELECT 
    CDR_DAY,
    SUM(total_calls)                       AS daily_calls,
    MAX(unique_users)                      AS daily_unique_users,
    ROUND(AVG(success_rate), 2)            AS daily_success_rate,
    ROUND(SUM(total_revenue), 2)           AS daily_revenue,
    MAX(total_calls)                       AS peak_hour_calls,
    SUM(CASE WHEN network_stress_level IN ('High','Critical') THEN 1 ELSE 0 END) AS stressed_hours
FROM viz_master_timeseries
GROUP BY CDR_DAY
ORDER BY CDR_DAY
""")
daily_kpis.write.mode("overwrite").saveAsTable("viz_daily_kpis")
print("✅ Created daily KPIs table: viz_daily_kpis")

# Celebration phase KPIs
phase_kpis = spark.sql("""
SELECT 
    celebration_phase,
    COUNT(*)                              AS phase_hours,
    SUM(total_calls)                      AS phase_calls,
    ROUND(AVG(success_rate), 2)           AS phase_success_rate,
    ROUND(SUM(total_revenue), 2)          AS phase_revenue,
    MAX(hour_over_hour_growth)            AS max_growth_rate,
    ROUND(AVG(network_stress_score), 2)   AS avg_stress
FROM viz_master_timeseries
GROUP BY celebration_phase
ORDER BY
    CASE celebration_phase
      WHEN 'Celebration' THEN 1
      ELSE 2
    END
""")
phase_kpis.write.mode("overwrite").saveAsTable("viz_phase_kpis")
print("✅ Created celebration phase KPIs table: viz_phase_kpis")



📊 CREATING KPI SUMMARY TABLES
------------------------------------------------------------
✅ Created overall KPIs table: viz_overall_kpis
✅ Created daily KPIs table: viz_daily_kpis
✅ Created celebration phase KPIs table: viz_phase_kpis


In [9]:
spark.sql("DESCRIBE cdr_hourly_aggregated").show(40, truncate=False)


+----------------------+---------+-------+
|col_name              |data_type|comment|
+----------------------+---------+-------+
|CDR_DAY               |date     |NULL   |
|call_hour             |int      |NULL   |
|hour_key              |string   |NULL   |
|total_calls           |bigint   |NULL   |
|unique_users          |bigint   |NULL   |
|active_cells          |bigint   |NULL   |
|unique_sessions       |bigint   |NULL   |
|successful_calls      |bigint   |NULL   |
|failed_calls          |bigint   |NULL   |
|total_duration_seconds|double   |NULL   |
|avg_duration          |double   |NULL   |
|stddev_duration       |double   |NULL   |
|min_duration          |double   |NULL   |
|max_duration          |double   |NULL   |
|median_duration       |double   |NULL   |
|p95_duration          |double   |NULL   |
|short_calls_30s       |bigint   |NULL   |
|medium_calls_2min     |bigint   |NULL   |
|normal_calls_5min     |bigint   |NULL   |
|long_calls_over5min   |bigint   |NULL   |
|total_reve

In [10]:
# 1) on agrège d’abord par cellule + heure
cell_hourly = (
  spark.table("cdr_raw")
    .groupBy("CallingCellID","hour_key")
    .agg(
      F.count("*").alias("total_calls"),
      F.countDistinct("UserID").alias("unique_users"),
      F.avg("failure_flag").alias("failure_rate"),
      F.sum("revenue").alias("total_revenue")
    )
)
cell_hourly.createOrReplaceTempView("cdr_hourly_by_cell")

# 2) ensuite on rejoint votre master_timeseries pour récupérer celebration_phase, etc.
cell_performance = spark.sql("""
SELECT
  c.CallingCellID,
  COUNT(*) AS total_hours,
  SUM(c.total_calls) AS cell_total_calls,
  ROUND(AVG(c.unique_users),2) AS avg_users_per_hour,
  ROUND(AVG(c.failure_rate),2) AS avg_failure_rate,
  ROUND(SUM(c.total_revenue),2) AS cell_revenue,
  MAX(c.total_calls) AS peak_hour_calls,
  SUM(CASE WHEN v.celebration_phase='Celebration' THEN c.total_calls ELSE 0 END) AS nye_calls,
  MAX(CASE WHEN v.celebration_phase='Celebration' THEN c.total_calls ELSE 0 END) AS nye_peak
FROM cdr_hourly_by_cell c
JOIN viz_master_timeseries v
  ON c.hour_key = v.hour_key
GROUP BY c.CallingCellID
""")


AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `cdr_raw` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.;
'UnresolvedRelation [cdr_raw], [], false


In [13]:
spark.sql("SHOW TABLES").show(50, truncate=False)


+-------------------+----------------------+-----------+
|namespace          |tableName             |isTemporary|
+-------------------+----------------------+-----------+
|algerie_telecom_cdr|cdr_anonymized        |false      |
|algerie_telecom_cdr|cdr_daily_summary     |false      |
|algerie_telecom_cdr|cdr_network_metrics   |false      |
|algerie_telecom_cdr|v_daily_trends        |false      |
|algerie_telecom_cdr|v_network_performance |false      |
|algerie_telecom_cdr|data_quality_checks   |false      |
|algerie_telecom_cdr|cdr_hourly_aggregated |false      |
|algerie_telecom_cdr|cdr_minute_aggregated |false      |
|algerie_telecom_cdr|cdr_hourly_features   |false      |
|algerie_telecom_cdr|cdr_user_time_patterns|false      |
|algerie_telecom_cdr|v_hourly_performance  |false      |
|algerie_telecom_cdr|v_midnight_transition |false      |
|algerie_telecom_cdr|v_network_stress_hours|false      |
|algerie_telecom_cdr|v_hourly_service_mix  |false      |
|algerie_telecom_cdr|cdr_hourly

In [12]:
spark.sql("SHOW TABLES IN algerie_telecom_cdr").show(40, truncate=False)


+-------------------+----------------------+-----------+
|namespace          |tableName             |isTemporary|
+-------------------+----------------------+-----------+
|algerie_telecom_cdr|cdr_anonymized        |false      |
|algerie_telecom_cdr|cdr_daily_summary     |false      |
|algerie_telecom_cdr|cdr_network_metrics   |false      |
|algerie_telecom_cdr|v_daily_trends        |false      |
|algerie_telecom_cdr|v_network_performance |false      |
|algerie_telecom_cdr|data_quality_checks   |false      |
|algerie_telecom_cdr|cdr_hourly_aggregated |false      |
|algerie_telecom_cdr|cdr_minute_aggregated |false      |
|algerie_telecom_cdr|cdr_hourly_features   |false      |
|algerie_telecom_cdr|cdr_user_time_patterns|false      |
|algerie_telecom_cdr|v_hourly_performance  |false      |
|algerie_telecom_cdr|v_midnight_transition |false      |
|algerie_telecom_cdr|v_network_stress_hours|false      |
|algerie_telecom_cdr|v_hourly_service_mix  |false      |
|algerie_telecom_cdr|cdr_hourly

In [17]:
# ------------------------------------------------------------
# Cell 4: Create Cell Performance Geographic Data (UPDATED)
# ------------------------------------------------------------
print("\n🗺️ CREATING GEOGRAPHIC CELL PERFORMANCE DATA")
print("-" * 60)

# 1️⃣ Cell performance metrics
cell_performance = spark.sql("""
SELECT 
    c.CallingCellID,
    COUNT(*)                                        AS total_hours,
    SUM(c.total_calls)                              AS cell_total_calls,
    ROUND(AVG(c.unique_users), 2)                   AS avg_users_per_hour,
    ROUND(AVG(c.failure_rate), 2)                   AS avg_failure_rate,
    ROUND(SUM(c.total_revenue), 2)                  AS cell_revenue,
    MAX(c.total_calls)                              AS peak_hour_calls,
    -- New Year's specific metrics via viz_master_timeseries
    SUM(CASE WHEN v.celebration_phase = 'Celebration' THEN c.total_calls ELSE 0 END) AS nye_calls,
    MAX(CASE WHEN v.celebration_phase = 'Celebration' THEN c.total_calls ELSE 0 END) AS nye_peak
FROM cdr_hourly_aggregated c
JOIN viz_master_timeseries          v ON c.hour_key = v.hour_key
WHERE c.CallingCellID IS NOT NULL
GROUP BY c.CallingCellID
""")
cell_performance.write.mode("overwrite").saveAsTable("viz_cell_performance")
print("✅ Created cell performance table: viz_cell_performance")

# 2️⃣ Cell stress analysis
cell_stress = spark.sql("""
SELECT 
    c.CallingCellID,
    c.call_hour,
    c.total_calls,
    c.failure_rate,
    CASE 
        WHEN c.failure_rate > 30 THEN 'Critical'
        WHEN c.failure_rate > 20 THEN 'High'
        WHEN c.failure_rate > 10 THEN 'Medium'
        ELSE 'Low'
    END                                             AS stress_level,
    v.celebration_phase
FROM cdr_hourly_aggregated c
JOIN viz_master_timeseries          v ON c.hour_key = v.hour_key
WHERE c.CallingCellID IS NOT NULL
  AND v.celebration_phase = 'Celebration'
ORDER BY c.failure_rate DESC
""")
cell_stress.write.mode("overwrite").saveAsTable("viz_cell_stress")
print("✅ Created cell stress analysis table: viz_cell_stress")



🗺️ CREATING GEOGRAPHIC CELL PERFORMANCE DATA
------------------------------------------------------------


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `c`.`CallingCellID` cannot be resolved. Did you mean one of the following? [`c`.`active_cells`, `v`.`active_cells`, `c`.`failed_calls`, `c`.`paid_calls`, `c`.`roaming_calls`].; line 15 pos 6;
'Aggregate ['c.CallingCellID], ['c.CallingCellID, count(1) AS total_hours#1777L, 'SUM('c.total_calls) AS cell_total_calls#1778, 'ROUND('AVG('c.unique_users), 2) AS avg_users_per_hour#1779, 'ROUND('AVG('c.failure_rate), 2) AS avg_failure_rate#1780, 'ROUND('SUM('c.total_revenue), 2) AS cell_revenue#1781, 'MAX('c.total_calls) AS peak_hour_calls#1782, 'SUM(CASE WHEN ('v.celebration_phase = Celebration) THEN 'c.total_calls ELSE 0 END) AS nye_calls#1783, 'MAX(CASE WHEN ('v.celebration_phase = Celebration) THEN 'c.total_calls ELSE 0 END) AS nye_peak#1784]
+- 'Filter isnotnull('c.CallingCellID)
   +- Join Inner, (hour_key#760 = hour_key#439)
      :- SubqueryAlias c
      :  +- SubqueryAlias spark_catalog.algerie_telecom_cdr.cdr_hourly_aggregated
      :     +- Relation spark_catalog.algerie_telecom_cdr.cdr_hourly_aggregated[CDR_DAY#758,call_hour#759,hour_key#760,total_calls#761L,unique_users#762L,active_cells#763L,unique_sessions#764L,successful_calls#765L,failed_calls#766L,total_duration_seconds#767,avg_duration#768,stddev_duration#769,min_duration#770,max_duration#771,median_duration#772,p95_duration#773,short_calls_30s#774L,medium_calls_2min#775L,normal_calls_5min#776L,long_calls_over5min#777L,total_revenue#778,avg_revenue_per_call#779,paid_calls#780L,free_calls#781L,... 16 more fields] parquet
      +- SubqueryAlias v
         +- SubqueryAlias spark_catalog.algerie_telecom_cdr.viz_master_timeseries
            +- Relation spark_catalog.algerie_telecom_cdr.viz_master_timeseries[hour_key#439,CDR_DAY#440,call_hour#441,timestamp#442,hour_of_week#443,celebration_phase#444,total_calls#445L,unique_users#446L,successful_calls#447L,failed_calls#448L,active_cells#449L,success_rate#450,failure_rate#451,avg_duration#452,median_duration#453,total_revenue#454,hourly_arpu#455,paid_calls#456L,free_calls#457L,paid_call_ratio#458,voice_calls#459L,sms_count#460L,data_sessions#461L,hour_over_hour_growth#462,... 7 more fields] parquet


In [16]:
# ------------------------------------------------------------
# Cell 3.5: Build hourly-by-cell view from raw CDR data
# ------------------------------------------------------------
from pyspark.sql import functions as F

raw = spark.table("cdr_anonymized")

cell_hourly = (
    raw
    # 1️⃣ Extraire un timestamp et en dériver jour/heure
    .withColumn("ts", F.to_timestamp("CUST_LOCAL_START_DATE"))
    .withColumn("CDR_DAY", F.to_date("ts"))
    .withColumn("call_hour", F.hour("ts"))
    # 2️⃣ Construire hour_key comme dans ton master_timeseries
    .withColumn(
        "hour_key",
        F.concat_ws(
            "_",
            F.date_format("ts", "yyyy-MM-dd"),
            F.lpad(F.col("call_hour").cast("string"), 2, "0")
        )
    )
    # 3️⃣ Agréger par cellule & heure
    #    → remplacer OBJ_ID par la colonne qui contient ton CellID
    .groupBy("OBJ_ID", "hour_key", "CDR_DAY", "call_hour")
    .agg(
        F.count("*").alias("total_calls"),
        F.countDistinct("SESSION_ID").alias("unique_users"),      # ex. SESSION_ID comme proxy user
        (F.sum(F.expr("CASE WHEN CallForwardIndicator>0 THEN 1 ELSE 0 END")) 
             / F.count("*")
        ).alias("failure_rate"),                                  # derive failure_flag
        F.sum("DEBIT_AMOUNT").alias("total_revenue")               # DEBIT_AMOUNT pour revenue
    )
    .withColumnRenamed("OBJ_ID", "CallingCellID")
)

cell_hourly.createOrReplaceTempView("cdr_hourly_by_cell")
print("✅ Created temp view: cdr_hourly_by_cell")


✅ Created temp view: cdr_hourly_by_cell


In [18]:
print("\n🗺️ CREATING GEOGRAPHIC CELL PERFORMANCE DATA")
print("-" * 60)

# 1️⃣ Cell performance metrics
cell_performance = spark.sql("""
SELECT 
    c.CallingCellID,
    COUNT(*)                            AS total_hours,
    SUM(c.total_calls)                  AS cell_total_calls,
    ROUND(AVG(c.unique_users), 2)       AS avg_users_per_hour,
    ROUND(AVG(c.failure_rate), 2)       AS avg_failure_rate,
    ROUND(SUM(c.total_revenue), 2)      AS cell_revenue,
    MAX(c.total_calls)                  AS peak_hour_calls,
    SUM(
      CASE WHEN v.celebration_phase = 'Celebration' 
           THEN c.total_calls 
           ELSE 0 
      END
    ) AS nye_calls,
    MAX(
      CASE WHEN v.celebration_phase = 'Celebration' 
           THEN c.total_calls 
           ELSE 0 
      END
    ) AS nye_peak
FROM cdr_hourly_by_cell c
JOIN viz_master_timeseries     v 
  ON c.hour_key = v.hour_key
GROUP BY c.CallingCellID
""")
cell_performance.write.mode("overwrite").saveAsTable("viz_cell_performance")
print("✅ Created cell performance table: viz_cell_performance")

# 2️⃣ Cell stress analysis
cell_stress = spark.sql("""
SELECT 
    c.CallingCellID,
    c.call_hour,
    c.total_calls,
    c.failure_rate,
    CASE 
      WHEN c.failure_rate > 30 THEN 'Critical'
      WHEN c.failure_rate > 20 THEN 'High'
      WHEN c.failure_rate > 10 THEN 'Medium'
      ELSE 'Low'
    END                           AS stress_level,
    v.celebration_phase
FROM cdr_hourly_by_cell c
JOIN viz_master_timeseries     v 
  ON c.hour_key = v.hour_key
WHERE v.celebration_phase = 'Celebration'
ORDER BY c.failure_rate DESC
""")
cell_stress.write.mode("overwrite").saveAsTable("viz_cell_stress")
print("✅ Created cell stress analysis table: viz_cell_stress")



🗺️ CREATING GEOGRAPHIC CELL PERFORMANCE DATA
------------------------------------------------------------
✅ Created cell performance table: viz_cell_performance
✅ Created cell stress analysis table: viz_cell_stress


In [20]:
# ------------------------------------------------------------
# Cell 5: Create User Behavior Visualization Data (FIXED)
# ------------------------------------------------------------
print("\n👥 CREATING USER BEHAVIOR VISUALIZATION DATA")
print("-" * 60)

# 1️⃣ Part de cdr_user_time_patterns pour calculer la répartition des appels par créneau
user_segments = spark.sql("""
SELECT usage_segment, segment_calls
FROM (
  SELECT 'Early Morning'  AS usage_segment, SUM(`Early Morning`)  AS segment_calls FROM cdr_user_time_patterns
  UNION ALL
  SELECT 'Morning'        AS usage_segment, SUM(Morning)        AS segment_calls FROM cdr_user_time_patterns
  UNION ALL
  SELECT 'Afternoon'      AS usage_segment, SUM(Afternoon)      AS segment_calls FROM cdr_user_time_patterns
  UNION ALL
  SELECT 'Evening'        AS usage_segment, SUM(Evening)        AS segment_calls FROM cdr_user_time_patterns
  UNION ALL
  SELECT 'Late Night'     AS usage_segment, SUM(`Late Night`)   AS segment_calls FROM cdr_user_time_patterns
) t
""")
user_segments.write.mode("overwrite").saveAsTable("viz_user_segments")
print("✅ Created user segments table: viz_user_segments")

# 2️⃣ Hourly user activity patterns (depuis viz_master_timeseries)
user_activity_patterns = spark.sql("""
SELECT 
    call_hour,
    SUM(unique_users)                                AS active_users,
    SUM(total_calls)                                 AS total_calls,
    ROUND(SUM(total_calls) / NULLIF(SUM(unique_users),0), 2) AS calls_per_user,
    SUM(CASE WHEN total_calls <= 30 THEN 1 ELSE 0 END)       AS short_call_hours,
    SUM(CASE WHEN total_calls > 300 THEN 1 ELSE 0 END)       AS long_call_hours
FROM viz_master_timeseries
GROUP BY call_hour
ORDER BY call_hour
""")
user_activity_patterns.write.mode("overwrite").saveAsTable("viz_user_activity")
print("✅ Created user activity patterns table: viz_user_activity")



👥 CREATING USER BEHAVIOR VISUALIZATION DATA
------------------------------------------------------------
✅ Created user segments table: viz_user_segments
✅ Created user activity patterns table: viz_user_activity


In [23]:
# ------------------------------------------------------------
# Cell 6: Create Superset Dashboard Configurations
# ------------------------------------------------------------
print("\n🎨 CREATING SUPERSET DASHBOARD CONFIGURATIONS")
print("-" * 60)

# Dashboard 1: Executive Overview
executive_dashboard = {
    "title": "CDR New Year's Eve Executive Dashboard",
    "charts": [
        {
            "type": "big_number",
            "title": "Total Calls",
            "query": "SELECT SUM(total_calls) FROM viz_overall_kpis"
        },
        {
            "type": "big_number_with_trendline",
            "title": "Peak Hour Calls",
            "query": "SELECT MAX(total_calls) as value, hour_key FROM viz_master_timeseries GROUP BY hour_key ORDER BY value DESC LIMIT 1"
        },
        {
            "type": "line_chart",
            "title": "Hourly Call Volume",
            "query": "SELECT timestamp, total_calls, celebration_phase FROM viz_master_timeseries ORDER BY timestamp",
            "x_axis": "timestamp",
            "y_axis": "total_calls",
            "color_by": "celebration_phase"
        },
        {
            "type": "bar_chart",
            "title": "Revenue by Celebration Phase",
            "query": "SELECT celebration_phase, phase_revenue FROM viz_phase_kpis",
            "x_axis": "celebration_phase",
            "y_axis": "phase_revenue"
        }
    ]
}

# Dashboard 2: Network Operations
network_dashboard = {
    "title": "Network Operations Dashboard",
    "charts": [
        {
            "type": "heatmap",
            "title": "Network Stress Heatmap",
            "query": "SELECT call_hour, CDR_DAY, network_stress_score FROM viz_master_timeseries",
            "x_axis": "call_hour",
            "y_axis": "CDR_DAY",
            "metric": "network_stress_score"
        },
        {
            "type": "scatter_plot",
            "title": "Calls vs Failure Rate",
            "query": "SELECT total_calls, failure_rate, anomaly_type, hour_key, unique_users FROM viz_master_timeseries",
            "x_axis": "total_calls",
            "y_axis": "failure_rate",
            "color_by": "anomaly_type",
            "bubble_size": "unique_users"
        },
        {
            "type": "gauge_chart",
            "title": "Average Network Stress",
            "query": "SELECT AVG(network_stress_score) as stress FROM viz_master_timeseries",
            "max_value": 100,
            "thresholds": [40, 60, 80]
        }
    ]
}

# Dashboard 3: User Analytics
user_dashboard = {
    "title": "User Behavior Analytics",
    "charts": [
        {
            "type": "pie_chart",
            "title": "User Segments",
            "query": "SELECT usage_segment, segment_calls AS user_count FROM viz_user_segments"
        },
        {
            "type": "area_chart",
            "title": "User Activity by Hour",
            "query": "SELECT call_hour, active_users, calls_per_user FROM viz_user_activity ORDER BY call_hour",
            "x_axis": "call_hour",
            "metrics": ["active_users", "calls_per_user"]
        }
    ]
}

# Save configurations as JSON
configs = {
    "executive": executive_dashboard,
    "network": network_dashboard,
    "user": user_dashboard
}

# Create a configurations table
config_df = spark.createDataFrame([
    ("executive_dashboard", json.dumps(executive_dashboard)),
    ("network_dashboard", json.dumps(network_dashboard)),
    ("user_dashboard", json.dumps(user_dashboard))
], ["dashboard_name", "config_json"])

config_df.write.mode("overwrite").saveAsTable("viz_dashboard_configs")
print("✅ Created dashboard configurations")



🎨 CREATING SUPERSET DASHBOARD CONFIGURATIONS
------------------------------------------------------------


                                                                                

✅ Created dashboard configurations


In [26]:
# ------------------------------------------------------------
# Cell 7: Create PowerBI-Optimized Tables
# ------------------------------------------------------------
print("\n📊 CREATING POWERBI-OPTIMIZED TABLES")
print("-" * 60)

# 1️⃣ Flatten the master dataset for PowerBI
powerbi_flat = spark.sql("""
SELECT 
    timestamp,
    CDR_DAY         AS Date,
    call_hour       AS Hour,
    CASE 
      WHEN call_hour BETWEEN 0 AND 5  THEN '00-05 Night'
      WHEN call_hour BETWEEN 6 AND 11 THEN '06-11 Morning'
      WHEN call_hour BETWEEN 12 AND 17 THEN '12-17 Afternoon'
      ELSE '18-23 Evening'
    END AS TimePeriod,
    celebration_phase     AS CelebrationPhase,
    total_calls           AS TotalCalls,
    successful_calls      AS SuccessfulCalls,
    failed_calls          AS FailedCalls,
    unique_users          AS UniqueUsers,
    total_revenue         AS Revenue,
    success_rate          AS SuccessRate,
    failure_rate          AS FailureRate,
    hourly_arpu           AS ARPU,
    hour_over_hour_growth AS GrowthRate,
    network_stress_level  AS NetworkStress,
    anomaly_type          AS AnomalyType
    -- on SUPPRIME trend_direction car absent de viz_master_timeseries
FROM viz_master_timeseries
""")
powerbi_flat.write.mode("overwrite").saveAsTable("powerbi_timeseries")
print("✅ Created PowerBI time series table")

# 2️⃣ Fact table
fact_calls = spark.sql("""
SELECT 
    hour_key,
    total_calls,
    successful_calls,
    failed_calls,
    unique_users,
    total_revenue,
    voice_calls,
    sms_count,
    data_sessions
FROM cdr_hourly_aggregated
""")
fact_calls.write.mode("overwrite").saveAsTable("powerbi_fact_calls")
print("✅ Created PowerBI fact table")

# 3️⃣ Time dimension
powerbi_dim_time = spark.sql("""
SELECT DISTINCT
    h.hour_key,
    h.CDR_DAY,
    h.call_hour,
    CASE 
      WHEN h.call_hour BETWEEN 0 AND 5  THEN 'Night'
      WHEN h.call_hour BETWEEN 6 AND 11 THEN 'Morning'
      WHEN h.call_hour BETWEEN 12 AND 17 THEN 'Afternoon'
      ELSE 'Evening'
    END AS period_of_day,
    CASE WHEN h.is_celebration_hour = 1 THEN 'Celebration' ELSE 'Normal' END AS CelebrationPhase
FROM cdr_hourly_features h
""")
powerbi_dim_time.write.mode("overwrite").saveAsTable("powerbi_dim_time")
print("✅ Created PowerBI time dimension")



📊 CREATING POWERBI-OPTIMIZED TABLES
------------------------------------------------------------
✅ Created PowerBI time series table
✅ Created PowerBI fact table
✅ Created PowerBI time dimension


In [27]:
# ------------------------------------------------------------
# Cell 8: Create Visualization SQL Queries
# ------------------------------------------------------------
print("\n📝 VISUALIZATION SQL QUERIES")
print("-" * 60)

# Store useful queries for dashboard creation
visualization_queries = {
    "midnight_spike": """
-- Minute-by-minute analysis around midnight
SELECT 
    timestamp,
    calls_per_minute,
    unique_callers,
    failure_rate
FROM cdr_minute_aggregated
WHERE timestamp BETWEEN '2024-12-31 23:00:00' AND '2025-01-01 01:00:00'
ORDER BY timestamp
""",
    
    "hourly_comparison": """
-- Compare same hours across both days
SELECT 
    call_hour,
    SUM(CASE WHEN CDR_DAY = '2024-12-31' THEN total_calls ELSE 0 END) as dec31_calls,
    SUM(CASE WHEN CDR_DAY = '2025-01-01' THEN total_calls ELSE 0 END) as jan01_calls,
    ROUND(
        (SUM(CASE WHEN CDR_DAY = '2025-01-01' THEN total_calls ELSE 0 END) - 
         SUM(CASE WHEN CDR_DAY = '2024-12-31' THEN total_calls ELSE 0 END)) * 100.0 / 
         NULLIF(SUM(CASE WHEN CDR_DAY = '2024-12-31' THEN total_calls ELSE 0 END), 0), 2
    ) as growth_percentage
FROM viz_master_timeseries
GROUP BY call_hour
ORDER BY call_hour
""",
    
    "anomaly_timeline": """
-- Anomalies with context
SELECT 
    timestamp,
    total_calls,
    anomaly_type,
    anomaly_severity,
    celebration_phase,
    network_stress_level
FROM viz_master_timeseries
WHERE anomaly_severity > 0
ORDER BY timestamp
""",
    
    "revenue_waterfall": """
-- Revenue breakdown by phase
WITH phase_revenue AS (
    SELECT 
        celebration_phase,
        SUM(total_revenue) as revenue,
        ROW_NUMBER() OVER (ORDER BY 
            CASE celebration_phase
                WHEN 'Pre-Celebration' THEN 1
                WHEN 'Late NYE' THEN 2
                WHEN 'Early NY' THEN 3
                WHEN 'Post-Celebration' THEN 4
                WHEN 'New Year Day' THEN 5
            END
        ) as phase_order
    FROM viz_master_timeseries
    WHERE celebration_phase IS NOT NULL
    GROUP BY celebration_phase
)
SELECT 
    celebration_phase,
    revenue,
    SUM(revenue) OVER (ORDER BY phase_order) as cumulative_revenue
FROM phase_revenue
ORDER BY phase_order
""",
    
    "cell_performance_map": """
-- For geographic visualization
SELECT 
    CallingCellID,
    cell_total_calls,
    avg_failure_rate,
    cell_revenue,
    nye_peak,
    CASE 
        WHEN avg_failure_rate > 20 THEN 'Poor'
        WHEN avg_failure_rate > 10 THEN 'Fair'
        ELSE 'Good'
    END as performance_category
FROM viz_cell_performance
WHERE CallingCellID IS NOT NULL
"""
}

# Save queries for reference
queries_df = spark.createDataFrame(
    [(k, v) for k, v in visualization_queries.items()],
    ["query_name", "query_sql"]
)
queries_df.write.mode("overwrite").saveAsTable("viz_query_library")
print("✅ Created visualization query library")

# Print queries for easy copying
for name, query in visualization_queries.items():
    print(f"\n--- {name.upper()} ---")
    print(query)


📝 VISUALIZATION SQL QUERIES
------------------------------------------------------------
✅ Created visualization query library

--- MIDNIGHT_SPIKE ---

-- Minute-by-minute analysis around midnight
SELECT 
    timestamp,
    calls_per_minute,
    unique_callers,
    failure_rate
FROM cdr_minute_aggregated
WHERE timestamp BETWEEN '2024-12-31 23:00:00' AND '2025-01-01 01:00:00'
ORDER BY timestamp


--- HOURLY_COMPARISON ---

-- Compare same hours across both days
SELECT 
    call_hour,
    SUM(CASE WHEN CDR_DAY = '2024-12-31' THEN total_calls ELSE 0 END) as dec31_calls,
    SUM(CASE WHEN CDR_DAY = '2025-01-01' THEN total_calls ELSE 0 END) as jan01_calls,
    ROUND(
        (SUM(CASE WHEN CDR_DAY = '2025-01-01' THEN total_calls ELSE 0 END) - 
         SUM(CASE WHEN CDR_DAY = '2024-12-31' THEN total_calls ELSE 0 END)) * 100.0 / 
         NULLIF(SUM(CASE WHEN CDR_DAY = '2024-12-31' THEN total_calls ELSE 0 END), 0), 2
    ) as growth_percentage
FROM viz_master_timeseries
GROUP BY call_hour
O

In [28]:
# ------------------------------------------------------------
# Cell 9: Create Alert Configurations
# ------------------------------------------------------------
print("\n🚨 CREATING ALERT CONFIGURATIONS")
print("-" * 60)

# Define alert thresholds based on observed patterns
alert_configs = spark.sql("""
SELECT 
    'High Call Volume' as alert_name,
    'total_calls > 5000' as condition,
    'warning' as severity,
    'Hourly calls exceed 5000' as description
UNION ALL
SELECT 
    'Critical Failure Rate' as alert_name,
    'failure_rate > 25' as condition,
    'critical' as severity,
    'Failure rate exceeds 25%' as description
UNION ALL
SELECT 
    'Network Stress Critical' as alert_name,
    'network_stress_score > 70' as condition,
    'critical' as severity,
    'Network stress score above 70' as description
UNION ALL
SELECT 
    'Major Anomaly Detected' as alert_name,
    'anomaly_severity >= 2' as condition,
    'warning' as severity,
    'Major or critical anomaly detected' as description
UNION ALL
SELECT 
    'Revenue Spike' as alert_name,
    'hour_over_hour_growth > 200' as condition,
    'info' as severity,
    'Hour-over-hour growth exceeds 200%' as description
""")

alert_configs.write.mode("overwrite").saveAsTable("viz_alert_configs")
print("✅ Created alert configurations")

# Current alerts based on data
current_alerts = spark.sql("""
SELECT 
    hour_key,
    timestamp,
    CASE 
        WHEN total_calls > 5000 THEN 'High Call Volume'
        WHEN failure_rate > 25 THEN 'Critical Failure Rate'
        WHEN network_stress_score > 70 THEN 'Network Stress Critical'
        WHEN anomaly_severity >= 2 THEN 'Major Anomaly Detected'
        ELSE NULL
    END as alert_triggered,
    total_calls,
    failure_rate,
    network_stress_score
FROM viz_master_timeseries
WHERE total_calls > 5000 
   OR failure_rate > 25 
   OR network_stress_score > 70 
   OR anomaly_severity >= 2
ORDER BY timestamp
""")

print("\n🚨 Active Alerts Summary:")
current_alerts.groupBy("alert_triggered").count().show()


🚨 CREATING ALERT CONFIGURATIONS
------------------------------------------------------------
✅ Created alert configurations

🚨 Active Alerts Summary:
+--------------------+-----+
|     alert_triggered|count|
+--------------------+-----+
|    High Call Volume|    7|
|Major Anomaly Det...|    1|
+--------------------+-----+



In [29]:
# ------------------------------------------------------------
# Cell 10: Final Dashboard Summary and Instructions
# ------------------------------------------------------------
print("\n" + "=" * 80)
print("📊 VISUALIZATION SETUP COMPLETE")
print("=" * 80)

print("\n✅ TABLES CREATED FOR VISUALIZATION:")
print("   1. viz_master_timeseries - Complete time series data")
print("   2. viz_overall_kpis - High-level KPIs")
print("   3. viz_daily_kpis - Daily comparison metrics")
print("   4. viz_phase_kpis - Celebration phase analysis")
print("   5. viz_cell_performance - Geographic cell data")
print("   6. viz_user_segments - User segmentation")
print("   7. powerbi_* tables - PowerBI optimized datasets")

print("\n🎨 RECOMMENDED DASHBOARDS:")
print("\n1. EXECUTIVE DASHBOARD:")
print("   - KPI cards: Total calls, Peak hour, Success rate, Revenue")
print("   - Line chart: Hourly call volume with celebration phases")
print("   - Bar chart: Revenue by phase")
print("   - Gauge: Network stress level")

print("\n2. OPERATIONS DASHBOARD:")
print("   - Heatmap: Network stress by hour")
print("   - Scatter plot: Calls vs failure rate (colored by anomaly)")
print("   - Time series: Minute-by-minute midnight analysis")
print("   - Table: Active alerts")

print("\n3. USER ANALYTICS DASHBOARD:")
print("   - Pie chart: User segments")
print("   - Area chart: Active users by hour")
print("   - Bar chart: Call duration distribution")
print("   - Line chart: ARPU trends")

print("\n📝 SUPERSET SETUP INSTRUCTIONS:")
print("1. Import the datasets from Hive")
print("2. Create calculated fields for growth rates")
print("3. Set up color schemes (green→yellow→red for stress)")
print("4. Configure auto-refresh for real-time monitoring")
print("5. Add filters for date and celebration phase")

print("\n📝 POWERBI SETUP INSTRUCTIONS:")
print("1. Connect to Hive using ODBC connector")
print("2. Import powerbi_* tables")
print("3. Create relationships: fact_calls → dim_time")
print("4. Add DAX measures for YoY comparisons")
print("5. Use conditional formatting for anomalies")

print("\n🎯 KEY INSIGHTS TO HIGHLIGHT:")
print("   • 11.5x traffic surge on New Year's Day")
print("   • Midnight spike pattern (exact timing)")
print("   • Network handled load with <15% failure rate")
print("   • Revenue opportunity in free calls")
print("   • Cell-specific stress patterns")

print(f"\n✅ Visualization setup completed at: {datetime.now()}")
print("🚀 Ready to create stunning dashboards!")

# Stop Spark
spark.stop()


📊 VISUALIZATION SETUP COMPLETE

✅ TABLES CREATED FOR VISUALIZATION:
   1. viz_master_timeseries - Complete time series data
   2. viz_overall_kpis - High-level KPIs
   3. viz_daily_kpis - Daily comparison metrics
   4. viz_phase_kpis - Celebration phase analysis
   5. viz_cell_performance - Geographic cell data
   6. viz_user_segments - User segmentation
   7. powerbi_* tables - PowerBI optimized datasets

🎨 RECOMMENDED DASHBOARDS:

1. EXECUTIVE DASHBOARD:
   - KPI cards: Total calls, Peak hour, Success rate, Revenue
   - Line chart: Hourly call volume with celebration phases
   - Bar chart: Revenue by phase
   - Gauge: Network stress level

2. OPERATIONS DASHBOARD:
   - Heatmap: Network stress by hour
   - Scatter plot: Calls vs failure rate (colored by anomaly)
   - Time series: Minute-by-minute midnight analysis
   - Table: Active alerts

3. USER ANALYTICS DASHBOARD:
   - Pie chart: User segments
   - Area chart: Active users by hour
   - Bar chart: Call duration distribution
   - 

In [13]:
import os
from pyspark.sql import SparkSession

spark = init_spark("CDR Viz Exports to BI")
spark.sql("USE algerie_telecom_cdr")

tables_to_export = [
    "viz_master_timeseries",
    "viz_overall_kpis",
    "viz_daily_kpis",
    "viz_phase_kpis",
    "viz_cell_performance",
    "viz_user_segments",
    "viz_user_activity",
    "powerbi_timeseries",
    "powerbi_fact_calls",
    "powerbi_dim_time",
    "viz_dashboard_configs",
    "viz_query_library",
    "viz_alert_configs",
]

# on construit le chemin dans ton home
home = os.environ["HOME"]  # /home/jovyan
export_root = os.path.join(home, "work", "dashboards", "exports")
os.makedirs(export_root, exist_ok=True)

for tbl in tables_to_export:
    out_dir = os.path.join(export_root, tbl)
    os.makedirs(out_dir, exist_ok=True)
    print(f"▶ Export de {tbl} → {out_dir}")
    spark.table(tbl) \
         .coalesce(1) \
         .write \
         .mode("overwrite") \
         .option("header", "true") \
         .csv(out_dir)

print("✅ Export terminé. Maintenant fais un Refresh dans le file-browser sur ~/work/dashboards/exports")


✅ SparkSession initialized (App: CDR Viz Exports to BI, Spark: 3.5.1)
✅ Hive Warehouse: hdfs://namenode:9000/user/hive/warehouse
✅ Hive Metastore URI: thrift://hive-metastore:9083
▶ Export de viz_master_timeseries → /home/jovyan/work/dashboards/exports/viz_master_timeseries
▶ Export de viz_overall_kpis → /home/jovyan/work/dashboards/exports/viz_overall_kpis
▶ Export de viz_daily_kpis → /home/jovyan/work/dashboards/exports/viz_daily_kpis
▶ Export de viz_phase_kpis → /home/jovyan/work/dashboards/exports/viz_phase_kpis
▶ Export de viz_cell_performance → /home/jovyan/work/dashboards/exports/viz_cell_performance
▶ Export de viz_user_segments → /home/jovyan/work/dashboards/exports/viz_user_segments
▶ Export de viz_user_activity → /home/jovyan/work/dashboards/exports/viz_user_activity
▶ Export de powerbi_timeseries → /home/jovyan/work/dashboards/exports/powerbi_timeseries
▶ Export de powerbi_fact_calls → /home/jovyan/work/dashboards/exports/powerbi_fact_calls
▶ Export de powerbi_dim_time → /h

In [14]:
import shutil
shutil.make_archive(
    os.path.join(home, "work", "dashboards", "exports"), 
    "zip", 
    os.path.join(home, "work", "dashboards", "exports")
)
print("✅ exports.zip créé sous ~/work/dashboards/exports.zip")


✅ exports.zip créé sous ~/work/dashboards/exports.zip


In [9]:
import os
print("cwd:", os.getcwd())
print("listing root:", os.listdir("/"))
print("listing cwd:", os.listdir(os.getcwd()))


cwd: /home/jovyan/work/work/spark-apps
listing root: ['dev', 'etc', 'libx32', 'tmp', 'lib', 'boot', 'media', 'srv', 'sys', 'bin', 'lib32', 'opt', 'home', 'proc', 'root', 'sbin', 'var', 'lib64', 'mnt', 'run', 'usr']
listing cwd: ['.ipynb_checkpoints', '01.Data_quality.ipynb', '02.Hive_Feature_Engineering_Real_CDR.ipynb', '03.Network-Trend-Analysis.ipynb', '04.Data_Viz.ipynb', 'anonymize_cdr.py', 'Anon_CDR_EDA_Quality.ipynb', 'CDR_Analysis.ipynb', 'CDR_Schema_Load', 'Delta_Lake_Tables.ipynb', 'derby.log', 'EDA_generate_CDR_AT.ipynb', 'exports.zip', 'Feature_Engineering.ipynb', 'Hive_Tables.ipynb', 'tmp']


In [18]:
!pwd


/home/jovyan/work/work/spark-apps


In [19]:
!ls -l


total 4048
-rwxrwxrwx 1 root root  560136 Jul  2 00:59 01.Data_quality.ipynb
-rwxrwxrwx 1 root root   94651 Jul  2 01:07 02.Hive_Feature_Engineering_Real_CDR.ipynb
-rwxrwxrwx 1 root root   60766 Jun 29 05:26 03.Network-Trend-Analysis.ipynb
-rwxrwxrwx 1 root root   85788 Jul  2 04:30 04.Data_Viz.ipynb
-rwxrwxrwx 1 root root   80672 Jun 29 04:14 Anon_CDR_EDA_Quality.ipynb
-rwxrwxrwx 1 root root    1621 Jun 16 21:05 anonymize_cdr.py
-rwxrwxrwx 1 root root  131489 Jun 16 21:39 CDR_Analysis.ipynb
drwxrwxrwx 1 root root    4096 Jun 16 12:10 CDR_Schema_Load
-rwxrwxrwx 1 root root   31103 Jun  2 12:49 Delta_Lake_Tables.ipynb
-rwxrwxrwx 1 root root 2613096 May  7 18:54 derby.log
-rwxrwxrwx 1 root root   11454 Jun 25 00:50 EDA_generate_CDR_AT.ipynb
-rwxrwxrwx 1 root root      22 Jul  2 01:49 exports.zip
-rwxrwxrwx 1 root root  102069 Jun 22 15:21 Feature_Engineering.ipynb
-rwxrwxrwx 1 root root  356166 Jun 25 20:55 Hive_Tables.ipynb
drwxrwxrwx 1 root root    4096 Jul  2 01:30 tmp


In [20]:
!ls -l tmp
!ls -l tmp/exports


total 0
drwxrwxrwx 1 root root 4096 Jul  2 01:30 exports
total 0


In [21]:
import os

base = os.getcwd()
export_root = os.path.join(base, "tmp", "exports")
print("Je regarde :", export_root)
for root, dirs, files in os.walk(export_root):
    print(root, "→", len(dirs), "sous-dossiers ;", len(files), "fichiers")
    for d in dirs:
        print("  └─", d)
    for f in files:
        print("     •", f)


Je regarde : /home/jovyan/work/work/spark-apps/tmp/exports
/home/jovyan/work/work/spark-apps/tmp/exports → 0 sous-dossiers ; 0 fichiers


In [22]:
!hdfs dfs -ls /tmp/exports

/bin/bash: line 1: hdfs: command not found


In [24]:
%%bash
# crée un dossier local
mkdir -p exports_hdfs

# copie tout le répertoire HDFS exports en local
hdfs dfs -copyToLocal /home/jovyan/work/dashboards/exports exports_hdfs


bash: line 5: hdfs: command not found


CalledProcessError: Command 'b'# cr\xc3\xa9e un dossier local\nmkdir -p exports_hdfs\n\n# copie tout le r\xc3\xa9pertoire HDFS exports en local\nhdfs dfs -copyToLocal /home/jovyan/work/dashboards/exports exports_hdfs\n'' returned non-zero exit status 127.

In [25]:
!mkdir -p exports_hdfs
!hdfs dfs -copyToLocal /home/jovyan/work/dashboards/exports exports_hdfs


/bin/bash: line 1: hdfs: command not found


In [26]:
!ls -l ~/local_exports


ls: cannot access '/home/jovyan/local_exports': No such file or directory


In [27]:
!ls -R ~/local_exports


ls: cannot access '/home/jovyan/local_exports': No such file or directory


In [28]:
print(
    spark.sparkContext._jsc
                 .hadoopConfiguration()
                 .get("fs.defaultFS")
)


hdfs://namenode:9000


In [29]:
fs_default = spark.sparkContext._jsc \
                .hadoopConfiguration() \
                .get("fs.defaultFS")
print(fs_default)   # e.g. hdfs://namenode:9000


hdfs://namenode:9000
