# **Configuration**

In [1]:
import numpy
import pandas
import pyproj
import shapely
import sklearn

In [2]:
print(numpy.__version__)
print(pandas.__version__)
print(pyproj.__version__)
print(shapely.__version__)
print(sklearn.__version__)

1.26.4
1.5.3
3.7.0
2.0.6
1.5.2


In [3]:
# Import libraries
import h3
import h3_pyspark
import pandas as pd
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, lead
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, TimestampType, StringType, DateType
from datetime import datetime, timedelta



pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

In [4]:
import math
import os
from functools import reduce
from math import asin, atan2, cos, degrees, floor, radians, sin, sqrt

import geopandas as gpd
import geopy
import h3
import numpy as np
import pandas as pd
import pyspark
import sedona
# import rasterio
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window

from shapely.wkt import loads

print("pandas : ", pd.__version__)
print("geopandas : ", gpd.__version__)
print("pyspark : ", pyspark.__version__)
print("sedona : ", sedona.version)

pandas :  1.5.3
geopandas :  1.0.1
pyspark :  3.3.4
sedona :  1.2.1


In [5]:
os.environ["PYSPARK_PYTHON"] = "./env/bin/python"
os.environ["HADOOP_CONF_DIR"] = "/etc/spark3/conf.cloudera.spark3_on_yarn/yarn-conf"
os.environ["HADOOP_HOME"] = "/opt/cloudera/parcels/CDH/lib/hadoop"
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/SPARK3-3.3.2.3.3.7190.4-1-1.p0.51021169/lib/spark3"
os.environ["SPARK_CONF_DIR"] = "/etc/spark3/conf.cloudera.spark3_on_yarn"

conf = (
    SparkConf()
    .setMaster("yarn")
    .setAppName("data-cerdas-rerun")
    .set("spark.dynamicAllocation.maxExecutors", "50")
    .set("spark.dynamicAllocation.minExecutors", "1")
    .set("spark.executor.cores", "12")
    .set("spark.executor.memory", "64g")
    .set("spark.sql.shuffle.partitions", "7000")
    .set("spark.yarn.queue", "root.pnt.hui_pnt_bpsint")
    .set("spark.driver.maxResultSize","8g")
    .set(
        "spark.yarn.appMasterEnv.PYSPARK_PYTHON",
        "./env/bin/python",
    )
    .set(
        "spark.yarn.dist.archives", "hdfs://nsdiscovery/warehouse/tablespace/external/hive/pnt_bps_int.db/envs/mobility_310.tar.gz#env"
    )
)

In [6]:
sc = SparkContext.getOrCreate(conf=conf)

spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/24 11:35:31 WARN  util.Utils: [Thread-3]: Service 'sparkDriver' could not bind on port 30060. Attempting port 30061.
25/03/24 11:35:31 WARN  util.Utils: [Thread-3]: Service 'sparkDriver' could not bind on port 30061. Attempting port 30062.
25/03/24 11:35:32 WARN  util.Utils: [Thread-3]: Service 'SparkUI' could not bind on port 30072. Attempting port 30073.
25/03/24 11:35:32 WARN  util.Utils: [Thread-3]: Service 'SparkUI' could not bind on port 30073. Attempting port 30074.
25/03/24 11:35:35 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.metastore.runworker.in does not exist
25/03/24 11:35:35 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.masking.algo does not exist
25/03/24 11:35:37 WARN  util.Utils: [Thread-3]: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your c

In [7]:
spark

# UE Monthly

In [8]:


def prep_lbs(lbs,filter_month):
    lbs = lbs.filter(F.substring('calling_date',1,7)==filter_month)
    lbs = lbs\
            .withColumnRenamed('hash_msisdn', 'imsi')\
            .withColumnRenamed('encrypted_msisdn', 'imsi')\
            .withColumn('msisdn', f.col('imsi').cast(StringType()))
    
    lbs = lbs.withColumn('h3_10', latlon_h3(F.col('latitude').cast('double'),F.col('longitude').cast('double') , f.lit(10)))
    return lbs

def join_lbs_lau(lbs,lau):
    lbs = lbs.join(lau, ['h3_10'], 'left')
    lbs = lbs.withColumn('h3_8', h3_parent('h3_10', f.lit(8)))
    lbs = lbs.repartition('msisdn')
    return lbs

def haversine_distance(lon1, lat1, lon2, lat2):
    R = 6371.0  # Earth's radius in kilometers
    dlat = f.radians(lat2 - lat1)
    dlon = f.radians(lon2 - lon1)
    a = f.pow(f.sin(dlat/2), 2) + \
        f.cos(f.radians(lat1)) * f.cos(f.radians(lat2)) * f.pow(f.sin(dlon/2), 2)
    c = 2 * f.atan2(f.sqrt(a), f.sqrt(1-a))
    return f.round(R * c, 2)

def process_ue_monthly(lbs):
    part = Window.partitionBy('msisdn').orderBy(['datetime'])
    
    home_start  = 0
    home_end    = 5
    work_start  = 8
    work_end    = 16
    home_start2 = 20
    home_end2   = 23

    lbs_weekday_stop = lbs\
                        .withColumn('latitude', f.round(f.col('latitude').cast('double'), 5)) \
                        .withColumn('longitude', f.round(f.col('longitude').cast('double'), 5)) \
                        .dropDuplicates(['msisdn','datetime','longitude','latitude'])\
                        .withColumn('event_day', f.date_format(col('datetime'), 'E'))\
                        .withColumn('day_type', f.when((col('event_day')=='Sat') | (col('event_day')=='Sun'),'weekend').otherwise('weekday') )\
                        .filter(col('day_type') == 'weekday')\
                        .withColumn('prev_latitude', f.lag('latitude').over(part))\
                        .withColumn('prev_longitude', f.lag('longitude').over(part))\
                        .withColumn('prev_datetime', f.lag('datetime').over(part))\
                        .withColumn('time_seconds', f.unix_timestamp('datetime') - f.unix_timestamp('prev_datetime')) \
                        .withColumn('distance',  haversine_distance(col('prev_longitude'), col('prev_latitude'), col('longitude'), col('latitude')) )\
                        .withColumn('speed', f.round( col('distance') / (col('time_seconds') / 3600) ,2) )\
                        .filter(col('speed') <= 10.0)\
                        .withColumn('month', f.format_string("%04d-M%02d", f.year(col('datetime')), f.month(col('datetime'))) )\
                        .withColumn('date', f.to_date(col('datetime')) )\
                        .withColumn('hour', f.hour(col('datetime')) )\
                        .withColumn('hour_category', f.when( (col('hour') >= home_start) & (col('hour') <= home_end), 'home' )\
                                                      .when( (col('hour') >= work_start) & (col('hour') <= work_end), 'work' )\
                                                      .when( (col('hour') >= home_start2) & (col('hour') <= home_end2), 'home' )\
                                                      .otherwise('others') )\
                        .withColumn('activity_kab', f.col('kab'))\
                        .withColumn('activity_kec', f.col('kec'))
    
    part2 = Window.partitionBy(['msisdn', 'month', 'h3_8', 'activity_kec', 'hour_category'])

    part3 = Window.partitionBy(['msisdn', 'month', 'hour_category'])\
                  .orderBy([col('total_date').desc(), col('total_count').desc(), 'entropy'])
    
    ue = lbs_weekday_stop \
          .groupBy('msisdn', 'month', 'h3_8', 'activity_kec', 'hour_category', 'hour')\
          .agg(
                f.count('*').alias('N'),
                f.countDistinct('date').alias('N_date'),
              )\
          .filter(col('N_date') >= 5 )\
          .withColumn('total_date', f.sum('N_date').over(part2)) \
          .withColumn('total_count', f.sum('N').over(part2)) \
          .withColumn('prob', f.col('N') / f.col('total_count')) \
          .withColumn('entropy', f.round(-f.sum(f.col('prob') * f.log2(f.col('prob'))).over(part2),2) ) \
          .withColumn('rank', f.dense_rank().over(part3) )\
          .filter( (col('rank') == 1) & ~(col('hour_category') == 'others') )\
          .select(['msisdn', 'month', 'h3_8', 'activity_kec', 'entropy', 'hour_category'])\
          .distinct()
    
    ue_monthly = ue\
                  .groupBy('msisdn', 'month')\
                  .agg(
                        f.min(f.when(col('hour_category')=='home',col('h3_8'))).alias('home_h3_8'),
                        f.min(f.when(col('hour_category')=='home',col('activity_kec'))).alias('home_kec'),
                        f.min(f.when(col('hour_category')=='home',col('entropy'))).alias('home_entropy'),
                        f.min(f.when(col('hour_category')=='work',col('h3_8'))).alias('work_h3_8'),
                        f.min(f.when(col('hour_category')=='work',col('activity_kec'))).alias('work_kec'),
                        f.min(f.when(col('hour_category')=='work',col('entropy'))).alias('work_entropy')
                  )\
                  .withColumn('home_kab', col('home_kec').substr(0, 5))\
                  .withColumn('work_kab', col('work_kec').substr(0, 5))\
                  .withColumn('date', f.to_date(f.concat_ws('-', col('month').substr(0, 4), col('month').substr(7, 2), f.lit('01')), 'yyyy-MM-dd') )\
                  .withColumn('date_6m_ago', f.add_months(col('date'), -5) )\
                  .withColumn('date', f.last_day(col('date')) )\
    
    return ue_monthly

def process_data(
    lbs,
    lau,
    filter_month
):
        
    lbs = prep_lbs(lbs,filter_month)
    lbs = join_lbs_lau(lbs,lau)
    ue_monthly = process_ue_monthly(lbs)
    return ue_monthly

In [9]:
h3_parent = f.udf(lambda h3_str,size:h3.cell_to_parent(h3_str,size) if h3_str else None, returnType=StringType())
latlon_h3 = F.udf(lambda lat,lon,size:h3.latlng_to_cell(lat,lon,size) if lat != None and lon != None else None)

lau = spark.read.table('pe_bps.indonesia_h3_38prov')
# lbs = spark.read.table('p_gsa_stg.lbs_genome')
lbs = spark.read.table('pe_bps.wisnus_sample5_lbs_2024')

filter_month = '2024-12'

25/03/24 11:36:04 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.metastore.runworker.in does not exist
25/03/24 11:36:04 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.masking.algo does not exist
25/03/24 11:36:04 WARN  client.HiveClientImpl: [Thread-3]: Detected HiveConf hive.execution.engine is 'tez' and will be reset to 'mr' to disable useless hive logic
Hive Session ID = 4fd37de9-9598-48e8-8ead-70933b64574e
25/03/24 11:36:05 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.metastore.runworker.in does not exist
25/03/24 11:36:05 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.masking.algo does not exist


In [10]:
ue_monthly = process_data(
    lbs,
    lau,
    filter_month
)

## Looping

In [11]:
import logging

# Configure the logging to save to a file
logging.basicConfig(
    filename='logs/logging_ue_monthly_sample.log',  # Name of the log file
    level=logging.INFO,  # Set the logging level to INFO
    format='%(asctime)s - %(levelname)s - %(message)s',  # Define the format of log messages
    datefmt='%Y-%m-%d %H:%M:%S'  # Date format
)

## Atur Bulan Di sini

In [None]:
list_date = pd.date_range('2024-04','2024-11',freq='MS')
for datetime in list_date:
    
    filter_month = datetime.strftime('%Y-%m')
    logging.info(f'PROCESS: {filter_month}')
    ue_monthly = process_data(
        lbs,
        lau,
        filter_month
    )
    ue_monthly.repartition(100).write.partitionBy('month').mode('append').saveAsTable('pnt_bps_int.data_cerdas_ue_monthly_sample')

25/03/24 11:36:13 WARN  metastore.RetryingMetaStoreClient: [Thread-3]: MetaStoreClient lost connection. Attempting to reconnect (1 of 1) after 1s. listPartitionsWithAuthInfo
org.apache.thrift.transport.TTransportException: SASL authentication not complete
	at org.apache.thrift.transport.TSaslTransport.write(TSaslTransport.java:442) ~[libthrift-0.16.0.jar:0.16.0]
	at org.apache.thrift.transport.TSaslClientTransport.write(TSaslClientTransport.java:39) ~[libthrift-0.16.0.jar:0.16.0]
	at org.apache.hadoop.hive.metastore.security.TFilterTransport.write(TFilterTransport.java:73) ~[hive-standalone-metastore-3.1.3000.7.1.9.4-4.jar:3.1.3000.7.1.9.4-4]
	at org.apache.thrift.protocol.TBinaryProtocol.writeI32(TBinaryProtocol.java:204) ~[libthrift-0.16.0.jar:0.16.0]
	at org.apache.thrift.protocol.TBinaryProtocol.writeMessageBegin(TBinaryProtocol.java:119) ~[libthrift-0.16.0.jar:0.16.0]
	at org.apache.thrift.TServiceClient.sendBase(TServiceClient.java:70) ~[libthrift-0.16.0.jar:0.16.0]
	at org.apach

In [16]:
q = """SHOW PARTITIONS pnt_bps_int.data_cerdas_ue_monthly_sample"""
spark.sql(q).toPandas()

Unnamed: 0,partition
0,month=2023-M12
1,month=2024-M01
2,month=2024-M02
3,month=2024-M03
4,month=2024-M04
5,month=2024-M05
6,month=2024-M06
7,month=2024-M07
8,month=2024-M08
9,month=2024-M09


In [17]:
spark.read.table('pnt_bps_int.data_cerdas_ue_monthly_sample').limit(10).toPandas()

Unnamed: 0,msisdn,home_h3_8,home_kec,home_entropy,work_h3_8,work_kec,work_entropy,home_kab,work_kab,date,date_6m_ago,month
0,2371386359976944937,888da230c5fffff,35|09|050,1.0,,,,35|09,,2024-01-31,2023-08-01,2024-M01
1,-1274965686682802740,8895059313fffff,73|14|030,0.0,8895059313fffff,73|14|030,1.5,73|14,73|14,2024-01-31,2023-08-01,2024-M01
2,-1578913676184715203,8868c8cc89fffff,71|05|120,0.0,8868c8cc89fffff,71|05|120,2.31,71|05,71|05,2024-01-31,2023-08-01,2024-M01
3,-7567723583503207360,,,,888cf64e4bfffff,16|12|040,2.31,,16|12,2024-01-31,2023-08-01,2024-M01
4,-3164909028423898742,8865246901fffff,21|71|061,3.24,886526a6c3fffff,21|71|061,2.98,21|71,21|71,2024-01-31,2023-08-01,2024-M01
5,1136923908067489651,889500116bfffff,73|05|031,0.0,8895001a57fffff,73|05|031,0.0,73|05,73|05,2024-01-31,2023-08-01,2024-M01
6,-6184012444850013421,888d848d59fffff,35|13|070,3.28,888d848d59fffff,35|13|070,3.14,35|13,35|13,2024-01-31,2023-08-01,2024-M01
7,-7476753881106820274,88652eb201fffff,14|08|010,2.11,88652eb201fffff,14|08|010,1.58,14|08,14|08,2024-01-31,2023-08-01,2024-M01
8,-4695904401745350961,888c160ec3fffff,32|13|190,3.26,888c160ec3fffff,32|13|190,3.14,32|13,32|13,2024-01-31,2023-08-01,2024-M01
9,531993138834456383,,,,886890aa2dfffff,62|09|060,0.0,,62|09,2024-01-31,2023-08-01,2024-M01


# UE Semesterly

In [18]:
def process_ue_semesterly(ue_monthly,start_date,end_date):
    part_home = Window.partitionBy('msisdn').orderBy([col('home_N_month').desc(), col('home_entropy').asc(), col('latest_month').desc()])
    part_work = Window.partitionBy('msisdn').orderBy([col('work_N_month').desc(), col('work_entropy').asc(), col('latest_month').desc()])
    
    period_month = end_date[:7]
    # Set 6 months period
    ue_semesterly =   ue_monthly\
                        .filter(col('date').between(start_date,end_date))

    home_semesterly = ue_semesterly\
                      .groupBy('msisdn','home_h3_8','home_kab','home_kec')\
                      .agg(
                          f.countDistinct('month').alias('home_N_month'),
                          f.sum('home_entropy').alias('home_entropy'),
                          f.max('month').alias('latest_month')
                      )\
                      .withColumn('rn', f.row_number().over(part_home))\
                      .filter(col('rn') == 1)\
                      .withColumn('event_month', f.lit(period_month[:4]+'-M'+period_month[-2:]))\
                      .drop('rn','latest_month')

    work_semesterly = ue_semesterly\
                      .groupBy('msisdn','work_h3_8','work_kab','work_kec')\
                      .agg(
                          f.countDistinct('month').alias('work_N_month'),
                          f.sum('work_entropy').alias('work_entropy'),
                          f.max('month').alias('latest_month')
                      )\
                      .withColumn('rn', f.row_number().over(part_work))\
                      .filter(col('rn') == 1)\
                      .withColumn('event_month', f.lit(period_month[:4]+'-M'+period_month[-2:]))\
                      .drop('rn','latest_month')
    
    ue_6 = home_semesterly\
        .join(work_semesterly, [ 'msisdn', 'event_month' ], 'full')\
        .select(
            'event_month', 'msisdn',
            f.coalesce(col('home_h3_8'), col('work_h3_8')).alias('home_h3_8'),
            f.coalesce(col('work_h3_8'), col('home_h3_8')).alias('work_h3_8'),
            f.coalesce(col('home_kab'), col('work_kab')).alias('home_kab'),
            f.coalesce(col('home_kec'), col('work_kec')).alias('home_kec'),
            f.coalesce(col('work_kab'), col('home_kab')).alias('work_kab'),
            f.coalesce(col('work_kec'), col('home_kec')).alias('work_kec')
        )
    
    return ue_6

In [19]:
ue_monthly = spark.read.table('pnt_bps_int.data_cerdas_ue_monthly_sample')
start_date = '2024-06-01'# perbedaan 6 bulan ke belakang
end_date = '2024-11-30' # 6, 7, 8, 9, 10, 11

In [20]:
ue_semesterly = process_ue_semesterly(ue_monthly,start_date,end_date)
ue_semesterly.printSchema()

root
 |-- event_month: string (nullable = true)
 |-- msisdn: string (nullable = true)
 |-- home_h3_8: string (nullable = true)
 |-- work_h3_8: string (nullable = true)
 |-- home_kab: string (nullable = true)
 |-- home_kec: string (nullable = true)
 |-- work_kab: string (nullable = true)
 |-- work_kec: string (nullable = true)



## Looping

In [21]:
import logging

# Configure the logging to save to a file
logging.basicConfig(
    filename='logging_ue_semesterly_sample.log',  # Name of the log file
    level=logging.INFO,  # Set the logging level to INFO
    format='%(asctime)s - %(levelname)s - %(message)s',  # Define the format of log messages
    datefmt='%Y-%m-%d %H:%M:%S'  # Date format
)

In [22]:
from dateutil.relativedelta import relativedelta

## Atur Bulan Di sini

In [23]:
list_month = pd.date_range('2024-05','2024-11',freq='MS')
for month in list_month:
    
    logging.info(f'PROCESS: {month.strftime("%Y-%m")}')
    
    start_datetime = month - relativedelta(months=5)
    end_datetime = month + relativedelta(days=month.days_in_month) - relativedelta(days=1)
    start_date = start_datetime.strftime('%Y-%m-%d')
    end_date = end_datetime.strftime('%Y-%m-%d')

    logging.info(f'START DATE: {start_date}')
    logging.info(f'END DATE: {end_date}')
    
    ue_semesterly = process_ue_semesterly(ue_monthly,start_date,end_date)
    
    ue_semesterly.repartition(100).write.partitionBy('event_month').mode('append').saveAsTable('pnt_bps_int.data_cerdas_ue_sample')
    
    logging.info(f'======== DONE ========')

                                                                                

# Checking UE Monthly

In [32]:
ue_monthly = spark.read.table('pnt_bps_int.data_cerdas_ue_monthly_sample')

25/03/25 01:39:45 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.metastore.runworker.in does not exist
25/03/25 01:39:45 WARN  conf.HiveConf: [Thread-3]: HiveConf of name hive.masking.algo does not exist


In [33]:
%%time
ue_monthly.limit(10).toPandas()

[Stage 317:>                                                        (0 + 1) / 1]

CPU times: user 11.6 ms, sys: 4.25 ms, total: 15.8 ms
Wall time: 13.7 s


                                                                                

Unnamed: 0,msisdn,home_h3_8,home_kec,home_entropy,work_h3_8,work_kec,work_entropy,home_kab,work_kab,date,date_6m_ago,month
0,2371386359976944937,888da230c5fffff,35|09|050,1.0,,,,35|09,,2024-01-31,2023-08-01,2024-M01
1,-1274965686682802740,8895059313fffff,73|14|030,0.0,8895059313fffff,73|14|030,1.5,73|14,73|14,2024-01-31,2023-08-01,2024-M01
2,-1578913676184715203,8868c8cc89fffff,71|05|120,0.0,8868c8cc89fffff,71|05|120,2.31,71|05,71|05,2024-01-31,2023-08-01,2024-M01
3,-7567723583503207360,,,,888cf64e4bfffff,16|12|040,2.31,,16|12,2024-01-31,2023-08-01,2024-M01
4,-3164909028423898742,8865246901fffff,21|71|061,3.24,886526a6c3fffff,21|71|061,2.98,21|71,21|71,2024-01-31,2023-08-01,2024-M01
5,1136923908067489651,889500116bfffff,73|05|031,0.0,8895001a57fffff,73|05|031,0.0,73|05,73|05,2024-01-31,2023-08-01,2024-M01
6,-6184012444850013421,888d848d59fffff,35|13|070,3.28,888d848d59fffff,35|13|070,3.14,35|13,35|13,2024-01-31,2023-08-01,2024-M01
7,-7476753881106820274,88652eb201fffff,14|08|010,2.11,88652eb201fffff,14|08|010,1.58,14|08,14|08,2024-01-31,2023-08-01,2024-M01
8,-4695904401745350961,888c160ec3fffff,32|13|190,3.26,888c160ec3fffff,32|13|190,3.14,32|13,32|13,2024-01-31,2023-08-01,2024-M01
9,531993138834456383,,,,886890aa2dfffff,62|09|060,0.0,,62|09,2024-01-31,2023-08-01,2024-M01


In [34]:
q = """SHOW PARTITIONS pnt_bps_int.data_cerdas_ue_monthly_sample"""
spark.sql(q).toPandas()

Unnamed: 0,partition
0,month=2023-M12
1,month=2024-M01
2,month=2024-M02
3,month=2024-M03
4,month=2024-M04
5,month=2024-M05
6,month=2024-M06
7,month=2024-M07
8,month=2024-M08
9,month=2024-M09


In [27]:
%%time
ue_monthly.groupBy('month').agg(
    F.count('msisdn').alias('num_row'),
    F.countDistinct('msisdn').alias('num_msisdn')
).toPandas().sort_values('month')



CPU times: user 36.9 ms, sys: 2.8 ms, total: 39.7 ms
Wall time: 11.6 s


                                                                                

Unnamed: 0,month,num_row,num_msisdn
3,2023-M12,4763453,4763453
1,2024-M01,4906492,4906492
9,2024-M02,4817474,4817474
5,2024-M03,4719132,4719132
4,2024-M04,4630604,4630604
7,2024-M05,4598242,4598242
8,2024-M06,4362195,4362195
10,2024-M07,4297405,4297405
6,2024-M08,4122516,4122516
0,2024-M09,3924637,3924637


# Checking UE Semesterly

In [28]:
ue_semesterly = spark.read.table('pnt_bps_int.data_cerdas_ue_sample')
# ue_semesterly = ue_semesterly.filter(F.col('event_month')=='2024-M12')

In [29]:
%%time
ue_semesterly.limit(12).toPandas()

CPU times: user 8.52 ms, sys: 65 μs, total: 8.58 ms
Wall time: 490 ms


Unnamed: 0,msisdn,home_h3_8,work_h3_8,home_kab,home_kec,work_kab,work_kec,event_month
0,3575010115860143924,888cf0846bfffff,888cf095b3fffff,17|05,17|05|010,17|05,17|05|010,2024-M05
1,-1132260017814221523,88652cd637fffff,88652c996bfffff,14|71,14|71|010,14|06,14|06|070,2024-M05
2,-4078250205974930428,,,,,,,2024-M05
3,6369446884781901962,888c14cce7fffff,888c14cce7fffff,32|05,32|05|140,32|05,32|05|140,2024-M05
4,4154820185226518483,8865212cc9fffff,8865212cc9fffff,14|08,14|08|013,14|08,14|08|013,2024-M05
5,862667039292241160,888c106a05fffff,888c106a05fffff,31|74,31|74|040,31|74,31|74|040,2024-M05
6,3807043185012931184,8895216699fffff,8895216699fffff,73|26,73|26|130,73|26,73|26|130,2024-M05
7,3766577526323865386,88688e0f5dfffff,88688e0f5dfffff,64|03,64|03|080,64|03,64|03|080,2024-M05
8,-8530845161508761695,888d8d6ecdfffff,888d899b41fffff,33|02,33|02|170,33|02,33|02|140,2024-M05
9,-8536780261890682715,88652bc2a1fffff,88652bc223fffff,12|23,12|23|010,12|23,12|23|020,2024-M05


In [31]:
%%time
ue_semesterly.groupBy('event_month').agg(
    F.count('msisdn').alias('num_row'),
    F.countDistinct('msisdn').alias('num_msisdn')
).toPandas().sort_values('event_month')



CPU times: user 19.6 ms, sys: 1.31 ms, total: 20.9 ms
Wall time: 5.97 s


                                                                                

Unnamed: 0,event_month,num_row,num_msisdn
3,2024-M05,5771522,5771522
4,2024-M06,5685795,5685795
5,2024-M07,5492443,5492443
2,2024-M08,5364754,5364754
0,2024-M09,5227037,5227037
6,2024-M10,5080438,5080438
1,2024-M11,4945083,4945083
