In [1]:
import pandas as pd
# Let pandas render up to 512 columns before it starts truncating wide frames.
pd.set_option("display.max_columns", 512)

In [2]:
from pyspark import StorageLevel
from pyspark.sql.functions import col, when, concat, year, month, lpad, lit, udf, hour, concat_ws, unix_timestamp
from pyspark.sql.types import LongType, StringType, DoubleType, TimestampType, BooleanType

In [3]:
import graphframes

In [4]:
# Directory where the DataFrame.checkpoint(...) calls further down store their data.
# NOTE(review): `spark` is not created in this notebook (presumably injected by the kernel),
# and the path is hard-coded to one user's HDFS home -- confirm/adjust before running as
# another user.
spark.sparkContext.setCheckpointDir("hdfs:///user/rbuendi1/spark_checkpoints")

In [6]:
# Year/month of the CDR data to process. CHANGE THIS when generating features for
# prediction (with the prediction notebook) afterwards.
# For example: to build the list for a campaign launching in December 2017, set
# October 2017 here.
# NOTE: these assignments rebind the names `year` and `month` imported from
# pyspark.sql.functions above, so those two functions are no longer reachable afterwards.
year = 2018
month = 1

# This notebook is used both to prepare data for training and for prediction.
# Because of possible de-anonymisation issues, phone numbers are transformed when the
# data is used for training. Set dataset_for_training to True when preparing a training
# dataset; otherwise leave it False.
dataset_for_training = False

## Listados de teléfonos que ya son clientes de vodafone (tanto prepago como pospago)

In [7]:
# Prepaid customer table restricted to the configured year/month.
acFinalPrepago = spark.read.table("raw_es.vf_pre_ac_final").where(
    (col("year") == year) & (col("month") == month)
)

In [8]:
# Postpaid customer table restricted to the configured year/month.
acFinalPospago = spark.read.table("raw_es.vf_pos_ac_final").where(
    (col("year") == year) & (col("month") == month)
)

In [9]:
# Sanity check: if either table has 1000 rows or fewer for the selected year/month, that
# month has most likely not been loaded yet, so stop here instead of producing features
# with missing customer labels.
assert acFinalPrepago.count() > 1000, (
    "raw_es.vf_pre_ac_final has <= 1000 rows for the selected year/month; "
    "prepaid data is probably not loaded yet")
assert acFinalPospago.count() > 1000, (
    "raw_es.vf_pos_ac_final has <= 1000 rows for the selected year/month; "
    "postpaid data is probably not loaded yet")

In [10]:
# Lookup table (msisdn, es_vodafone) of numbers that are already Vodafone customers,
# labelled "prepago" or "pospago".
clientes_prepago = (acFinalPrepago
                    .dropDuplicates(subset=["msisdn"])
                    .withColumn("es_vodafone", lit("prepago"))
                    .select("msisdn", "es_vodafone")
                   )

clientes_pospago = (acFinalPospago
                    .dropDuplicates(subset=["x_id_red"])
                    .withColumn("es_vodafone", lit("pospago"))
                    .select(col("x_id_red").alias("msisdn"), col("es_vodafone"))
                   )

# Each source is de-duplicated on its own, but a number present in BOTH tables would
# still give two lookup rows and therefore duplicate that vertex when the lookup is
# left-joined onto the vertices. The anti-join keeps the lookup unique per msisdn.
# NOTE(review): on overlap the prepaid label wins -- confirm this is the intended precedence.
lookup_cliente = (clientes_prepago
                  .union(clientes_pospago
                         .join(clientes_prepago.select("msisdn"),
                               how="left_anti",
                               on="msisdn")
                        )
                 )

# CDRs

In [11]:
def remove_trailing_zeros_and_34(number):
    """Strip every leading "0" and leading "33" prefix from a phone-number string.

    Despite the function name, the characters removed are *leading* zeros and the
    prefix "33" (not "34"); the original author attributes the 33 to the
    anonymisation of the data. Prefixes are removed repeatedly until the string
    starts with neither "0" nor "33".

    Args:
        number: phone number as a string, or None.

    Returns:
        The string without those prefixes (possibly empty), or None when
        ``number`` is None.
    """
    # Spark passes NULL cells to a Python UDF as None; return it unchanged instead of
    # failing on .startswith.
    if number is None:
        return None
    # Iterative rather than recursive, so long runs of zeros cannot exceed Python's
    # recursion limit.
    while True:
        if number.startswith("0"):
            number = number[1:]
        elif number.startswith("33"):  # Indeed 33, not 34 (see docstring).
            number = number[2:]
        else:
            return number
    
# Spark UDF wrapper around remove_trailing_zeros_and_34, declared to return a string column.
clean_number_udf = udf(remove_trailing_zeros_and_34, StringType())

In [12]:
# Raw CDR rows for the configured year/month that have both a calling (nrprima) and a
# called (nrsecun) number.
cdrs_raw = (spark.read.table("raw_es.mediated_cdr_navajo")
            .where(col("year") == year)
            .where(col("month") == month)
            .na.drop(subset=["nrprima", "nrsecun"])
            )

# The only difference between the training and prediction datasets is whether the
# numbers go through clean_number_udf (training) or are used exactly as stored.
if dataset_for_training:
    numeroorigen_expr = clean_number_udf(col("nrprima"))
    numerodestino_expr = clean_number_udf(col("nrsecun"))
else:
    numeroorigen_expr = col("nrprima")
    numerodestino_expr = col("nrsecun")

# Roughly one partition per 3,000,000 rows. The integer division yields 0 for months
# with fewer rows than that and repartition(0) is rejected by Spark, so clamp to 1.
n_cdr_partitions = max(1, cdrs_raw.count() // 3000000)

# One row per distinct (origin, destination, call start) with the call duration.
cdrs_full = (cdrs_raw
             .withColumn("numeroorigen", numeroorigen_expr)
             .withColumn("numerodestino", numerodestino_expr)
             # dtinilla/hrinilla are parsed with the pattern "yyyyMMdd HHmmss".
             .withColumn("dateofcall", unix_timestamp(concat_ws(" ", col("dtinilla"), col("hrinilla")),
                                                      format="yyyyMMdd HHmmss").cast(TimestampType()))
             .withColumnRenamed("tmdurlla", "airduration")
             .select("numeroorigen",
                     "numerodestino",
                     "dateofcall",
                     "airduration"
                    )
             .repartition(n_cdr_partitions)
             .dropDuplicates(subset=["numeroorigen", "numerodestino", "dateofcall"])
            ).checkpoint(eager=False)

In [13]:
# Inspect the columns available in the raw CDR table.
cdrs_raw.printSchema()

root
 |-- NETWORKTYPE: string (nullable = true)
 |-- TPCDR: string (nullable = true)
 |-- TPITINERANCIA: string (nullable = true)
 |-- PORTPRIMA: string (nullable = true)
 |-- OPERPRIMA: string (nullable = true)
 |-- NRPRIMA: string (nullable = true)
 |-- DTINILLA: string (nullable = true)
 |-- HRINILLA: string (nullable = true)
 |-- NRIMSI: string (nullable = true)
 |-- PORTSECUN: string (nullable = true)
 |-- OPERSECUN: string (nullable = true)
 |-- NRSECUN: string (nullable = true)
 |-- NRVOLUM: long (nullable = true)
 |-- CDTEXTO: string (nullable = true)
 |-- CDIMAGEN: string (nullable = true)
 |-- CDAUDIO: string (nullable = true)
 |-- CDVIDEO: string (nullable = true)
 |-- CDTGROUP: string (nullable = true)
 |-- CDSENLLA: string (nullable = true)
 |-- CDCNTRAL: string (nullable = true)
 |-- TMDURLLA: long (nullable = true)
 |-- ROAM_SERVICE_CODE: string (nullable = true)
 |-- TPROAMSERV: string (nullable = true)
 |-- FILLER: string (nullable = true)
 |-- CDAPLICA: string (nullab

In [14]:
# Drop very heavy callers/callees from the CDRs.
# NOTE: despite the "too_many_" prefix, these two frames hold the numbers that are KEPT,
# i.e. those with at most 10000 calls as origin / as destination respectively.
too_many_numeroorigen_calls = (
    cdrs_full.groupBy("numeroorigen").count().filter(col("count") <= 10000)
)
too_many_numerodestino_calls = (
    cdrs_full.groupBy("numerodestino").count().filter(col("count") <= 10000)
)

# Inner joins keep only the calls whose origin AND destination are both in the kept sets.
kept_origins = too_many_numeroorigen_calls.select("numeroorigen")
kept_destinations = too_many_numerodestino_calls.select("numerodestino")

cdrs = cdrs_full.join(kept_origins, on="numeroorigen", how="inner")
cdrs = cdrs.join(kept_destinations, on="numerodestino", how="inner")

In [15]:
# Calling and called numbers present in the filtered CDRs.
origenes = cdrs.select("numeroorigen")
destinos = cdrs.select(col("numerodestino"))
origenes_mas_completos = origenes.dropDuplicates(subset=["numeroorigen"])

# Graph vertices: every distinct, non-null number that appears as origin or destination.
# origenes_mas_completos is already distinct, so it is not de-duplicated a second time.
vertices = (origenes_mas_completos
            .withColumnRenamed("numeroorigen", "id")
            .union(destinos
                   .dropDuplicates(subset=["numerodestino"])
                   .withColumnRenamed("numerodestino", "id")
                  )
            .dropDuplicates(subset=["id"])
            .na.drop(subset=["id"])
           )

# Label each vertex with its Vodafone customer type from lookup_cliente, or "no" when
# the number is not found there.
vertices_info_vf = (vertices.join(lookup_cliente,
                                  how="left",
                                  on=vertices["id"]==lookup_cliente["msisdn"])
                    .withColumn("es_vodafone", when(col("es_vodafone").isNotNull(),
                                                    col("es_vodafone"))
                                               .otherwise("no")
                               )
                    .select("id", "es_vodafone")
                   ).repartition(300).checkpoint(eager=False)

# Graph edges: one row per call, using the src/dst column names GraphFrames expects.
edges = (cdrs
         .select("numeroorigen",
                 "numerodestino",
                 "dateofcall",
                 "airduration"
                )
         .withColumnRenamed("numeroorigen", "src")
         .withColumnRenamed("numerodestino", "dst")
        ).repartition(300).checkpoint(eager=False)

In [16]:
# Peek at a few edges as a sanity check.
# NOTE(review): the rendered rows contain phone numbers; clear this output before
# sharing or committing the notebook.
edges.show()

+------------+-------------+--------------------+-----------+
|         src|          dst|          dateofcall|airduration|
+------------+-------------+--------------------+-----------+
|   602182818|0033642744114|2018-01-07 20:16:...|        270|
|   628483961|  12864079273|2018-01-16 21:30:...|         40|
|   685096485|  14607419135|2018-01-14 22:43:...|       2541|
|   665791507|  17812277871|2018-01-10 20:49:...|          6|
|   677570362|  18898386852|2018-01-17 22:20:...|         69|
|201105713200| 201108858877|2018-01-27 10:02:...|         60|
|201105024366| 201108858877|2018-01-25 16:00:...|         60|
|212652793987| 212604388854|2018-01-25 12:17:...|         60|
|   926303772| 212615979336|2018-01-19 09:48:...|          1|
|   671993780| 212633688688|2018-01-06 22:58:...|         83|
|   602744751| 212643515024|2018-01-06 20:07:...|        339|
|   656134400| 212659622348|2018-01-14 20:55:...|        212|
|   616587370| 212671080023|2018-01-03 13:00:...|        118|
|   6333

In [17]:
# Full call graph: vertices carry the es_vodafone label, edges are the individual calls.
grafo_completo = graphframes.GraphFrame(vertices_info_vf, edges)

In [18]:
# Hour-of-day slots used to split the call graph. Each entry is (label, spec) where spec
# is either one inclusive (first_hour, last_hour) pair or a list of such pairs (used for
# the slot that wraps around midnight).
hours = [("6AM_14PM", (6,14)),
         ("15PM_18PM",(15,18)),
         ("19PM_1AM", [(19,23),(0,1)]),
         ("2AM_5AM",  (2,5))]


def _hour_ranges(time_slot):
    """Normalise a slot spec to a list of inclusive (first_hour, last_hour) pairs.

    Accepts a single pair such as ``(6, 14)`` or a sequence of pairs such as
    ``[(19, 23), (0, 1)]``.
    """
    if isinstance(time_slot[0], (list, tuple)):
        return list(time_slot)
    return [time_slot]


def _left_join_all(base, frames, key):
    """Left-join every DataFrame in ``frames`` onto ``base``, in order, on column ``key``."""
    joined = base
    for frame in frames:
        joined = joined.join(frame, how="left", on=key)
    return joined


# One (label, features DataFrame) entry per time slot.
in_out_dfs = []

for literal, time_slot in hours:
    # Build "hour in range_1 OR hour in range_2 ..." explicitly. The shape of the spec is
    # inspected directly instead of relying on a TypeError raised by indexing, which
    # would also hide any unrelated TypeError.
    call_hour = hour(col("dateofcall"))
    in_time_slot = None
    for first_hour, last_hour in _hour_ranges(time_slot):
        in_range = call_hour.between(first_hour, last_hour)
        in_time_slot = in_range if in_time_slot is None else (in_time_slot | in_range)

    edges_filtered_by_hour = grafo_completo.edges.filter(in_time_slot)

    grafo_completo_filtered_by_hour = (graphframes.GraphFrame(grafo_completo.vertices,
                                                              edges_filtered_by_hour)
                                       .persist(StorageLevel.DISK_ONLY)
                                      )

    ##
    ## General graph statistics
    ##
    received_calls_col = "received_calls_"+literal
    emited_calls_col = "emited_calls_"+literal
    mins_received_col = "n_mins_received_"+literal
    mins_called_col = "n_mins_called_"+literal

    # Number of calls received / made by each number in this slot.
    full_indegrees = (grafo_completo_filtered_by_hour
                      .inDegrees
                      .withColumnRenamed("inDegree", received_calls_col)
                      .persist(StorageLevel.DISK_ONLY)
                     )

    full_outdegrees = (grafo_completo_filtered_by_hour
                       .outDegrees
                       .withColumnRenamed("outDegree", emited_calls_col)
                       .persist(StorageLevel.DISK_ONLY)
                      )

    # Total airduration received / made by each number in this slot.
    # NOTE(review): the columns are named n_mins_* but airduration is summed as-is;
    # confirm the unit of airduration.
    number_mins_rcv_calls = (grafo_completo_filtered_by_hour
                             .edges
                             .groupBy("dst").sum("airduration")
                             .withColumnRenamed("sum(airduration)", mins_received_col)
                             .withColumn(mins_received_col,
                                         col(mins_received_col).cast(DoubleType()))
                            )

    number_mins_src_calls = (grafo_completo_filtered_by_hour
                             .edges
                             .groupBy("src").sum("airduration")
                             .withColumnRenamed("sum(airduration)", mins_called_col)
                             .withColumn(mins_called_col,
                                         col(mins_called_col).cast(DoubleType()))
                            )

    ##
    ## VF-specific statistics
    ##
    vodafone_types = ["prepago", "pospago"]

    # Re-created for every slot: one list of per-customer-type frames per feature family.
    vodafone_types_dfs = {"vf_received_call_ratio": [],
                          "vf_emited_call_ratio": [],
                          "ratio_mins_rcv_calls_vf": [],
                          "ratio_mins_src_calls_vf": []}

    for vf_type in vodafone_types:
        received_vf_col = "received_vf_calls_"+vf_type+"_"+literal
        received_vf_ratio_col = "received_vf_calls_ratio_"+vf_type+"_"+literal
        emited_vf_col = "emited_vf_calls_"+vf_type+"_"+literal
        emited_vf_ratio_col = "emited_vf_calls_ratio_"+vf_type+"_"+literal
        mins_received_vf_col = "n_mins_received_vf_"+vf_type+"_"+literal
        ratio_mins_received_vf_col = "ratio_mins_received_vf_"+vf_type+"_"+literal
        mins_called_vf_col = "n_mins_called_vf_"+vf_type+"_"+literal
        ratio_mins_called_vf_col = "ratio_mins_called_vf_"+vf_type+"_"+literal

        # Calls whose ORIGIN is a Vodafone customer of this type.
        filtered_edges_entrada = (grafo_completo_filtered_by_hour
                                  .find("(a)-[e]->(b)")
                                  .filter("a.es_vodafone == '" + vf_type + "'")
                                  .select("e.src", "e.dst", "e.dateofcall", "e.airduration")
                                 )

        calls_from_vodafone = graphframes.GraphFrame(grafo_completo.vertices,
                                                     filtered_edges_entrada)

        vf_indegrees = (calls_from_vodafone
                        .inDegrees
                        .withColumnRenamed("inDegree", received_vf_col)
                       )

        # Calls received from this customer type, and their share of all received calls.
        # Numbers with no such calls get 0.0 via na.fill.
        vf_received_call_ratio = (full_indegrees.join(vf_indegrees,
                                                      how="left",
                                                      on="id")
                                  .withColumn(received_calls_col, col(received_calls_col).cast(DoubleType()))
                                  .withColumn(received_vf_col, col(received_vf_col).cast(DoubleType()))
                                  .withColumn(received_vf_ratio_col,
                                              col(received_vf_col) / col(received_calls_col))
                                  .drop(received_calls_col)
                                  .na.fill(0.0)
                                 )

        # Calls whose DESTINATION is a Vodafone customer of this type.
        filtered_edges_salida = (grafo_completo_filtered_by_hour
                                 .find("(a)-[e]->(b)")
                                 .filter("b.es_vodafone == '" + vf_type + "'")
                                 .select("e.src", "e.dst", "e.dateofcall", "e.airduration")
                                )

        calls_to_vodafone = graphframes.GraphFrame(grafo_completo.vertices,
                                                   filtered_edges_salida)

        vf_outdegrees = (calls_to_vodafone
                         .outDegrees
                         .withColumnRenamed("outDegree", emited_vf_col)
                        )

        # Calls made to this customer type, and their share of all calls made.
        vf_emited_call_ratio = (full_outdegrees.join(vf_outdegrees,
                                                     how="left",
                                                     on="id")
                                .withColumn(emited_calls_col, col(emited_calls_col).cast(DoubleType()))
                                .withColumn(emited_vf_col, col(emited_vf_col).cast(DoubleType()))
                                .withColumn(emited_vf_ratio_col,
                                            col(emited_vf_col) / col(emited_calls_col))
                                .drop(emited_calls_col)
                                .na.fill(0.0)
                               )

        # Airduration received from this customer type, and its share of the total received.
        number_mins_rcv_calls_vf = (filtered_edges_entrada
                                    .groupBy("dst")
                                    .sum("airduration")
                                    .withColumnRenamed("sum(airduration)", mins_received_vf_col)
                                    .withColumn(mins_received_vf_col,
                                                col(mins_received_vf_col).cast(DoubleType()))
                                   )

        ratio_mins_rcv_calls_vf = (number_mins_rcv_calls
                                   .join(number_mins_rcv_calls_vf,
                                         how="left",
                                         on="dst")
                                   .withColumn(ratio_mins_received_vf_col,
                                               col(mins_received_vf_col) / col(mins_received_col))
                                   .drop(mins_received_col)
                                   .na.fill(0.0)
                                  )

        # Airduration of calls made to this customer type, and its share of the total made.
        number_mins_src_calls_vf = (filtered_edges_salida
                                    .groupBy("src")
                                    .sum("airduration")
                                    .withColumnRenamed("sum(airduration)", mins_called_vf_col)
                                    .withColumn(mins_called_vf_col,
                                                col(mins_called_vf_col).cast(DoubleType()))
                                   )

        ratio_mins_src_calls_vf = (number_mins_src_calls
                                   .join(number_mins_src_calls_vf,
                                         how="left",
                                         on="src")
                                   .withColumn(ratio_mins_called_vf_col,
                                               col(mins_called_vf_col) / col(mins_called_col))
                                   .drop(mins_called_col)
                                   .na.fill(0.0)
                                  )

        # Final appends
        vodafone_types_dfs["vf_received_call_ratio"].append(vf_received_call_ratio)
        vodafone_types_dfs["vf_emited_call_ratio"].append(vf_emited_call_ratio)
        vodafone_types_dfs["ratio_mins_rcv_calls_vf"].append(ratio_mins_rcv_calls_vf)
        vodafone_types_dfs["ratio_mins_src_calls_vf"].append(ratio_mins_src_calls_vf)

    # Put the per-customer-type columns next to the slot totals, one frame per feature
    # family, all keyed by "id".
    full_vf_received_call_ratio = _left_join_all(full_indegrees,
                                                 vodafone_types_dfs["vf_received_call_ratio"],
                                                 "id")

    full_vf_emited_call_ratio = _left_join_all(full_outdegrees,
                                               vodafone_types_dfs["vf_emited_call_ratio"],
                                               "id")

    full_ratio_mins_rcv_calls_vf = (_left_join_all(number_mins_rcv_calls,
                                                   vodafone_types_dfs["ratio_mins_rcv_calls_vf"],
                                                   "dst")
                                    .withColumnRenamed("dst","id")
                                   )

    full_ratio_mins_src_calls_vf = (_left_join_all(number_mins_src_calls,
                                                   vodafone_types_dfs["ratio_mins_src_calls_vf"],
                                                   "src")
                                    .withColumnRenamed("src","id")
                                   )

    # All features of this slot for every vertex of the full graph. Vertices without
    # activity in the slot keep nulls at this point.
    in_out_features = (grafo_completo
                       .vertices
                       .select("id")
                       .join(full_vf_received_call_ratio,
                             how="left",
                             on="id")
                       .join(full_vf_emited_call_ratio,
                             how="left",
                             on="id")
                       .join(full_ratio_mins_rcv_calls_vf,
                             how="left",
                             on="id")
                       .join(full_ratio_mins_src_calls_vf,
                             how="left",
                             on="id")
                      )

    in_out_dfs.append((literal, in_out_features))

In [19]:
# Combine the per-slot feature tables into one wide table keyed by "id".
in_out_df = in_out_dfs[0][1]

for _, slot_features in in_out_dfs[1:]:
    in_out_df = in_out_df.join(slot_features,
                               how="outer",
                               on="id")

# Replace missing numeric features with 0.0 once, after all joins. This gives the same
# result as filling after every join and also covers the case of a single slot.
in_out_df = in_out_df.na.fill(0.0)

# Lazy checkpoint: the data is computed by the first action that uses this frame.
in_out_df_cached = in_out_df.checkpoint(eager=False)

In [20]:
# The frame was checkpointed with eager=False, so this first action is what computes it;
# the displayed value is its number of rows.
in_out_df_cached.count()

57076236

In [21]:
# Number of partitions Spark SQL uses for shuffles (joins/aggregations) from here on.
# NOTE(review): 4100 is an unexplained tuning value -- presumably sized for the cluster; confirm.
spark.conf.set("spark.sql.shuffle.partitions", 4100)

In [22]:
# Final feature table: vertex attributes (id, es_vodafone) plus all per-slot call
# features, tagged with the processed year/month and with whether the rows were
# prepared for training.
# The flag is a plain literal column: a Python UDF that returns a constant would be
# evaluated for every row for no benefit.
feature_set = (grafo_completo
               .vertices
               .join(in_out_df_cached,
                     how="left",
                     on="id")
               .withColumn("year", lit(year).cast(LongType()))
               .withColumn("month", lit(month).cast(LongType()))
               .withColumn("for_training", lit(bool(dataset_for_training)))
              )

In [23]:
# Append the features to the parquet-backed table, partitioned by year and month.
# NOTE: the write mode is "append", so re-running this cell for the same year/month adds
# the same rows again.
features_writer = (feature_set.write
                   .format("parquet")
                   .mode("append")
                   .partitionBy("year","month"))
features_writer.saveAsTable("tests_es.cdr_graph_features_no_communities_ads")

In [24]:
print("Finished!")

Finished!


In [26]:
month

1