In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType
from utils import Utils
from utils.Utils import Transformer
from utils.spark_utils import Spark_utils
import logging

In [2]:
## Settings variables for log handling and SparkSession
spark = None
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Spark_utils object to enable SparkSession
spark_util = Spark_utils("DEV", "Meli_Pipeline")
spark_util.set_spark_session()
spark = spark_util.get_spark_session()

Environment:  DEV
24/08/05 20:45:31 WARN Utils: Your hostname, EPCOBOGW1343 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/08/05 20:45:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/05 20:45:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
## Iniaiting data reads from original source to Dataframe
logger.info("Reading Data Sources from Local Storage...")

logger.info("Defining prints.json schema ...")
prints_schema = StructType([
StructField("day", DateType(), True),
StructField("event_data", StructType([
    StructField("position", IntegerType(), True),
    StructField("value_prop", StringType(), True)
]), True),
StructField("user_id", IntegerType(), True)
])

## Reading "prints.json" data to dataframe
logger.info("Read prints.json with as dataset with proper schema")
df_prints = spark.read.json("data_sources/prints.json", schema=prints_schema)

logger.info("Prints's dataframe schema...")
df_prints.printSchema()

INFO:__main__:Reading Data Sources from Local Storage...
INFO:__main__:Defining prints.json schema ...
INFO:__main__:Read prints.json with as dataset with proper schema
INFO:__main__:Prints's dataframe schema...


root
 |-- day: date (nullable = true)
 |-- event_data: struct (nullable = true)
 |    |-- position: integer (nullable = true)
 |    |-- value_prop: string (nullable = true)
 |-- user_id: integer (nullable = true)



In [4]:
## Reading "taps.json" data to dataframe

taps_schema = StructType([
StructField("day", DateType(), True),
StructField("event_data", StructType([
    StructField("position", IntegerType(), True),
    StructField("value_prop", StringType(), True)
]), True),
StructField("user_id", IntegerType(), True)
])
    
df_taps = spark.read.json("data_sources/taps.json", schema = taps_schema)

## Reading "prints.json" data to dataframe

pays_schema = StructType([
StructField("pay_date", DateType(), True),
StructField("total", DoubleType(), True),
StructField("user_id", IntegerType(), True),
StructField("value_prop", StringType(), True)
])
    
df_pays = spark.read.option("header","True").csv("data_sources/pays.csv", schema = pays_schema)
logger.info("3 New Datasets were created")

INFO:__main__:3 New Datasets were created


In [5]:
#################### Processing df_prints DataFrame ##########################

logger.info("Calculating week of year for column day in prints")
df_prints = Transformer.calcWeekOfTheYear(df_prints, "day", "week_of_year")

df_prints.printSchema()

logger.info("Calculating Rank for column week_of_year in prints")
##df_prints = Transformer.calcRank(df_prints, ['user_id'], ['week_of_year'], "rank_num", "desc")

logger.info("Calculating Row_Number for column week_of_year in prints")
##df_prints = Transformer.calcRowNumber(df_prints, ['user_id','week_of_year'], ['week_of_year'], "row_num", "desc")
    
df_prints.printSchema()

logger.info("Showing processed data for Prints")
df_prints.show(truncate = False)
##df_prints.show(10, truncate = False)

#################### Processing df_taps DataFrame ##########################

logger.info("Calculating week of year for column day in taps")
df_taps = Transformer.calcWeekOfTheYear(df_taps, "day", "week_of_year")

df_taps.printSchema()

logger.info("Calculating Rank for column week_of_year in taps")
##df_taps = Transformer.calcRank(df_taps, ['user_id'], ['week_of_year'], "rank_num", "desc")

logger.info("Calculating Row_Number for column week_of_year in prints")
##df_taps = Transformer.calcRowNumber(df_taps, ['user_id','week_of_year'], ['week_of_year'], "row_num", "desc")
    
df_taps.printSchema()

logger.info("Showing processed data for Prints")
df_taps.show(truncate = False)
##df_prints.show(10, truncate = False)

#################### Processing df_pays DataFrame ##########################

logger.info("Calculating week of year for column day in pays")
df_pays = Transformer.calcWeekOfTheYear(df_pays, "pay_date", "week_of_year")

df_pays.printSchema()

logger.info("Calculating Rank for column week_of_year in pays")
##df_pays = Transformer.calcRank(df_pays, ['user_id'], ['week_of_year'], "rank_num", "desc")

logger.info("Calculating Row_Number for column week_of_year in prints")
##df_pays = Transformer.calcRowNumber(df_pays, ['user_id','week_of_year'], ['week_of_year'], "row_num", "desc")
    
df_pays.printSchema()

logger.info("Showing processed data for Prints")
df_pays.show(truncate = False)
##df_prints.show(10, truncate = False)

INFO:__main__:Calculating week of year for column day in prints
INFO:__main__:Calculating Rank for column week_of_year in prints
INFO:__main__:Calculating Row_Number for column week_of_year in prints
INFO:__main__:Showing processed data for Prints


root
 |-- day: date (nullable = true)
 |-- event_data: struct (nullable = true)
 |    |-- position: integer (nullable = true)
 |    |-- value_prop: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)

root
 |-- day: date (nullable = true)
 |-- event_data: struct (nullable = true)
 |    |-- position: integer (nullable = true)
 |    |-- value_prop: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)



INFO:__main__:Calculating week of year for column day in taps
INFO:__main__:Calculating Rank for column week_of_year in taps
INFO:__main__:Calculating Row_Number for column week_of_year in prints
INFO:__main__:Showing processed data for Prints
INFO:__main__:Calculating week of year for column day in pays
INFO:__main__:Calculating Rank for column week_of_year in pays
INFO:__main__:Calculating Row_Number for column week_of_year in prints
INFO:__main__:Showing processed data for Prints


+----------+-----------------------+-------+------------+
|day       |event_data             |user_id|week_of_year|
+----------+-----------------------+-------+------------+
|2020-11-01|{0, cellphone_recharge}|98702  |44          |
|2020-11-01|{1, prepaid}           |98702  |44          |
|2020-11-01|{0, prepaid}           |63252  |44          |
|2020-11-01|{0, cellphone_recharge}|24728  |44          |
|2020-11-01|{1, link_cobro}        |24728  |44          |
|2020-11-01|{2, credits_consumer}  |24728  |44          |
|2020-11-01|{3, point}             |24728  |44          |
|2020-11-01|{0, point}             |25517  |44          |
|2020-11-01|{1, credits_consumer}  |25517  |44          |
|2020-11-01|{2, transport}         |25517  |44          |
|2020-11-01|{0, point}             |57587  |44          |
|2020-11-01|{0, transport}         |13609  |44          |
|2020-11-01|{0, cellphone_recharge}|3708   |44          |
|2020-11-01|{1, prepaid}           |3708   |44          |
|2020-11-01|{2

In [12]:
df_prints.createOrReplaceTempView("prints")
df_taps.createOrReplaceTempView("taps")
df_pays.createOrReplaceTempView("pays")

In [None]:
spark.sql("""
select user_id,
count(1) q ,
min(week_of_year),
max(week_of_year)
from prints
group by user_id
order by q desc
""").show(truncate = False)

In [None]:
spark.sql("""
select count(1) q ,
min(week_of_year),
avg(week_of_year),
max(week_of_year)
from prints
order by q desc
""").show(truncate = False)

In [None]:
spark.sql("""
select x.*
from
(select * ,
rank() over (partition by user_id order by week_of_year desc) rown
from prints
where user_id = 1) x
""").show(truncate = False)

In [None]:
spark.sql("""
SELECT a, b, row_number() OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b);
""").show()

In [7]:
spark.sql("""
select user_id
,week_of_year
,count(1) q
from prints
group by user_id
,week_of_year
order by user_id , week_of_year desc
""").show(truncate = False)

[Stage 3:>                                                          (0 + 1) / 1]

+-------+------------+---+
|user_id|week_of_year|q  |
+-------+------------+---+
|1      |48          |2  |
|1      |45          |2  |
|2      |48          |3  |
|2      |46          |1  |
|2      |45          |5  |
|3      |47          |5  |
|3      |46          |3  |
|3      |45          |1  |
|4      |49          |1  |
|4      |48          |1  |
|4      |47          |1  |
|4      |46          |2  |
|4      |45          |1  |
|5      |48          |2  |
|5      |47          |1  |
|5      |45          |5  |
|6      |46          |3  |
|6      |45          |2  |
|7      |48          |2  |
|7      |46          |2  |
+-------+------------+---+
only showing top 20 rows



                                                                                

In [14]:
spark.sql("""
select user_id
,count(1) q
from prints
group by user_id
order by q desc  
""").show(truncate = False)

+-------+---+
|user_id|q  |
+-------+---+
|35156  |27 |
|9704   |27 |
|61554  |26 |
|88770  |26 |
|30781  |25 |
|51870  |25 |
|5352   |25 |
|48782  |25 |
|64536  |24 |
|38850  |24 |
|33842  |24 |
|56578  |24 |
|59876  |24 |
|65036  |24 |
|41940  |24 |
|88191  |24 |
|20457  |24 |
|50813  |24 |
|95769  |24 |
|19656  |24 |
+-------+---+
only showing top 20 rows



                                                                                

In [23]:
spark.sql("""
select a.*
,case when b.user_id is not null then "y" else "n" end click
from prints a
left join taps b
on a.user_id = b.user_id
and a.event_data = b.event_data
and a.day = b.day
where a.user_id = 3708
""").show(30, truncate = False)

[Stage 24:>                                                         (0 + 1) / 1]

+----------+-----------------------+-------+------------+-----+
|day       |event_data             |user_id|week_of_year|click|
+----------+-----------------------+-------+------------+-----+
|2020-11-01|{0, cellphone_recharge}|3708   |44          |n    |
|2020-11-01|{1, prepaid}           |3708   |44          |n    |
|2020-11-01|{2, point}             |3708   |44          |y    |
|2020-11-01|{3, send_money}        |3708   |44          |y    |
|2020-11-09|{0, credits_consumer}  |3708   |46          |n    |
|2020-11-13|{0, link_cobro}        |3708   |46          |n    |
|2020-11-25|{0, link_cobro}        |3708   |48          |n    |
|2020-11-25|{1, transport}         |3708   |48          |n    |
+----------+-----------------------+-------+------------+-----+



                                                                                