In [92]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

from pyspark.sql import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

In [100]:
# Run parameter: a date in ISO format (YYYY-MM-DD), parsed with "%Y-%m-%d" below.
DATE_PARAM = "2018-12-31"

# Base URI of the directory holding the input CSV files. Defined once so the
# data location can be changed in a single place.
DATA_DIR = "file:///home/jovyan/work/data"
TRANSACTIONS_FILE = DATA_DIR + "/transactions_train.csv"
ARTICLES_FILE = DATA_DIR + "/articles.csv"
CUSTOMERS_FILE = DATA_DIR + "/customers.csv"

In [102]:
# Parse the run parameter and derive the bounds of the month it falls in.
filter_date = datetime.strptime(DATE_PARAM, "%Y-%m-%d")
# First day of the month of filter_date.
date_begin = filter_date.replace(day=1)
# relativedelta(day=31) sets the day-of-month to 31, clamped to the last valid
# day of the month, so this is the last day of the month of filter_date.
date_end = filter_date + relativedelta(day=31)

In [57]:
# Get (or reuse) a Spark session named "transactions-data-mart" that is
# attached to the master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("transactions-data-mart")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

In [58]:
# Explicit schema for the transactions CSV; every column is nullable.
transactions_schema = T.StructType(
    [
        T.StructField(column_name, column_type, nullable=True)
        for column_name, column_type in [
            ("t_dat", T.DateType()),
            ("customer_id", T.StringType()),
            ("article_id", T.IntegerType()),
            ("price", T.DecimalType(22, 20)),
            ("sales_channel_id", T.IntegerType()),
        ]
    ]
)

In [59]:
# Explicit schema for the articles CSV; every column is nullable.
articles_schema = T.StructType(
    [
        T.StructField(column_name, column_type, nullable=True)
        for column_name, column_type in [
            ("article_id", T.IntegerType()),
            ("product_code", T.IntegerType()),
            ("prod_name", T.StringType()),
            ("product_type_no", T.IntegerType()),
            ("product_type_name", T.StringType()),
            ("product_group_name", T.StringType()),
            ("graphical_appearance_no", T.IntegerType()),
            ("graphical_appearance_name", T.StringType()),
            ("colour_group_code", T.IntegerType()),
            ("colour_group_name", T.StringType()),
            ("perceived_colour_value_id", T.IntegerType()),
            ("perceived_colour_value_name", T.StringType()),
            ("perceived_colour_master_id", T.IntegerType()),
            ("perceived_colour_master_name", T.StringType()),
            ("department_no", T.IntegerType()),
            ("department_name", T.StringType()),
            ("index_code", T.StringType()),
            ("index_name", T.StringType()),
            ("index_group_no", T.IntegerType()),
            ("index_group_name", T.StringType()),
            ("section_no", T.IntegerType()),
            ("section_name", T.StringType()),
            ("garment_group_no", T.IntegerType()),
            ("garment_group_name", T.StringType()),
            ("detail_desc", T.StringType()),
        ]
    ]
)

In [60]:
# Explicit schema for the customers CSV; every column is nullable.
customers_schema = T.StructType(
    [
        T.StructField(column_name, column_type, nullable=True)
        for column_name, column_type in [
            ("customer_id", T.StringType()),
            ("FN", T.DecimalType(2, 1)),
            ("Active", T.DecimalType(2, 1)),
            ("club_member_status", T.StringType()),
            ("fashion_news_frequency", T.StringType()),
            ("age", T.IntegerType()),
            ("postal_code", T.StringType()),
        ]
    ]
)

In [61]:
# Read the transactions CSV (comma-delimited, with a header row) using the
# explicit schema instead of schema inference.
transactions_df = (
    spark.read
    .schema(transactions_schema)
    .options(header="true", delimiter=",")
    .csv(TRANSACTIONS_FILE)
)

In [62]:
# Read the articles CSV (comma-delimited, with a header row) using the
# explicit schema instead of schema inference.
articles_df = (
    spark.read
    .schema(articles_schema)
    .options(header="true", delimiter=",")
    .csv(ARTICLES_FILE)
)

In [63]:
# Read the customers CSV (comma-delimited, with a header row) using the
# explicit schema instead of schema inference.
customers_df = (
    spark.read
    .schema(customers_schema)
    .options(header="true", delimiter=",")
    .csv(CUSTOMERS_FILE)
)

In [64]:
# Preview the first 5 transactions without truncating column values.
transactions_df.show(n=5, truncate=False)

+----------+----------------------------------------------------------------+----------+----------------------+----------------+
|t_dat     |customer_id                                                     |article_id|price                 |sales_channel_id|
+----------+----------------------------------------------------------------+----------+----------------------+----------------+
|2018-09-20|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|663713001 |0.05083050847457626400|2               |
|2018-09-20|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318|541518023 |0.03049152542372881000|2               |
|2018-09-20|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|505221004 |0.01523728813559322000|2               |
|2018-09-20|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|685687003 |0.01693220338983050800|2               |
|2018-09-20|00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2|685687004 |0.0169322

In [65]:
# Preview the first 5 articles without truncating column values.
articles_df.show(n=5, truncate=False)

+----------+------------+-----------------+---------------+-----------------+------------------+-----------------------+-------------------------+-----------------+-----------------+-------------------------+---------------------------+--------------------------+----------------------------+-------------+---------------+----------+----------------+--------------+----------------+----------+----------------------+----------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|article_id|product_code|prod_name        |product_type_no|product_type_name|product_group_name|graphical_appearance_no|graphical_appearance_name|colour_group_code|colour_group_name|perceived_colour_value_id|perceived_colour_value_name|perceived_colour_master_id|perceived_colour_master_name|department_no|

In [66]:
# Preview the first 5 customers without truncating column values.
customers_df.show(n=5, truncate=False)

+----------------------------------------------------------------+----+------+------------------+----------------------+---+----------------------------------------------------------------+
|customer_id                                                     |FN  |Active|club_member_status|fashion_news_frequency|age|postal_code                                                     |
+----------------------------------------------------------------+----+------+------------------+----------------------+---+----------------------------------------------------------------+
|00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657|null|null  |ACTIVE            |NONE                  |49 |52043ee2162cf5aa7ee79974281641c6f11a68d276429a91f8ca0d4b6efa8100|
|0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa|null|null  |ACTIVE            |NONE                  |25 |2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93f4c830291c32bc3057|
|000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8f

In [107]:
# Keep only transactions whose t_dat lies in [date_begin, date_end], both
# bounds inclusive.
# NOTE(review): date_begin/date_end are Python datetime objects while t_dat is
# declared as DateType; the comparison relies on Spark's implicit cast — confirm
# this is intended.
filtered_transactions_df = transactions_df.filter(
    F.col("t_dat").between(date_begin, date_end)
)

In [108]:
# Preview the first 5 filtered transactions (default truncation applies).
filtered_transactions_df.show(n=5)



+----------+--------------------+----------+--------------------+----------------+
|     t_dat|         customer_id|article_id|               price|sales_channel_id|
+----------+--------------------+----------+--------------------+----------------+
|2018-12-01|000782c5ed3f985d9...| 573085020|0.033881355932203...|               1|
|2018-12-01|0019bbc42b332e285...| 715730001|0.050830508474576...|               2|
|2018-12-01|0021f97b775f64436...| 569562005|0.036000000000000...|               1|
|2018-12-01|002e9cd227bbef22c...| 633136009|0.033881355932203...|               2|
|2018-12-01|0038b16b41bf785c0...| 661435002|0.084728813559322...|               2|
+----------+--------------------+----------+--------------------+----------------+
only showing top 5 rows



                                                                                