In [1]:
import findspark
findspark.init()

import pandas as pd 
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql.window import Window

from pyspark.sql.functions import udf

In [2]:
def equivalent_type(f):
    if f == 'datetime64[ns]': return DateType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    try: typo = equivalent_type(format_type)
    except: typo = StringType()
    return StructField(string, typo)


# Given pandas dataframe, it will return a spark's dataframe.
def pandas_to_spark(pandas_df,sparkSession):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    i = 0
    for column, typo in zip(columns, types): 
        struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sparkSession.createDataFrame(pandas_df, p_schema)

In [3]:
# start spark engine 
conf = pyspark.SparkConf().setAppName('tes_spark').setMaster('local')
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [4]:
spark

# load dataset 

In [69]:
# customer information 
df_cli = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/data/clients.csv")

In [None]:
df_cli.select(fn.count("client_id"),fn.countDistinct("client_id")).show()

In [46]:
df_cli.limit(3).toPandas()

Unnamed: 0,client_id,first_issue_date,first_redeem_date,age,gender
0,000012768d,2017-08-05 15:40:48,2018-01-04 19:30:07,45,U
1,000036f903,2017-04-10 13:54:23,2017-04-23 12:37:56,72,F
2,000048b7a6,2018-12-15 13:33:11,,68,F


In [5]:
# customer flag 
df_up_train = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/data/uplift_train.csv")

In [58]:
df_up_train.limit(3).toPandas()

Unnamed: 0,client_id,treatment_flg,target
0,000012768d,0,1
1,000036f903,1,1
2,00010925a5,1,1


In [None]:
df_up_train.select(fn.count("client_id"),fn.countDistinct("client_id")).show()

In [56]:
# custommer id for testing
df_up_test = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/data/uplift_test.csv")

In [57]:
df_up_test.limit(3).toPandas()

Unnamed: 0,client_id
0,000048b7a6
1,000073194a
2,00007c7133


In [None]:
df_up_test.select(fn.count("client_id"),fn.countDistinct("client_id")).show()

In [12]:
# product information
df_pro = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/data/products.csv")

In [42]:
df_pro.groupby('level_1','level_2','level_3','level_4').agg(fn.count('product_id').alias('total')).sort("total").show()

+----------+----------+----------+----------+-----+
|   level_1|   level_2|   level_3|   level_4|total|
+----------+----------+----------+----------+-----+
|ec62ce61e3|36bd2cad67|edbe75f28a|95187f1e43|    1|
|c3d3a8e8c6|fb84f08028|4603fa9fa5|510cfe0b63|    1|
|ec62ce61e3|a75f1bad01|ebfe092e6a|df5475c6f1|    1|
|ec62ce61e3|3ef03403a0|9b3d1f6cb1|6eae515849|    1|
|c3d3a8e8c6|fb84f08028|4603fa9fa5|009b538c18|    1|
|ec62ce61e3|4202626fcb|a328a6cbdb|1d84a42993|    1|
|c3d3a8e8c6|428e08386e|b62e37b39d|48254e6a9b|    1|
|ec62ce61e3|6b4ae3f25d|a5b820d60d|fa40a7dbd2|    1|
|e344ab2e71|b0c4967fce|eac3f82414|877ad83201|    1|
|c3d3a8e8c6|de6f3b925a|cf854d5a22|fca87d4254|    1|
|c3d3a8e8c6|f2333c90fb|78800e7c84|6041123c2f|    1|
|e344ab2e71|703f4b6eb0|0c37077fa0|ed84e1ad23|    1|
|ec62ce61e3|e8705574ff|4ea774ea51|1f2d0f578d|    1|
|c3d3a8e8c6|ad2b2e17d2|eda7b2976b|690cbc6ce7|    1|
|c3d3a8e8c6|034aca0659|b67737054d|8b97a649bb|    1|
|e344ab2e71|ed2ad1797c|57f95167c1|48aaaf19eb|    1|
|ec62ce61e3|

In [45]:
df_pro.columns

['product_id',
 'level_1',
 'level_2',
 'level_3',
 'level_4',
 'segment_id',
 'brand_id',
 'vendor_id',
 'netto',
 'is_own_trademark',
 'is_alcohol']

In [124]:
df_pro.select(fn.countDistinct('product_id')).show()

+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                     43038|
+--------------------------+



In [None]:
df_pro.select(fn.count("product_id"),fn.countDistinct("product_id")).show()

In [None]:
df_pro.limit(3).toPandas()

In [47]:
# purchase transactional data 
df_pur = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/data/purchases.csv")

# Data inspection null checking etc 

In [None]:
# df_pur.select(fn.date_format(fn.col('ts'),"yyyy-MM-dd").alias('ts').cast("date"))

In [None]:
df_pur.limit(3).toPandas()

In [None]:
# days observation
df_pur.select(fn.countDistinct("date")).show()

In [None]:
# start and end data observation 
df_pur.select(fn.max("date"), fn.min("date")).show()

In [None]:
# count how many unique products and client 
df_pur.select(fn.count("client_id"),fn.countDistinct("client_id"), fn.countDistinct('product_id')).show()

In [None]:
df_pur.select(fn.countDistinct("transaction_id")).show()

In [None]:
df_pur.limit(3).toPandas()

In [None]:
# count total spending for each customer
df_pur_gb = df_pur.groupby('client_id').agg(fn.sum('purchase_sum').alias('purchase_sum'))

In [None]:
df_pur_gb.limit(3).toPandas()

In [None]:
df_cli.limit(3).toPandas()

In [None]:
df_cli.createTempView('tmp')

In [None]:
qry0 = """
select gender_flag, count(1)n  from (
    select 
    case when first_issue_date is null then 1 else 0 end fid_flag,
    case when first_redeem_date is null then 1 else 0 end frd_flag,
    case when age is null then 1 else 0 end age_flag,
    case when gender is null then 1 else 0 end gender_flag
    from tmp 
)x
group by 1
"""

# qry0 = "select first_issue_date from tmp limit 10"
df_tmp = spark.sql(qry0)

In [None]:
df_tmp.limit(3).toPandas()

In [None]:
df_cli.select(fn.count("first_issue_date")).show()

In [None]:
df_up_train.select([fn.count(fn.when(fn.isnan(c) | fn.col(c).isNull(), c)).alias(c) for c in df_up_train.columns]).show()

In [None]:
df_up_train.limit(3).toPandas()

In [None]:
df_up_train_summ = df_up_train.groupby('treatment_flg', 'target').agg(fn.count('client_id').alias('n_client'))

In [None]:
df_up_train_summ.show()

# Feature generator

In [48]:
#extrach month from datetime 
df_pur = df_pur.withColumn("date", fn.to_date(fn.col("transaction_datetime")))
df_pur = df_pur.withColumn("month", fn.date_format(fn.col("transaction_datetime"), "M"))
df_pur = df_pur.withColumn('day',fn.dayofmonth(df_pur.date))
# df_pur = df_pur.withColumn("month", fn.date_format(fn.col("transaction_datetime"), "M")) # day of year 

In [35]:
df_pur.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- transaction_datetime: string (nullable = true)
 |-- regular_points_received: double (nullable = true)
 |-- express_points_received: double (nullable = true)
 |-- regular_points_spent: double (nullable = true)
 |-- express_points_spent: double (nullable = true)
 |-- purchase_sum: double (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_quantity: double (nullable = true)
 |-- trn_sum_from_iss: double (nullable = true)
 |-- trn_sum_from_red: double (nullable = true)
 |-- month: string (nullable = true)
 |-- date: date (nullable = true)
 |-- day: integer (nullable = true)



# regular and express points 

In [49]:
# collect unique point per transaction 
df_trans_agg = df_pur.select('client_id','transaction_id','regular_points_received','express_points_received','regular_points_spent','express_points_spent','purchase_sum','month')\
            .groupby('client_id','transaction_id','month')\
            .agg(fn.count('transaction_id').alias('n_trans'),fn.avg('regular_points_received').alias('s_reg_pts_rec'),fn.avg('express_points_received').alias('s_exp_pts_rec')
                ,fn.avg('regular_points_spent').alias('s_reg_pts_sp'),fn.avg('express_points_spent').alias('s_exp_pts_sp'), fn.avg('purchase_sum').alias('s_purchase_sum')
                )

df_trans_agg = df_trans_agg.groupby('client_id','month').agg(fn.sum('s_reg_pts_rec').alias('s_reg_pts_rec'),fn.sum('s_exp_pts_rec').alias('s_exp_pts_rec'),
                                                             fn.sum('s_reg_pts_sp').alias('s_reg_pts_sp'),fn.sum('s_exp_pts_sp').alias('s_exp_pts_sp'),
                                                             fn.sum('s_purchase_sum').alias('s_purchase_sum'), fn.countDistinct(fn.col('transaction_id')).alias('n_trans'))

In [17]:
# total regular point received

# pivot month
df_reg_pts_rec= df_trans_agg.groupBy("client_id").pivot("month").sum("s_reg_pts_rec")

#rename columns 
for i in range(1,len(df_reg_pts_rec.columns)):
    df_reg_pts_rec = df_reg_pts_rec.withColumnRenamed(df_reg_pts_rec.columns[i], "m_{}".format(df_reg_pts_rec.columns[i]))
    
    
df_reg_pts_rec.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_reg_pts_pur.csv")

In [29]:
# total express point received

# pivot month
df_exp_pts_rec= df_trans_agg.groupBy("client_id").pivot("month").sum("s_exp_pts_rec")

#rename columns 
for i in range(1,len(df_exp_pts_rec.columns)):
    df_exp_pts_rec = df_exp_pts_rec.withColumnRenamed(df_reg_pts_rec.columns[i], "m_{}".format(df_reg_pts_rec.columns[i]))
    
    
df_exp_pts_rec.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_exp_pts_pur.csv")

In [32]:
# total regular purchased

# pivot month
df_pur_sum = df_trans_agg.groupBy("client_id").pivot("month").sum("s_purchase_sum")

#rename columns 
for i in range(1,len(df_pur_sum.columns)):
    df_pur_sum = df_pur_sum.withColumnRenamed(df_pur_sum.columns[i], "m_{}_ps".format(df_pur_sum.columns[i]))
    
    
df_pur_sum.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_sum.csv")

In [33]:
# total regular point spend

# pivot month
df_reg_pts_spt = df_trans_agg.groupBy("client_id").pivot("month").sum("s_reg_pts_sp")

#rename columns 
for i in range(1,len(df_reg_pts_spt.columns)):
    df_reg_pts_spt = df_reg_pts_spt.withColumnRenamed(df_reg_pts_spt.columns[i], "m_{}_ps".format(df_reg_pts_spt.columns[i]))
    
    
df_reg_pts_spt.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_reg_pts_spt.csv")

In [36]:
# total express point spend

# pivot month
df_exp_pts_spt = df_trans_agg.groupBy("client_id").pivot("month").sum("s_exp_pts_sp")

#rename columns 
for i in range(1,len(df_exp_pts_spt.columns)):
    df_exp_pts_spt = df_exp_pts_spt.withColumnRenamed(df_exp_pts_spt.columns[i], "m_{}_eps".format(df_exp_pts_spt.columns[i]))
    
    
df_exp_pts_spt.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_exp_pts_spt.csv")

In [None]:
df_reg_pts_rec.limit(3).toPandas()

Unnamed: 0,client_id,1,11,12,2,3
0,08d1b7df10,3.9,,6.9,7.0,4.8
1,6badd3c893,0.2,0.7,1.7,8.4,0.2
2,3375fc142e,16.5,1.9,23.3,15.2,19.6


# Product popularity 

In [94]:
# top product transactions each months 
df_top_product =df_pur.select('client_id','product_id','purchase_sum','month').groupby('client_id','month','product_id')\
                .agg(fn.count('product_id').alias('n_product'),fn.sum('purchase_sum').alias('s_purchase'))

# adding row number on purchase and quantity 
df_top_product = df_top_product.withColumn("rank_pur", fn.row_number().over(Window.partitionBy("client_id",'month').orderBy(fn.col("s_purchase").desc())))
df_top_product = df_top_product.withColumn("rank_qty", fn.row_number().over(Window.partitionBy("client_id",'month').orderBy(fn.col("n_product").desc())))

# filter top product by purchase
df_top_product_purchase = df_top_product.filter(df_top_product.rank_pur == 1)

# filter top product by quantity
df_top_product_qty = df_top_product.filter(df_top_product.rank_qty == 1)

In [89]:
# pivot product by purchse 
df_top_product_purchase_gb = df_top_product_purchase.groupby("client_id").pivot("month").agg(fn.first(fn.col('product_id')))

#rename columns 
for i in range(1,len(df_top_product_purchase_gb.columns)):
    df_top_product_purchase_gb = df_top_product_purchase_gb.withColumnRenamed(df_top_product_purchase_gb.columns[i], "m_{}_prd_pur".format(df_top_product_purchase_gb.columns[i]))
    
df_top_product_purchase_gb.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_prd_val.csv")

In [90]:
# pivot product by qty 
df_top_product_qty = df_top_product_qty.groupby("client_id").pivot("month").agg(fn.first(fn.col('product_id')))

#rename columns 
for i in range(1,len(df_top_product_qty.columns)):
    df_top_product_qty = df_top_product_qty.withColumnRenamed(df_top_product_qty.columns[i], "m_{}_prd_qty".format(df_top_product_qty.columns[i]))

df_top_product_qty.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_prd_qty_val.csv")

In [97]:
# pivot product by sum purchse 
df_top_product_purchase_sum = df_top_product_purchase.groupby("client_id").pivot("month").sum('s_purchase')

#rename columns 
for i in range(1,len(df_top_product_purchase_sum.columns)):
    df_top_product_purchase_sum = df_top_product_purchase_sum.withColumnRenamed(df_top_product_purchase_sum.columns[i], "m_{}_prd_pur_sum".format(df_top_product_purchase_sum.columns[i]))

df_top_product_purchase_sum.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_prd_sum.csv")

In [98]:
# pivot product by total quantity value
df_top_product_qty2 = df_top_product_qty.groupby("client_id").pivot("month").sum('n_product')

#rename columns 
for i in range(1,len(df_top_product_qty2.columns)):
    df_top_product_qty2 = df_top_product_qty2.withColumnRenamed(df_top_product_qty2.columns[i], "m_{}_prd_pur_qty".format(df_top_product_qty2.columns[i]))

df_top_product_qty2.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/pur_prd_qty.csv")

In [86]:
tes.limit(3).toPandas()

Unnamed: 0,client_id,1,11,12,2,3
0,001ecff0a8,68931482c8,,120c2f5f84,f4599ca21a,4009f09b04
1,006391ff01,4009f09b04,,,4009f09b04,4009f09b04
2,0068dd084d,075b06cce4,4009f09b04,4009f09b04,a396cc6b08,3ad8062e82


In [81]:
df_top_product_purchase.filter(df_top_product.client_id == '0004d028a5').show()

+----------+-----+----------+---------+----------+--------+--------+
| client_id|month|product_id|n_product|s_purchase|rank_pur|rank_qty|
+----------+-----+----------+---------+----------+--------+--------+
|0004d028a5|   12|89c0fb09aa|        1|    349.79|       1|       1|
|0004d028a5|    3|c2749ad87a|        2|    760.96|       1|       1|
|0004d028a5|    2|f4599ca21a|        2|   1647.75|       1|       1|
|0004d028a5|    1|bc68469f16|        2|     498.0|       1|       1|
+----------+-----+----------+---------+----------+--------+--------+



In [68]:
df_top_product.filter(df_top_product.client_id == '0004d028a5').show()

+----------+-----+----------+---------+----------+--------+--------+
| client_id|month|product_id|n_product|s_purchase|rank_pur|rank_qty|
+----------+-----+----------+---------+----------+--------+--------+
|0004d028a5|   12|5186e12ff4|        1|     191.0|       1|       1|
|0004d028a5|   12|e1387ef699|        1|     191.0|       2|       2|
|0004d028a5|   12|89c0fb09aa|        1|    349.79|       3|       3|
|0004d028a5|   12|aa9230de5b|        1|    349.79|       4|       4|
|0004d028a5|   12|4a29330c8d|        1|    349.79|       5|       5|
|0004d028a5|    3|83c0f480db|        1|    265.96|       1|       1|
|0004d028a5|    3|f0a594c841|        1|    265.96|       2|       2|
|0004d028a5|    3|3f76e5bebd|        1|    265.96|       3|       3|
|0004d028a5|    3|0bff7a124a|        1|    398.26|       4|       4|
|0004d028a5|    3|f1b1bb97f2|        1|    398.26|       5|       5|
|0004d028a5|    3|6372e8152f|        1|    398.26|       6|       6|
|0004d028a5|    3|c8ce1d0d31|     

In [64]:
df_top_product.limit(3).toPandas()

Unnamed: 0,client_id,month,product_id,n_product,s_purchase,rank_pur,rank_qty
0,0004d028a5,12,5186e12ff4,1,191.0,1,1
1,0004d028a5,12,e1387ef699,1,191.0,2,2
2,0004d028a5,12,89c0fb09aa,1,349.79,3,3


In [99]:
df_top_product.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- month: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- n_product: long (nullable = false)
 |-- s_purchase: double (nullable = true)
 |-- rank_pur: integer (nullable = true)
 |-- rank_qty: integer (nullable = true)



# store popularity 

In [100]:
df_pur.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- transaction_datetime: string (nullable = true)
 |-- regular_points_received: double (nullable = true)
 |-- express_points_received: double (nullable = true)
 |-- regular_points_spent: double (nullable = true)
 |-- express_points_spent: double (nullable = true)
 |-- purchase_sum: double (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_quantity: double (nullable = true)
 |-- trn_sum_from_iss: double (nullable = true)
 |-- trn_sum_from_red: double (nullable = true)
 |-- date: date (nullable = true)
 |-- month: string (nullable = true)
 |-- day: integer (nullable = true)



In [129]:
# tracsation summary 

df_store_gb = df_pur.select('client_id','transaction_id','month','store_id','purchase_sum','product_id').groupby('client_id','month','transaction_id','store_id')\
                    .agg(fn.avg('purchase_sum').alias('s_purchase_sum'), fn.countDistinct('product_id').alias('n_prod_id'))

# 
df_store_gb = df_store_gb.select('client_id','month','transaction_id','store_id','s_purchase_sum','n_prod_id').groupby('client_id','month','store_id')\
              .agg(fn.sum('s_purchase_sum').alias('s_purchase_sum'), fn.avg('n_prod_id').alias('avg_n_prod_id'))                  

# unique product per store 
df_store_gb2 = df_pur.select('client_id','store_id','product_id').groupby('client_id','store_id').agg(fn.countDistinct('product_id').alias('n_product'))
df_store_gb2 = df_store_gb2.withColumnRenamed('client_id','client_id2').withColumnRenamed('store_id','store_id2')

# join with with unique product for each store 
df_store_gb = df_store_gb.join(df_store_gb2, (df_store_gb.client_id == df_store_gb2.client_id2) & (df_store_gb.store_id == df_store_gb2.store_id2) , 'inner')

df_store_gb = df_store_gb.drop('client_id2','store_id2')

# put ranking 
df_store_gb_rank = df_store_gb.withColumn("rank_s_purchase", fn.row_number().over(Window.partitionBy("client_id",'month').orderBy(fn.col("s_purchase_sum").desc())))
df_store_gb_rank = df_store_gb_rank.withColumn("rank_prod_qty", fn.row_number().over(Window.partitionBy("client_id",'month').orderBy(fn.col("n_product").desc())))

# filter top product by purchase
df_store_gb_top_pur = df_store_gb_rank.filter(df_store_gb_rank.rank_s_purchase == 1)

# filter top product by quantity
df_store_gb_top_qty = df_store_gb_rank.filter(df_store_gb_rank.rank_prod_qty == 1)

In [131]:
# pivot product by store top purchase  (store) 
df_store_gb_top_pur_pvt = df_store_gb_top_pur.groupby("client_id").pivot("month").agg(fn.first(fn.col('store_id')))

#rename columns 
for i in range(1,len(df_store_gb_top_pur_pvt.columns)):
    df_store_gb_top_pur_pvt = df_store_gb_top_pur_pvt.withColumnRenamed(df_store_gb_top_pur_pvt.columns[i], "m_{}_str_top_pur".format(df_store_gb_top_pur_pvt.columns[i]))
    
df_store_gb_top_pur_pvt.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/str_top_pur.csv")

In [132]:
# pivot product by store top purchase  (store) 
df_store_gb_top_qty_pvt = df_store_gb_top_qty.groupby("client_id").pivot("month").agg(fn.first(fn.col('store_id')))

#rename columns 
for i in range(1,len(df_store_gb_top_qty_pvt.columns)):
    df_store_gb_top_qty_pvt = df_store_gb_top_qty_pvt.withColumnRenamed(df_store_gb_top_qty_pvt.columns[i], "m_{}_str_top_qty".format(df_store_gb_top_qty_pvt.columns[i]))
    
df_store_gb_top_qty_pvt.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/str_top_qty.csv")

In [134]:
# pivot product by store top purchase  (store purchase value ) 
df_store_gb_top_pur_val_pvt = df_store_gb_top_pur.groupby("client_id").pivot("month").sum('s_purchase_sum')


#rename columns 
for i in range(1,len(df_store_gb_top_pur_val_pvt.columns)):
    df_store_gb_top_pur_val_pvt = df_store_gb_top_pur_val_pvt.withColumnRenamed(df_store_gb_top_pur_val_pvt.columns[i], "m_{}_str_top_pur_val".format(df_store_gb_top_pur_val_pvt.columns[i]))
    
df_store_gb_top_pur_val_pvt.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/str_top_pur_val.csv")

In [136]:
# pivot product by store top purchase  (store purchase value ) 
df_store_gb_top_qty_val_pvt = df_store_gb_top_qty.groupby("client_id").pivot("month").sum('n_product')

#rename columns 
for i in range(1,len(df_store_gb_top_qty_val_pvt.columns)):
    df_store_gb_top_qty_val_pvt = df_store_gb_top_qty_val_pvt.withColumnRenamed(df_store_gb_top_qty_val_pvt.columns[i], "m_{}_str_top_qty_val".format(df_store_gb_top_qty_val_pvt.columns[i]))
    
df_store_gb_top_qty_val_pvt.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/str_top_qty_val.csv")

In [135]:
df_store_gb_top_pur.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- month: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- s_purchase_sum: double (nullable = true)
 |-- avg_n_prod_id: double (nullable = true)
 |-- n_product: long (nullable = false)
 |-- rank_s_purchase: integer (nullable = true)
 |-- rank_prod_qty: integer (nullable = true)



In [109]:
df_store_gb.count() # 2729515

2729515

In [123]:
df_store_gb.limit(10).toPandas()

Unnamed: 0,client_id,month,store_id,s_purchase_sum,avg_n_prod_id,n_product
0,000aef1e0b,12,7bfd87d161,565.67,10.0,21
1,000aef1e0b,2,7bfd87d161,569.0,12.0,21
2,000b0559be,11,7dc4e574be,269.97,1.0,39
3,000b0559be,12,7dc4e574be,5837.94,4.888889,39
4,000b0559be,2,7dc4e574be,1440.22,4.0,39
5,000b0559be,1,7dc4e574be,235.0,1.0,39
6,001d004e5e,12,18e9a4401d,25.7,1.0,1
7,001dac232d,12,8d3d83fcc1,924.0,4.0,4
8,003cb63a18,3,72c9bdc485,1474.0,7.75,92
9,003cb63a18,11,72c9bdc485,1020.0,8.333333,92


# Join all features 

In [74]:
# regular points received 
df_rpr = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_reg_pts_rec.csv")

# express points received 
df_epr = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_exp_pts_pur.csv")

# purchase sum 
df_ps = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_sum.csv")

# regular point spent  
df_rps = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_reg_pts_spt.csv/")

# express point spent  
df_eps = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_exp_pts_spt.csv")

# top product spent by quantity 
df_prd_qty = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_prd_qty.csv")

# top product spent by quantity 
df_prd_qty_val = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_prd_qty_val.csv")

# top product spent by sum purchase
df_prd_pur = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_prd_sum.csv")

# top product spent by sum purchase
df_prd_pur_val = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/pur_prd_sum_val.csv")

#  top store by quantity  
df_str_top_pur = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/str_top_pur.csv")

# top store spent store
df_str_top_pur_val = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/str_top_pur_val.csv")

#  top store by quantity  
df_str_top_qty = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/str_top_qty.csv")

#  top store by quantity  
df_str_top_qty_val = spark \
    .read.format("com.databricks.spark.csv")\
    .option("header", "true")\
    .option("inferschema", "true")\
    .option("delimiter", ",")\
    .load("../../data/str_top_qty_val.csv")


In [45]:
df_str_top_pur_val.limit(3).toPandas()

Unnamed: 0,client_id,m_1_str_top_pur_val,m_11_str_top_pur_val,m_12_str_top_pur_val,m_2_str_top_pur_val,m_3_str_top_pur_val
0,02c44238cb,,551.0,1749.0,1484.0,2956.0
1,0690e629e9,3413.43,3833.45,2778.86,2580.0,2634.76
2,0799c8ec86,562.27,,209.0,340.53,91.69


In [95]:
# df_rpr = df_rpr.withColumnRenamed('client_id','client_id2')
# df_features = df_cli.join(df_rpr, df_cli.client_id == df_rpr.client_id2, 'left' )
# df_features = df_features.drop('client_id2')

# join customers label and info 
df_cli = df_cli.withColumnRenamed('client_id','client_id2')
df_features = df_up_train.join(df_cli,df_up_train.client_id == df_cli.client_id2,'inner')
df_features = df_features.drop('client_id2')

# join with rps 
df_rpr = df_rpr.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_rpr, df_features.client_id == df_rpr.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with eps 
df_eps = df_eps.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_eps, df_features.client_id == df_eps.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with ps 
df_ps = df_ps.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_ps, df_features.client_id == df_ps.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with eps 
df_rps = df_rps.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_rps, df_features.client_id == df_rps.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with prd_qty 
df_prd_qty = df_prd_qty.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_prd_qty, df_features.client_id == df_prd_qty.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with prd_purchase
df_prd_pur = df_prd_pur.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_prd_pur, df_features.client_id == df_prd_pur.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with prd_qty_value
df_prd_qty_val = df_prd_qty_val.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_prd_qty_val, df_features.client_id == df_prd_qty_val.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with prd_pur_value
df_prd_pur_val = df_prd_pur_val.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_prd_pur_val, df_features.client_id == df_prd_pur_val.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with top top store quantity 
df_str_top_qty = df_str_top_qty.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_str_top_qty, df_features.client_id == df_str_top_qty.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with top top store purchase 
df_str_top_pur = df_str_top_pur.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_str_top_pur, df_features.client_id == df_str_top_pur.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with top top store quantity value 
df_str_top_qty_val = df_str_top_qty_val.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_str_top_qty_val, df_features.client_id == df_str_top_qty_val.client_id2, 'left' )
df_features = df_features.drop('client_id2')

# join with top top store quantity value 
df_str_top_pur_val = df_str_top_pur_val.withColumnRenamed('client_id','client_id2')
df_features = df_features.join(df_str_top_pur_val, df_features.client_id == df_str_top_pur_val.client_id2, 'left' )
df_features = df_features.drop('client_id2')

In [96]:
df_features.limit(3).toPandas()

Unnamed: 0,client_id,treatment_flg,target,first_issue_date,first_redeem_date,age,gender,m_1_rpr,m_11_rpr,m_12_rpr,...,m_1_str_top_qty_val,m_11_str_top_qty_val,m_12_str_top_qty_val,m_2_str_top_qty_val,m_3_str_top_qty_val,m_1_str_top_pur_val,m_11_str_top_pur_val,m_12_str_top_pur_val,m_2_str_top_pur_val,m_3_str_top_pur_val
0,00f6cab0d9,0,1,2017-09-14 15:27:21,2017-12-25 16:27:54,48,U,17.6,,26.5,...,83,,83,83,83,1811.24,,2844.54,1360.99,1732.33
1,010c5002de,1,1,2018-10-24 12:43:19,2018-11-12 21:39:30,27,M,3.2,3.0,8.6,...,40,40.0,40,40,40,749.84,307.56,1537.7,764.03,728.48
2,018253c9e4,0,0,2018-01-19 18:00:10,2018-11-29 15:13:23,78,U,16.4,0.0,28.6,...,39,39.0,39,39,28,972.0,309.0,2196.0,813.0,223.0


In [97]:
df_features.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- treatment_flg: integer (nullable = true)
 |-- target: integer (nullable = true)
 |-- first_issue_date: string (nullable = true)
 |-- first_redeem_date: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- m_1_rpr: double (nullable = true)
 |-- m_11_rpr: double (nullable = true)
 |-- m_12_rpr: double (nullable = true)
 |-- m_2_rpr: double (nullable = true)
 |-- m_3_rpr: double (nullable = true)
 |-- m_1_eps: double (nullable = true)
 |-- m_11_eps: double (nullable = true)
 |-- m_12_eps: double (nullable = true)
 |-- m_2_eps: double (nullable = true)
 |-- m_3_eps: double (nullable = true)
 |-- m_1_ps: double (nullable = true)
 |-- m_11_ps: double (nullable = true)
 |-- m_12_ps: double (nullable = true)
 |-- m_2_ps: double (nullable = true)
 |-- m_3_ps: double (nullable = true)
 |-- m_1_rps: double (nullable = true)
 |-- m_11_rps: double (nullable = true)
 |-- m_12_rps: double (nullable =

In [None]:
df_features.repartition(1).write.option("header",True).csv("D:/works/master_tilburg/dss/thesis/data/feature_stg1.csv")

In [82]:
df_features.select(fn.count(fn.col('client_id')),fn.countDistinct(fn.col('client_id'))).show()

+----------------+-------------------------+
|count(client_id)|count(DISTINCT client_id)|
+----------------+-------------------------+
|          200039|                   200039|
+----------------+-------------------------+



In [70]:
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]

columns= ["Product","Amount","Country"]
df = spark.createDataFrame(data = data, schema = columns)

In [71]:
df.show(3)



+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
| Banana|  1000|    USA|
|Carrots|  1500|    USA|
|  Beans|  1600|    USA|
+-------+------+-------+
only showing top 3 rows



In [None]:
pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)



In [None]:
pivotDF.limit(3).toPandas()

Unnamed: 0,Product,Canada,China,Mexico,USA
0,Orange,,4000,,4000
1,Beans,,1500,2000.0,1600
2,Banana,2000.0,400,,1000


In [None]:
df_trans_agg.limit(3).toPandas()

In [None]:
df_trans_agg.select(fn.countDistinct(fn.col('month'))).show()

In [None]:
df_reg_pts_rec.select(fn.count(fn.col('transaction_id')),fn.countDistinct(fn.col('transaction_id'))).show()

In [None]:
df_pur.limit(3).toPandas()

In [137]:
spark.stop()