In [1]:
import findspark
findspark.init()
import datetime as dt

import pandas as pd 
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql.window import Window

from pyspark.sql.functions import udf

In [2]:
def equivalent_type(f):
    """Map a pandas/numpy column dtype to the Spark SQL type used for it.

    Parameters
    ----------
    f : dtype or str
        A column dtype as found in ``pandas_df.dtypes``; it is compared against
        dtype names such as ``'int64'``.

    Returns
    -------
    pyspark.sql.types.DataType
        ``DateType`` for ``datetime64[ns]``, ``LongType`` for ``int64``,
        ``IntegerType`` for ``int32``, ``DoubleType`` for ``float64`` and
        ``StringType`` for anything else.
    """
    # The comparisons are kept as an ``==`` chain on purpose: a numpy dtype
    # compares equal to its name, but does not hash like the string, so a
    # dict lookup keyed by name would not behave the same way.
    if f == 'datetime64[ns]':
        # NOTE(review): DateType stores a calendar date only; if the pandas
        # column carries a time of day, TimestampType may be wanted - confirm.
        return DateType()
    if f == 'int64':
        return LongType()
    if f == 'int32':
        return IntegerType()
    if f == 'float64':
        # float64 is a 64-bit float; Spark's FloatType is 32-bit single
        # precision and would round the values, so DoubleType is used.
        return DoubleType()
    return StringType()

def define_structure(string, format_type):
    """Build the Spark ``StructField`` describing one pandas column.

    Parameters
    ----------
    string : str
        Column name.
    format_type : dtype or str
        The column's pandas dtype, passed to ``equivalent_type``.

    Returns
    -------
    pyspark.sql.types.StructField
        Field named ``string`` with the mapped Spark type; falls back to
        ``StringType`` if the dtype lookup raises.
    """
    try:
        typo = equivalent_type(format_type)
    except Exception:
        # Best-effort fallback for dtypes the mapping cannot compare.
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt and
        # SystemExit still propagate.
        typo = StringType()
    return StructField(string, typo)


# Given a pandas dataframe, return the equivalent Spark dataframe.
def pandas_to_spark(pandas_df, sparkSession):
    """Convert a pandas DataFrame to a Spark DataFrame with an explicit schema.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
        Source frame; one schema field is built per column from its dtype.
    sparkSession : pyspark.sql.SparkSession
        Session used to create the Spark DataFrame.

    Returns
    -------
    pyspark.sql.DataFrame
        Result of ``sparkSession.createDataFrame(pandas_df, schema)``.
    """
    # One StructField per column, in column order.
    struct_list = [
        define_structure(column, typo)
        for column, typo in zip(pandas_df.columns, pandas_df.dtypes)
    ]
    p_schema = StructType(struct_list)
    return sparkSession.createDataFrame(pandas_df, p_schema)

In [3]:
# Start the Spark engine: configure a local-master application, obtain (or
# reuse) the SparkContext for it, and wrap it in a SparkSession.
conf = pyspark.SparkConf()
conf.setAppName('tes_spark')
conf.setMaster('local')
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [154]:
# Customer information: comma-delimited CSV with a header row; column types
# are inferred by Spark.
df_cli = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true", delimiter=",")
    .load("../../data/data/clients.csv")
)

In [5]:
# Customer flags for the training set: comma-delimited CSV with a header row;
# column types are inferred by Spark.
df_up_train = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true", delimiter=",")
    .load("../../data/data/uplift_train.csv")
)

In [6]:
# Customer ids for testing: comma-delimited CSV with a header row; column
# types are inferred by Spark.
df_up_test = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true", delimiter=",")
    .load("../../data/data/uplift_test.csv")
)

In [7]:
# Product information: comma-delimited CSV with a header row; column types
# are inferred by Spark.
df_pro = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true", delimiter=",")
    .load("../../data/data/products.csv")
)

In [8]:
# Purchase transactional data: comma-delimited CSV with a header row; column
# types are inferred by Spark.
df_pur = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true", delimiter=",")
    .load("../../data/data/purchases.csv")
)

In [165]:
# Read back the feature file from the same path the export cell of this
# notebook writes to, so that export must have been run first.
# NOTE(review): absolute, machine-specific path - consider a configurable
# data directory.
tes_df = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferschema="true", delimiter=",")
    .load("D:/works/master_tilburg/dss/thesis/data/feature_stg2.csv")
)

In [167]:
# Preview the first three rows of the reloaded feature file as a pandas frame.
tes_df.limit(3).toPandas()

Unnamed: 0,client_id,regular_points_received,express_points_spent,purchase_sum,avg_n_prod,avg_n_prod_qty,trn_sum_from_iss,trn_sum_from_red,n_transaction,store_id_pur,s_purchase_sum,store_id_pur_qty,store_n_product,product_pur,s_purchase,product_qty,n_product
0,02429418df,37.3,0.0,4156.77,8.5,62.0,539.914286,0.0,6,d09acf8114,1349.0,2fe93e36be,21,4009f09b04,3190.7,4009f09b04,3
1,02d6c08e7d,5.9,0.0,747.0,1.888889,30.0,500.583333,0.0,9,7763d9b151,721.0,7763d9b151,8,21e8f864ff,506.0,21e8f864ff,7
2,03f35da9a5,30.2,0.0,5661.85,3.647059,87.0,1455.9,0.0,17,04d336aec5,5661.85,04d336aec5,47,222c727a1d,2090.06,222c727a1d,4


In [32]:
# Scratch check: build the date 2019-11-20 (the date part of the fill value
# used for missing first_redeem_date below).
dt.date(2019, 11, 20)

datetime.date(2019, 11, 20)

In [31]:
# Scratch check: build the timestamp 2019-11-20 12:37:56, the literal used to
# fill missing first_redeem_date values below.
dt.datetime(2019, 11, 20, 12,37,56)

datetime.datetime(2019, 11, 20, 12, 37, 56)

# start 

In [155]:
# Fill missing first_redeem_date values with a fixed timestamp (described by
# the author as the max date), then reduce the column to a calendar date.
# The filled column is built under a temporary name and swapped in, which
# leaves first_redeem_date as the last column of df_cli.
redeem_or_default = (
    fn.when(fn.col('first_redeem_date').isNull(), dt.datetime(2019, 11, 20, 12, 37, 56))
    .otherwise(fn.col('first_redeem_date'))
)
df_cli = (
    df_cli
    .withColumn('first_redeem_date2', redeem_or_default)
    .drop('first_redeem_date')
    .withColumnRenamed('first_redeem_date2', 'first_redeem_date')
    .withColumn("first_redeem_date", fn.to_date(fn.col("first_redeem_date")))
)

# Purchases: treat a missing trn_sum_from_red as 0.
df_pur = df_pur.fillna(value=0, subset=["trn_sum_from_red"])

In [146]:
# Inspect the client schema; in the recorded output first_redeem_date is a
# date column and sits last.
df_cli.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- first_issue_date: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- first_redeem_date: date (nullable = true)



In [135]:
# Number of rows in the client table.
df_cli.count()

400162

In [147]:
# Number of rows in the uplift training table.
df_up_train.count()

200039

In [158]:
# Join client and purchase data. The client key is renamed first so the two
# sides of the join carry differently named key columns; the renamed copy is
# dropped again after the join.
df_cli = df_cli.withColumnRenamed('client_id', 'client_id2')

client_redeem = df_cli.select('client_id2', 'first_redeem_date')
df_cli_pur = (
    client_redeem
    .join(df_pur, df_cli.client_id2 == df_pur.client_id, 'inner')
    .drop('client_id2')
    # Compare on calendar dates: the transaction timestamp is reduced to a date.
    .withColumn("transaction_datetime", fn.to_date(fn.col("transaction_datetime")))
    # Despite its name, drop_transaction == 1 marks rows to KEEP (transaction
    # strictly before the first redeem date); 0 marks rows to remove.
    .withColumn(
        'drop_transaction',
        fn.when(fn.col('transaction_datetime') < fn.col('first_redeem_date'), 1).otherwise(0),
    )
)

# Add the training columns (inner join, so only clients present in the
# training table remain).
df_up_train = df_up_train.withColumnRenamed('client_id', 'client_id2')
df_cli_pur = (
    df_cli_pur
    .join(df_up_train, df_cli_pur.client_id == df_up_train.client_id2, 'inner')
    .drop('client_id2')
    # Keep only transactions before the redeem date.
    .filter(fn.col('drop_transaction') == 1)
)

In [151]:
# Distinct clients per (target, treatment_flg, drop_transaction) combination.
# The recorded output contains drop_transaction = 0 rows, so it was produced
# before the drop_transaction == 1 filter in the join cell was applied.
df_cli_pur.groupby('target','treatment_flg','drop_transaction').agg(fn.countDistinct('client_id')).toPandas()

Unnamed: 0,target,treatment_flg,drop_transaction,count(client_id)
0,0,1,1,15032
1,0,0,0,29165
2,1,0,1,19993
3,1,1,1,21141
4,1,0,0,50675
5,1,1,0,53320
6,0,1,0,26732
7,0,0,1,16658


In [159]:
# Feature aggregation, step 1: collapse product rows to one row per
# (client, transaction).
per_transaction_aggs = [
    fn.avg('regular_points_received').alias('regular_points_received'),
    fn.avg('express_points_spent').alias('express_points_spent'),
    fn.avg('purchase_sum').alias('purchase_sum'),
    fn.countDistinct('product_id').alias('n_prod'),
    fn.sum('product_quantity').alias('product_quantity'),
    fn.avg('trn_sum_from_iss').alias('trn_sum_from_iss'),
    fn.avg('trn_sum_from_red').alias('trn_sum_from_red'),
]
df_feature = df_cli_pur.groupby('client_id', 'transaction_id').agg(*per_transaction_aggs)

# Step 2: roll the per-transaction rows up to one row per client.
# NOTE(review): 'avg_n_prod_qty' is computed with fn.sum, i.e. it is the total
# product quantity, not an average - confirm whether the name or the
# aggregation is the intended one.
per_client_aggs = [
    fn.sum('regular_points_received').alias('regular_points_received'),
    fn.sum('express_points_spent').alias('express_points_spent'),
    fn.sum('purchase_sum').alias('purchase_sum'),
    fn.avg('n_prod').alias('avg_n_prod'),
    fn.sum('product_quantity').alias('avg_n_prod_qty'),
    fn.sum('trn_sum_from_iss').alias('trn_sum_from_iss'),
    fn.sum('trn_sum_from_red').alias('trn_sum_from_red'),
    fn.countDistinct('transaction_id').alias('n_transaction'),
]
df_feature = df_feature.groupby('client_id').agg(*per_client_aggs)



In [160]:
# Top product per client.
# One row per (client, product): n_product is the number of purchase rows for
# that product, s_purchase the sum of purchase_sum over those rows.
# NOTE(review): the per-transaction aggregation elsewhere in this notebook
# averages purchase_sum within a transaction, which suggests it is a
# basket-level total; if so, s_purchase adds up whole-basket totals rather
# than spend on this product - confirm.
df_top_product = (
    df_cli_pur
    .select('client_id', 'product_id', 'purchase_sum')
    .groupby('client_id', 'product_id')
    .agg(fn.count('product_id').alias('n_product'), fn.sum('purchase_sum').alias('s_purchase'))
)

# Rank products within each client by purchase amount and by row count.
# product_id is added as a secondary sort key: row_number() over a non-unique
# ordering assigns tied rows in arbitrary order, so without a tie-breaker the
# "top" product could change between runs.
by_client = Window.partitionBy("client_id")
df_top_product = df_top_product.withColumn(
    "rank_pur",
    fn.row_number().over(by_client.orderBy(fn.col("s_purchase").desc(), fn.col("product_id"))),
)
df_top_product = df_top_product.withColumn(
    "rank_qty",
    fn.row_number().over(by_client.orderBy(fn.col("n_product").desc(), fn.col("product_id"))),
)

# filter top product by purchase
df_top_product_purchase = df_top_product.filter(df_top_product.rank_pur == 1)

# filter top product by quantity
df_top_product_qty = df_top_product.filter(df_top_product.rank_qty == 1)

In [161]:
# Favorite store per client.
# Step 1: one row per (client, transaction, store) with the transaction's
# average purchase_sum and its number of distinct products.
df_store_gb = (
    df_cli_pur
    .select('client_id', 'transaction_id', 'store_id', 'purchase_sum', 'product_id')
    .groupby('client_id', 'transaction_id', 'store_id')
    .agg(fn.avg('purchase_sum').alias('s_purchase_sum'), fn.countDistinct('product_id').alias('n_prod_id'))
)

# Step 2: roll up to one row per (client, store): total purchase amount and
# average number of distinct products per transaction.
df_store_gb = (
    df_store_gb
    .select('client_id', 'transaction_id', 'store_id', 's_purchase_sum', 'n_prod_id')
    .groupby('client_id', 'store_id')
    .agg(fn.sum('s_purchase_sum').alias('s_purchase_sum'), fn.avg('n_prod_id').alias('avg_n_prod_id'))
)

# Unique products per (client, store); keys are renamed so the join below has
# differently named key columns on each side.
df_store_gb2 = (
    df_cli_pur
    .select('client_id', 'store_id', 'product_id')
    .groupby('client_id', 'store_id')
    .agg(fn.countDistinct('product_id').alias('n_product'))
)
df_store_gb2 = df_store_gb2.withColumnRenamed('client_id', 'client_id2').withColumnRenamed('store_id', 'store_id2')

# Join the per-store totals with the unique product count for each store.
df_store_gb = df_store_gb.join(
    df_store_gb2,
    (df_store_gb.client_id == df_store_gb2.client_id2) & (df_store_gb.store_id == df_store_gb2.store_id2),
    'inner',
)

df_store_gb = df_store_gb.drop('client_id2', 'store_id2')

# Rank stores within each client by purchase amount and by unique products.
# store_id is added as a secondary sort key: row_number() over a non-unique
# ordering assigns tied rows in arbitrary order, so without a tie-breaker the
# "top" store could change between runs.
by_client_store = Window.partitionBy("client_id")
df_store_gb_rank = df_store_gb.withColumn(
    "rank_s_purchase",
    fn.row_number().over(by_client_store.orderBy(fn.col("s_purchase_sum").desc(), fn.col("store_id"))),
)
df_store_gb_rank = df_store_gb_rank.withColumn(
    "rank_prod_qty",
    fn.row_number().over(by_client_store.orderBy(fn.col("n_product").desc(), fn.col("store_id"))),
)


# filter top store by purchase
df_store_gb_top_pur = df_store_gb_rank.filter(df_store_gb_rank.rank_s_purchase == 1)

# filter top store by number of unique products
df_store_gb_top_qty = df_store_gb_rank.filter(df_store_gb_rank.rank_prod_qty == 1)

In [162]:
# Attach the top-store and top-product features to the per-client feature
# table. Each step follows the same pattern: rename the right-hand key to
# client_id2, left-join on it, drop the duplicate key, then rename the joined
# columns to their feature names.

# Top store by purchase amount -> store_id_pur, s_purchase_sum
df_store_gb_top_pur = df_store_gb_top_pur.withColumnRenamed('client_id', 'client_id2')
df_feature = (
    df_feature
    .join(
        df_store_gb_top_pur.select('client_id2', 'store_id', 's_purchase_sum'),
        df_feature.client_id == df_store_gb_top_pur.client_id2,
        'left',
    )
    .drop('client_id2')
    .withColumnRenamed('store_id', 'store_id_pur')
)

# Top store by unique products -> store_id_pur_qty, store_n_product
df_store_gb_top_qty = df_store_gb_top_qty.withColumnRenamed('client_id', 'client_id2')
df_feature = (
    df_feature
    .join(
        df_store_gb_top_qty.select('client_id2', 'store_id', 'n_product'),
        df_feature.client_id == df_store_gb_top_qty.client_id2,
        'left',
    )
    .drop('client_id2')
    .withColumnRenamed('store_id', 'store_id_pur_qty')
    .withColumnRenamed('n_product', 'store_n_product')
)

# Top product by purchase amount -> product_pur, s_purchase
df_top_product_purchase = df_top_product_purchase.withColumnRenamed('client_id', 'client_id2')
df_feature = (
    df_feature
    .join(
        df_top_product_purchase.select('client_id2', 'product_id', 's_purchase'),
        df_feature.client_id == df_top_product_purchase.client_id2,
        'left',
    )
    .drop('client_id2')
    .withColumnRenamed('product_id', 'product_pur')
)

# Top product by row count -> product_qty, n_product
df_top_product_qty = df_top_product_qty.withColumnRenamed('client_id', 'client_id2')
df_feature = (
    df_feature
    .join(
        df_top_product_qty.select('client_id2', 'product_id', 'n_product'),
        df_feature.client_id == df_top_product_qty.client_id2,
        'left',
    )
    .drop('client_id2')
    .withColumnRenamed('product_id', 'product_qty')
)

In [164]:
# Export the feature table as CSV with a header row, collapsed to a single
# partition and overwriting any previous export at this path.
# NOTE(review): absolute, machine-specific path - consider a configurable
# data directory.
(
    df_feature
    .repartition(1)
    .write
    .mode('overwrite')
    .option("header", True)
    .csv("D:/works/master_tilburg/dss/thesis/data/feature_stg2.csv")
)

In [132]:
# Preview the first three rows of the assembled feature table.
df_feature.limit(3).show()

+----------+-----------------------+--------------------+-----------------+------------------+--------------+------------------+----------------+-------------+------------+--------------+----------------+---------------+-----------+----------+-----------+---------+
| client_id|regular_points_received|express_points_spent|     purchase_sum|        avg_n_prod|avg_n_prod_qty|  trn_sum_from_iss|trn_sum_from_red|n_transaction|store_id_pur|s_purchase_sum|store_id_pur_qty|store_n_product|product_pur|s_purchase|product_qty|n_product|
+----------+-----------------------+--------------------+-----------------+------------------+--------------+------------------+----------------+-------------+------------+--------------+----------------+---------------+-----------+----------+-----------+---------+
|02429418df|     37.300000000000004|                 0.0|          4156.77|               8.5|          62.0| 539.9142857142857|             0.0|            6|  d09acf8114|        1349.0|      2fe93e36b

In [152]:
# Inspect the column names and types of the assembled feature table.
df_feature.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- regular_points_received: double (nullable = true)
 |-- express_points_spent: double (nullable = true)
 |-- purchase_sum: double (nullable = true)
 |-- avg_n_prod: double (nullable = true)
 |-- avg_n_prod_qty: double (nullable = true)
 |-- trn_sum_from_iss: double (nullable = true)
 |-- trn_sum_from_red: double (nullable = true)
 |-- n_transaction: long (nullable = false)
 |-- store_id_pur: string (nullable = true)
 |-- s_purchase_sum: double (nullable = true)
 |-- store_id_pur_qty: string (nullable = true)
 |-- store_n_product: long (nullable = true)
 |-- product_pur: string (nullable = true)
 |-- s_purchase: double (nullable = true)
 |-- product_qty: string (nullable = true)
 |-- n_product: long (nullable = true)



In [133]:
# Number of rows in the feature table (built with one row per client_id).
df_feature.count()

72824

In [110]:
# Preview the per-client top store by unique products.
# The recorded output still shows a client_id column, so it was produced
# before the feature-join cell renamed that column to client_id2.
df_store_gb_top_qty.limit(3).show()

+----------+----------+------------------+------------------+---------+---------------+-------------+
| client_id|  store_id|    s_purchase_sum|     avg_n_prod_id|n_product|rank_s_purchase|rank_prod_qty|
+----------+----------+------------------+------------------+---------+---------------+-------------+
|02429418df|2fe93e36be|1337.7000000000003|              21.0|       21|              2|            1|
|02d6c08e7d|7763d9b151|             721.0|               2.0|        8|              1|            1|
|03f35da9a5|04d336aec5|           5661.85|3.6470588235294117|       47|              1|            1|
+----------+----------+------------------+------------------+---------+---------------+-------------+



In [107]:
# Print the schemas of both top-store tables. printSchema() prints as a side
# effect and returns None, so the calls are made as separate statements
# rather than as a tuple expression (which only adds a "(None, None)" result).
df_store_gb_top_pur.printSchema()
df_store_gb_top_qty.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- s_purchase_sum: double (nullable = true)
 |-- avg_n_prod_id: double (nullable = true)
 |-- n_product: long (nullable = false)
 |-- rank_s_purchase: integer (nullable = true)
 |-- rank_prod_qty: integer (nullable = true)

root
 |-- client_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- s_purchase_sum: double (nullable = true)
 |-- avg_n_prod_id: double (nullable = true)
 |-- n_product: long (nullable = false)
 |-- rank_s_purchase: integer (nullable = true)
 |-- rank_prod_qty: integer (nullable = true)



(None, None)

In [101]:
# Row counts of the top-store and top-product tables, shown side by side as a
# tuple.
df_store_gb_top_qty.count(), df_top_product_qty.count()

(400162, 400162)

In [100]:
# Inspect the schema of the per-client top store by purchase amount.
df_store_gb_top_pur.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- s_purchase_sum: double (nullable = true)
 |-- avg_n_prod_id: double (nullable = true)
 |-- n_product: long (nullable = false)
 |-- rank_s_purchase: integer (nullable = true)
 |-- rank_prod_qty: integer (nullable = true)



In [77]:
# For each client, count how many distinct drop_transaction values occur
# (n), then count the clients for each value of n.
# The recorded output has clients with n = 2, which is only possible before
# the drop_transaction == 1 filter in the join cell has been applied.
tes = df_cli_pur.groupby('client_id').agg(fn.countDistinct('drop_transaction').alias('n'))
tes.groupby('n').agg(fn.countDistinct('client_id')).show()

+---+----------------+
|  n|count(client_id)|
+---+----------------+
|  1|          167362|
|  2|           32677|
+---+----------------+



In [74]:
# Distinct transactions and distinct clients per
# (drop_transaction, treatment_flg, target) combination.
# The recorded output includes drop_transaction = 0 rows, so it predates the
# drop_transaction == 1 filter in the join cell.
df_cli_pur.groupby('drop_transaction','treatment_flg','target').agg(fn.countDistinct('transaction_id'),fn.countDistinct('client_id')).show()

+----------------+-------------+------+---------------------+----------------+
|drop_transaction|treatment_flg|target|count(transaction_id)|count(client_id)|
+----------------+-------------+------+---------------------+----------------+
|               0|            1|     1|              1321476|           53320|
|               0|            0|     0|               361034|           29165|
|               1|            0|     1|               253067|           19993|
|               1|            1|     1|               260104|           21141|
|               1|            0|     0|               118283|           16658|
|               1|            1|     0|               107540|           15032|
|               0|            1|     0|               329054|           26732|
|               0|            0|     1|              1274388|           50675|
+----------------+-------------+------+---------------------+----------------+



In [75]:
# Sum of the eight per-group distinct client counts from the breakdown above.
# The total is larger than the number of distinct clients because a client
# with both kept and dropped transactions is counted in two groups
# (167362 + 2 * 32677 = 232716, matching the recorded n = 1 / n = 2 tally).
53320 + 29165 + 19993 + 21141 + 16658 + 15032 + 26732 + 50675

232716

In [63]:
# Distinct transactions and distinct clients per drop_transaction value.
# The recorded output includes drop_transaction = 0, so it predates the
# drop_transaction == 1 filter in the join cell.
df_cli_pur.groupby('drop_transaction').agg(fn.countDistinct('transaction_id'),fn.countDistinct('client_id')).show()

+----------------+---------------------+----------------+
|drop_transaction|count(transaction_id)|count(client_id)|
+----------------+---------------------+----------------+
|               1|              1479837|          145869|
|               0|              6565370|          319733|
+----------------+---------------------+----------------+



In [47]:
# Look up a single client to spot-check the filled first_redeem_date.
# NOTE(review): this reads df_cli.client_id, but the join cell renames that
# column to client_id2; run top to bottom, this lookup presumably no longer
# resolves - confirm.
df_cli.filter(df_cli.client_id == '000048b7a6' ).show()

+----------+-------------------+---+------+-----------------+
| client_id|   first_issue_date|age|gender|first_redeem_date|
+----------+-------------------+---+------+-----------------+
|000048b7a6|2018-12-15 13:33:11| 68|     F|       2019-11-20|
+----------+-------------------+---+------+-----------------+



In [26]:
# Preview the first three client rows.
df_cli.limit(3).show()

+----------+-------------------+---+------+-------------------+
| client_id|   first_issue_date|age|gender|  first_redeem_date|
+----------+-------------------+---+------+-------------------+
|000012768d|2017-08-05 15:40:48| 45|     U|2018-01-04 19:30:07|
|000036f903|2017-04-10 13:54:23| 72|     F|2017-04-23 12:37:56|
|000048b7a6|2018-12-15 13:33:11| 68|     F|         2019-11-20|
+----------+-------------------+---+------+-------------------+



In [23]:
# Missing-value count per column: for every column of df_cli, count the rows
# where the value is NaN or null.
# The recorded output lists both first_redeem_date and first_redeem_date2, so
# it was captured while the fill step was only partly applied.
df_cli.select([fn.count(fn.when(fn.isnan(c) | fn.col(c).isNull(), c)).alias(c) for c in df_cli.columns]).show()

+---------+----------------+-----------------+---+------+------------------+
|client_id|first_issue_date|first_redeem_date|age|gender|first_redeem_date2|
+---------+----------------+-----------------+---+------+------------------+
|        0|               0|            35469|  0|     0|                 0|
+---------+----------------+-----------------+---+------+------------------+



In [42]:
# Smallest and largest first_redeem_date; the recorded maximum
# (2019-11-20 12:37:56) is the value used to fill missing dates.
df_cli.select(fn.min('first_redeem_date'),fn.max('first_redeem_date')).show()

+----------------------+----------------------+
|min(first_redeem_date)|max(first_redeem_date)|
+----------------------+----------------------+
|   2017-04-11 09:42:20|   2019-11-20 12:37:56|
+----------------------+----------------------+



In [11]:
# Inspect the client schema; in the recorded output first_redeem_date is
# still a string column in its original position, i.e. before the fill step.
df_cli.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- first_issue_date: string (nullable = true)
 |-- first_redeem_date: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)



In [12]:
# Inspect the column names and inferred types of the purchases table.
df_pur.printSchema()

root
 |-- client_id: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- transaction_datetime: string (nullable = true)
 |-- regular_points_received: double (nullable = true)
 |-- express_points_received: double (nullable = true)
 |-- regular_points_spent: double (nullable = true)
 |-- express_points_spent: double (nullable = true)
 |-- purchase_sum: double (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_quantity: double (nullable = true)
 |-- trn_sum_from_iss: double (nullable = true)
 |-- trn_sum_from_red: double (nullable = true)



In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()