In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

# The `lifetimes` package is used to calculate customer lifetime value (CLTV)
import lifetimes
import sys

In [2]:
# Obtain the SparkSession for this notebook; getOrCreate() reuses an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Spark Programming")
    .getOrCreate()
)

In [3]:
# SQLContext's first parameter is a SparkContext, not a SparkSession. Passing
# the session only worked because both objects happen to expose the same
# private attributes. Hand it the real SparkContext and reuse the existing
# session explicitly.
# NOTE: despite the name, `sc` is a SQLContext (not a SparkContext); the name
# is kept so later cells that reference it keep working.
sc = SQLContext(spark.sparkContext, spark)

In [6]:
# Explicit schemas for the three CSV inputs.
#
# ID-like fields are declared nullable=False to document that they are
# expected to always be present.
# NOTE(review): Spark's file-based readers are understood to treat a
# user-supplied schema as nullable, so this flag alone does not guarantee the
# loaded columns are free of nulls - confirm with an explicit null check after
# loading.
#
# `id` and `company` use LongType rather than IntegerType: company ids in this
# data already exceed 1e9, and any value above the 32-bit limit
# (2,147,483,647) cannot be parsed as an IntegerType.
# NOTE(review): customer ids are presumed to exceed that limit as well -
# confirm against the data. LongType is a safe superset either way.
offers_schema = StructType(fields=[
    StructField('offer', IntegerType(), False),      # offer - An id representing a certain offer
    StructField('category', IntegerType(), True),    # category - The product category (e.g. sparkling water)
    StructField('quantity', IntegerType(), True),    # quantity - The number of units one must purchase to get the discount
    StructField('company', LongType(), False),       # company - An id of the company that sells the item
    StructField('offervalue', DoubleType(), True),   # offervalue - The dollar value of the offer
    StructField('brand', IntegerType(), False),      # brand - An id of the brand to which the item belongs
])

transactions_schema = StructType(fields=[
    StructField('id', LongType(), False),                 # id - A unique id representing a customer
    StructField('chain', IntegerType(), True),            # chain - An integer representing a store chain
    StructField('dept', IntegerType(), True),             # dept - An aggregate grouping of the Category (e.g. water)
    StructField('category', IntegerType(), True),         # category - The product category (e.g. sparkling water)
    StructField('company', LongType(), False),            # company - An id of the company that sells the item
    StructField('brand', IntegerType(), False),           # brand - An id of the brand to which the item belongs
    StructField('date', DateType(), True),                # date - The date of purchase
    StructField('productsize', DoubleType(), True),       # productsize - The amount of the product purchase (e.g. 16 oz of water)
    StructField('productmeasure', StringType(), True),    # productmeasure - The units of the product purchase (e.g. ounces)
    StructField('purchasequantity', IntegerType(), True), # purchasequantity - The number of units purchased
    StructField('purchaseamount', DoubleType(), True),    # purchaseamount - The dollar amount of the purchase
])

trainHistory_schema = StructType(fields=[
    StructField('id', LongType(), False),            # id - A unique id representing a customer
    StructField('chain', IntegerType(), True),       # chain - An integer representing a store chain
    StructField('offer', IntegerType(), False),      # offer - An id representing a certain offer
    StructField('market', IntegerType(), False),     # market - An id representing a geographical region
    StructField('repeattrips', IntegerType(), True), # repeattrips - The number of times the customer made a repeat purchase
    StructField('repeater', StringType(), True),     # repeater - A boolean flag (kept as a string), equal to repeattrips > 0
    StructField('offerdate', DateType(), True),      # offerdate - The date a customer received the offer
])
 

In [7]:
# Load the three CSV inputs into Spark DataFrames (not RDDs) using the
# explicit schemas. The directory is named once so it can be changed in a
# single place.
DATA_DIR = "Data/X5 Retail Data/acquire-valued-shoppers-challenge"


def _read_csv(file_name, schema):
    """Read ``DATA_DIR/<file_name>`` as a CSV with a header row.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``DATA_DIR``.
    schema : pyspark.sql.types.StructType
        Column names and types to apply instead of inferring them.

    Returns
    -------
    pyspark.sql.DataFrame
        DataFrame over the file's rows; reading is deferred until an action runs.
    """
    # The header option is set once here; the original set it twice per read.
    return spark.read.csv(DATA_DIR + "/" + file_name, header=True, schema=schema)


transactions = _read_csv("transactions.csv", transactions_schema)
offers = _read_csv("offers.csv", offers_schema)
trainHistory = _read_csv("trainHistory.csv", trainHistory_schema)


In [8]:
# Add the calendar month of each purchase, taken from the `date` column, as
# `Transaction_Month`, then print the first rows (show() defaults to 20).
purchase_month = F.month(F.col("date"))
transactions = transactions.withColumn("Transaction_Month", purchase_month)
transactions.show()

+-----+-----+----+--------+----------+-----+----------+-----------+--------------+----------------+--------------+-----------------+
|   id|chain|dept|category|   company|brand|      date|productsize|productmeasure|purchasequantity|purchaseamount|Transaction_Month|
+-----+-----+----+--------+----------+-----+----------+-----------+--------------+----------------+--------------+-----------------+
|86246|  205|   7|     707|1078778070|12564|2012-03-02|       12.0|            OZ|               1|          7.59|                3|
|86246|  205|  63|    6319| 107654575|17876|2012-03-02|       64.0|            OZ|               1|          1.59|                3|
|86246|  205|  97|    9753|1022027929|    0|2012-03-02|        1.0|            CT|               1|          5.99|                3|
|86246|  205|  25|    2509| 107996777|31373|2012-03-02|       16.0|            OZ|               1|          1.99|                3|
|86246|  205|  55|    5555| 107684070|32094|2012-03-02|       16.0|  

In [14]:
# Register each DataFrame as a temporary view so it can be queried by name
# through spark.sql().
for view_name, frame in (
    ('Transactions', transactions),
    ('Offers', offers),
    ('History', trainHistory),
):
    frame.createOrReplaceTempView(view_name)

In [None]:
# Count the transaction rows belonging to customer 86246.
# count() is an action that returns a plain Python int, so there is no
# DataFrame left to call .show() on (the original chain raised
# AttributeError). Store the number and end the cell with it to display it.
sql_q1="select id,date from Transactions where id=86246"
result_1=spark.sql(sql_q1).count()
result_1

In [12]:
# Number of transaction rows per (customer, chain, date).
# An unbounded collect() would pull every group onto the driver, which is not
# practical for the full transactions table. Keep the aggregate as a
# distributed DataFrame and only bring a bounded preview to the driver.
# `re1` is still a list of Row objects, as before.
RE1_PREVIEW_ROWS = 1000  # raise or remove the limit only if the driver can hold the result
rows_per_id_chain_date = transactions.groupBy('id', 'chain', 'date').count()
re1 = rows_per_id_chain_date.limit(RE1_PREVIEW_ROWS).collect()

KeyboardInterrupt: 

In [None]:
# Inspect what collect() handed back to the driver (PySpark documents the
# return value of DataFrame.collect() as a list of Row objects).
type(re1)

In [15]:
# For every purchase date, count how many distinct store chains appear in the
# transactions, ordered by date ascending. Only the query is defined here;
# the result is displayed by a later cell.
sql_query = "select date, count(distinct chain) from transactions group by(date) order by(date) "
res1 = spark.sql(sql_query)

In [None]:
# Print only the first five rows of the per-date chain counts.
res1.show(n=5)

In [7]:
# Distinct customer ids present in the Transactions view.
distinct_ids_sql = "Select distinct(id) from Transactions"
result = spark.sql(distinct_ids_sql)

In [None]:
# Print the distinct customer ids; with no arguments show() prints at most
# the first 20 rows.
result.show()