In [1]:
import os
import sys
import json

# Import SparkSession
from pyspark.sql import SparkSession

# Build the SparkSession
spark = SparkSession.builder \
    .master("local[10]") \
    .appName("CAV Data Preprocessing") \
    .config("spark.executor.memory", "6G") \
    .config("spark.storage.memoryFraction", 0.2) \
    .config("spark.driver.memory", "16G") \
    .getOrCreate()
   
sc = spark.sparkContext

In [2]:
sc.getConf().getAll()

[('spark.driver.port', '42259'),
 ('spark.storage.memoryFraction', '0.2'),
 ('spark.driver.memory', '16G'),
 ('spark.executor.memory', '6G'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '172.31.5.36'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1529742544301'),
 ('spark.master', 'local[10]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.name', 'CAV Data Preprocessing')]

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("Date", StringType(), True),
    StructField("Visitor_ID", StringType(), True),
    StructField("Visit Number", IntegerType(), True),
    StructField("Products", IntegerType(), True),
    StructField("Product Views", IntegerType(), True)])

In [4]:
files = ['Report20171001-20180531.csv', 'Report20170101-20170930.csv']

df = spark.read.load(files[0], header=True, format="csv", schema=schema).cache();
for i in range(1, len(files)):
    df = df.union(spark.read.load(files[0], header=True, format="csv", schema=schema))

In [5]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Visitor_ID: string (nullable = true)
 |-- Visit Number: integer (nullable = true)
 |-- Products: integer (nullable = true)
 |-- Product Views: integer (nullable = true)



In [6]:
df.count()

746073894

In [7]:
df.show(10, False)

+---------------+---------------------------------------+------------+--------+-------------+
|Date           |Visitor_ID                             |Visit Number|Products|Product Views|
+---------------+---------------------------------------+------------+--------+-------------+
|October 1, 2017|1000046560612521877_444244119977433712 |9           |null    |0            |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |171236  |1            |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |38005   |1            |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |39952   |2            |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |40028   |1            |
|October 1, 2017|1000047271602565570_7816685501896371224|5           |null    |0            |
|October 1, 2017|1000102520911731070_1646240024588457894|40          |null    |0            |
|October 1, 2017|1000102520911731070_1646240024588457894|40 

In [8]:
from pyspark.sql.functions import concat, col, lit

df = df.withColumn('session_id', concat(col("Visitor_ID"), lit("_"), col("Visit Number")))

In [9]:
df.show(10, False)

+---------------+---------------------------------------+------------+--------+-------------+------------------------------------------+
|Date           |Visitor_ID                             |Visit Number|Products|Product Views|session_id                                |
+---------------+---------------------------------------+------------+--------+-------------+------------------------------------------+
|October 1, 2017|1000046560612521877_444244119977433712 |9           |null    |0            |1000046560612521877_444244119977433712_9  |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |171236  |1            |1000046560612521877_444244119977433712_9  |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |38005   |1            |1000046560612521877_444244119977433712_9  |
|October 1, 2017|1000046560612521877_444244119977433712 |9           |39952   |2            |1000046560612521877_444244119977433712_9  |
|October 1, 2017|1000046560612521877_4442

In [10]:
from pyspark.sql.functions import isnan
df.filter((df["Visitor_ID"] == "") | df["Visitor_ID"].isNull()).count(), df.filter((df["Visit Number"] == "") | df["Visit Number"].isNull() | isnan(df["Visit Number"])).count()

(54, 54)

In [11]:
%%time
from pyspark.sql.functions import isnan
df = df.filter((df["Visitor_ID"] != "") & df["Visitor_ID"].isNotNull() & df["Visit Number"].isNotNull() & ~isnan(df["Visit Number"]))

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 15.6 ms


In [12]:
df.count()

746073840

In [13]:
df = df.filter(df['Products'].isNotNull() & (df['Product Views'] > 0))

In [14]:
df.count()

430612176

In [15]:
df = df.drop('Visitor_ID', 'Visit Number', 'Product Views')

In [16]:
df.show(10, False)

+---------------+--------+------------------------------------------+
|Date           |Products|session_id                                |
+---------------+--------+------------------------------------------+
|October 1, 2017|171236  |1000046560612521877_444244119977433712_9  |
|October 1, 2017|38005   |1000046560612521877_444244119977433712_9  |
|October 1, 2017|39952   |1000046560612521877_444244119977433712_9  |
|October 1, 2017|40028   |1000046560612521877_444244119977433712_9  |
|October 1, 2017|27790   |1000102520911731070_1646240024588457894_40|
|October 1, 2017|64059   |1000102520911731070_1646240024588457894_40|
|October 1, 2017|88999   |1000184036754529481_6340868211800103312_34|
|October 1, 2017|81764   |1000206142114360214_4025469838733486262_17|
|October 1, 2017|28760   |1000211218072042403_3360638402315771367_6 |
|October 1, 2017|63379   |1000211218072042403_3360638402315771367_6 |
+---------------+--------+------------------------------------------+
only showing top 10 

In [17]:
df = df.withColumnRenamed('Products', 'product_id')

In [18]:
sys.path.append("/nykaa/api")
from pas.v2.utils import Utils

nykaadb = Utils.nykaaMysqlConnection()
cursor = nykaadb.cursor()

def extract_data(query):
    cursor.execute(query)
    rows = []
    BATCH_SIZE = 10000

    while True:
        batch_empty = True
        for row in cursor.fetchmany(BATCH_SIZE):
            batch_empty = False
            rows.append(row)
        if batch_empty:
            break
            
    return rows

query = "select child_id, parent_id from catalog_product_relation"

rows = extract_data(query)
child_2_parent = {row[0]: row[1] for row in rows}

In [19]:
with open("child_product_2_parent.json", "r+") as f:
    child_2_parent.update({int(key): int(value) for key, value in json.load(f).items()})

In [20]:
len(child_2_parent)

259561

In [21]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

def convert_to_parent(product_id):
    return child_2_parent.get(product_id, product_id)

convert_to_parent_udf = udf(convert_to_parent, IntegerType())
df = df.withColumn("product_id", convert_to_parent_udf(df['product_id']))

In [22]:
df.coalesce(1).write.option("header", "true").csv('processed_cav_data.csv')