In [1]:
import os
import sys
import json

# Import SparkSession
from pyspark.sql import SparkSession

# Build the SparkSession
spark = SparkSession.builder \
    .master("local[10]") \
    .appName("CAV Data Preprocessing") \
    .config("spark.executor.memory", "6G") \
    .config("spark.storage.memoryFraction", 0.2) \
    .config("spark.driver.memory", "16G") \
    .getOrCreate()
   
sc = spark.sparkContext

In [2]:
sc.getConf().getAll()

[('spark.app.id', 'local-1539343095352'),
 ('spark.storage.memoryFraction', '0.2'),
 ('spark.driver.memory', '16G'),
 ('spark.executor.memory', '6G'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '172.31.5.36'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[10]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.port', '32848'),
 ('spark.app.name', 'CAV Data Preprocessing')]

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("Date", StringType(), True),
    StructField("Visitor_ID", StringType(), True),
    StructField("Visit Number", IntegerType(), True),
    StructField("Products", IntegerType(), True),
    StructField("Product Views", IntegerType(), True)])

In [4]:
files = ['/home/ubuntu/data/cav/Report_Desktop_20180401-20180831.csv', '/home/ubuntu/data/cav/Report_sept.csv']

df = spark.read.load(files[0], header=True, format="csv", schema=schema).cache();
for i in range(1, len(files)):
    df = df.union(spark.read.load(files[0], header=True, format="csv", schema=schema))

In [5]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Visitor_ID: string (nullable = true)
 |-- Visit Number: integer (nullable = true)
 |-- Products: integer (nullable = true)
 |-- Product Views: integer (nullable = true)



In [6]:
df.count()

213097620

In [7]:
df.show(10, False)

+-------------+---------------------------------------+------------+--------+-------------+
|Date         |Visitor_ID                             |Visit Number|Products|Product Views|
+-------------+---------------------------------------+------------+--------+-------------+
|April 1, 2018|1000021042451551118_7817197741417963751|2           |null    |0            |
|April 1, 2018|1000021042451551118_7817197741417963751|2           |248528  |1            |
|April 1, 2018|1000021042451551118_7817197741417963751|2           |27115   |1            |
|April 1, 2018|1000098726092126499_5215981598137947057|108         |null    |0            |
|April 1, 2018|1000098726092126499_5215981598137947057|109         |null    |0            |
|April 1, 2018|1000098726092126499_5215981598137947057|110         |null    |0            |
|April 1, 2018|1000151524024214239_4749141478071163189|10          |null    |0            |
|April 1, 2018|1000151524024214239_4749141478071163189|11          |null    |0  

In [8]:
from pyspark.sql.functions import concat, col, lit

df = df.withColumn('session_id', concat(col("Visitor_ID"), lit("_"), col("Visit Number")))

In [9]:
df.show(10, False)

+-------------+---------------------------------------+------------+--------+-------------+-------------------------------------------+
|Date         |Visitor_ID                             |Visit Number|Products|Product Views|session_id                                 |
+-------------+---------------------------------------+------------+--------+-------------+-------------------------------------------+
|April 1, 2018|1000021042451551118_7817197741417963751|2           |null    |0            |1000021042451551118_7817197741417963751_2  |
|April 1, 2018|1000021042451551118_7817197741417963751|2           |248528  |1            |1000021042451551118_7817197741417963751_2  |
|April 1, 2018|1000021042451551118_7817197741417963751|2           |27115   |1            |1000021042451551118_7817197741417963751_2  |
|April 1, 2018|1000098726092126499_5215981598137947057|108         |null    |0            |1000098726092126499_5215981598137947057_108|
|April 1, 2018|1000098726092126499_5215981598137

In [10]:
from pyspark.sql.functions import isnan
df.filter((df["Visitor_ID"] == "") | df["Visitor_ID"].isNull()).count(), df.filter((df["Visit Number"] == "") | df["Visit Number"].isNull() | isnan(df["Visit Number"])).count()

(53134, 53134)

In [11]:
%%time
from pyspark.sql.functions import isnan
df = df.filter((df["Visitor_ID"] != "") & df["Visitor_ID"].isNotNull() & df["Visit Number"].isNotNull() & ~isnan(df["Visit Number"]))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16 ms


In [12]:
df.count()

213044486

In [13]:
df = df.filter(df['Products'].isNotNull() & (df['Product Views'] > 0))

In [14]:
df.count()

105572610

In [15]:
df = df.drop('Visitor_ID', 'Visit Number', 'Product Views')

In [16]:
df.show(10, False)

+-------------+--------+------------------------------------------+
|Date         |Products|session_id                                |
+-------------+--------+------------------------------------------+
|April 1, 2018|248528  |1000021042451551118_7817197741417963751_2 |
|April 1, 2018|27115   |1000021042451551118_7817197741417963751_2 |
|April 1, 2018|216554  |1000213868797828724_8491859803852391686_6 |
|April 1, 2018|25253   |1000220416542877232_449178730912277620_1  |
|April 1, 2018|110277  |1000247315170957961_2306277511903815026_31|
|April 1, 2018|110290  |1000247315170957961_2306277511903815026_31|
|April 1, 2018|214397  |1000247315170957961_2306277511903815026_31|
|April 1, 2018|214399  |1000247315170957961_2306277511903815026_31|
|April 1, 2018|218996  |1000247315170957961_2306277511903815026_31|
|April 1, 2018|219004  |1000247315170957961_2306277511903815026_31|
+-------------+--------+------------------------------------------+
only showing top 10 rows



In [17]:
df = df.withColumnRenamed('Products', 'product_id')

In [18]:
import sys
sys.path.append('/var/www/nykaa')
from pas.v2.utils import Utils

child_2_parent = DiscUtils.scrollESForResults()['child_2_parent']
len(child_2_parent)

182498

In [19]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

def convert_to_parent(product_id):
    return child_2_parent.get(product_id, product_id)

convert_to_parent_udf = udf(convert_to_parent, IntegerType())
df = df.withColumn("product_id", convert_to_parent_udf(df['product_id']))

In [20]:
%%time
df.coalesce(1).write.option("header", "true").csv('/home/ubuntu/data/cav/processed_cav_desktop_data_2018_04_2018_09.csv')

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 2min 27s
