In [9]:
import pandas as pd
import numpy as np
import findspark
import pyspark
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark import SparkConf
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType

In [2]:
# Obtain the Spark session explicitly: nothing earlier in the notebook defines
# `spark`, so without this the cell only works when the kernel happens to
# inject one. getOrCreate() returns the already-running session if there is
# one, so it is safe in both situations.
spark = SparkSession.builder.getOrCreate()

# Read the raw CSV: comma-separated, first row used as the header, and column
# types inferred from the data by Spark.
raw_data = spark.read.load(
    "../data/raw_data/raw_data.csv",
    format="csv",
    sep=",",
    inferSchema="true",
    header="true",
)

In [3]:
# Preview: limit the Spark DataFrame to its first two rows, then collect just
# those rows to the driver as a pandas DataFrame for rich display.
(
    raw_data
    .limit(2)
    .toPandas()
)

Unnamed: 0,Child Product,CPG,Month,Shipment in Child Cases,PPG,Parent Prod,Shipment in parent Prod Cases,Inventory_Parent_Cases
0,S1,C5,201801,26,P1,S1,26,0
1,S2,C5,201801,33,P1,S1,25,0


In [5]:
# Column selection: project the frame down to three columns and print the
# first two rows of the result.
col_sel = raw_data.select(
    'Child Product',
    'CPG',
    'Month',
)
col_sel.show(n=2)

+-------------+---+------+
|Child Product|CPG| Month|
+-------------+---+------+
|           S1| C5|201801|
|           S2| C5|201801|
+-------------+---+------+
only showing top 2 rows



In [7]:
# Sorting in Spark (ascending).
# 'Shipment in Child Cases' is not loaded as a numeric column (the dtypes cell
# further down reports it as `object`), so sorting on it directly compares the
# values as text, e.g. "1000" would come before "999". Cast the sort key to
# double so the ordering is numeric; the displayed column itself is unchanged.
# NOTE(review): values that cannot be parsed as numbers become null under the
# default (non-ANSI) cast and sort first in ascending order - confirm that is
# acceptable for this data.
raw_data.sort(F.col("Shipment in Child Cases").cast("double").asc()).show(2)

+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
|Child Product|CPG| Month|Shipment in Child Cases|PPG|Parent Prod| Shipment in parent Prod Cases |Inventory_Parent_Cases|
+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
|          S32| C3|201807|                     -1|P17|        S32|                             -1|                     1|
|          S21| C3|201906|                     -1|P13|        S21|                             -1|                     1|
+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
only showing top 2 rows



In [8]:
# Sorting in Spark (descending).
# Cast the sort key to double so rows are ordered numerically rather than as
# text: the column is reported as `object` by the dtypes cell further down,
# i.e. it is not loaded as a numeric type. Only the sort key is cast; the
# displayed column is unchanged.
raw_data.sort(F.col('Shipment in Child Cases').cast('double').desc()).show(2)

+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
|Child Product|CPG| Month|Shipment in Child Cases|PPG|Parent Prod| Shipment in parent Prod Cases |Inventory_Parent_Cases|
+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
|          S40| C2|201903|                    999|P11|        S66|                            999|                     0|
|          S33| C2|201805|                    998|P15|        S62|                            998|                    10|
+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
only showing top 2 rows



In [10]:
# Cast columns: replace 'CPG' with a string-typed version of itself.
# Note that this rebinds `raw_data`, so later cells see the cast frame.
raw_data = raw_data.withColumn(
    'CPG',
    raw_data['CPG'].cast(StringType()),
)

In [15]:
# Collect to pandas once and reuse the result. The original expression called
# toPandas() twice, which runs the Spark job and pulls every row to the driver
# two times just to read the dtypes and the shape.
raw_pdf = raw_data.toPandas()
raw_pdf.dtypes, raw_pdf.shape

(Child Product                      object
 CPG                                object
 Month                               int32
 Shipment in Child Cases            object
 PPG                                object
 Parent Prod                        object
  Shipment in parent Prod Cases     object
 Inventory_Parent_Cases              int32
 dtype: object,
 (2079, 8))

In [18]:
# Filter data in PySpark: keep only the rows where CPG is 'C1' and PPG is 'P1'
# (both conditions must hold), then print the first two matches.
_filter = raw_data.where(
    (F.col('CPG') == 'C1') & (F.col('PPG') == 'P1')
)
_filter.show(n=2)

+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
|Child Product|CPG| Month|Shipment in Child Cases|PPG|Parent Prod| Shipment in parent Prod Cases |Inventory_Parent_Cases|
+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
|           S1| C1|201801|                      5| P1|         S1|                              5|                     0|
|           S2| C1|201801|                     51| P1|         S1|                             38|                     1|
+-------------+---+------+-----------------------+---+-----------+-------------------------------+----------------------+
only showing top 2 rows



In [19]:
# Group-by in PySpark: one row per (CPG, PPG, Month) with the summed parent
# shipments exposed as 'Shipments'.
# The aggregated column name carries a leading and a trailing space, exactly as
# it appears in the loaded header, so the literal must keep them.
_group = (
    raw_data
    .groupBy('CPG', 'PPG', 'Month')
    .agg(F.sum(' Shipment in parent Prod Cases ').alias('Shipments'))
)
_group.show(n=2)

+---+---+------+---------+
|CPG|PPG| Month|Shipments|
+---+---+------+---------+
| C2|P18|201803|     10.0|
| C3| P6|201803|     68.0|
+---+---+------+---------+
only showing top 2 rows

