In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import seaborn as sns
import matplotlib as plt
import altair as alt
import plotly.express as px
from pyspark.sql.types import StringType

In [2]:
df_path = r"F:\Datasets\CSV datasets\sales.csv"

In [3]:
spark = SparkSession.builder.appName('Sales').getOrCreate()

In [4]:
df = spark.read.csv(df_path, header=True, inferSchema=True)

In [5]:
df.count()

382423

In [6]:
df.show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [7]:
from pyspark.sql.functions import isnan, when, count, col, coalesce

In [8]:
df.select([count(when(col(column).isNull(), column)).alias(column) for column in df.columns]).show()

+-------+---+---+------------+-----------+-------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+------------+--------------+--------------+----------------+---------------+-------------+-------------+-----------+--------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|user_id|age|sex|phone_number|joined_date|country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|payment_time|purchased_date|purchased_time|product_category|purchase_mediu

In [9]:
df.groupBy('sex').count().show()

+---+------+
|sex| count|
+---+------+
|  F|127931|
|  M|126810|
|  O|127682|
+---+------+



In [10]:
from pyspark.sql.functions import lower, col, upper, add_months, max, min

In [11]:
df.withColumn('sex', lower(col('sex'))).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [12]:
df.withColumns({
    'sex': upper(col('sex')),
    'joined_date': add_months(col('joined_date'), 2),
    'age': col('age') + 10,
}).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [13]:
df.agg(
    F.max('age').alias('max_age'),
    F.min('age').alias('min_age'),
    F.round(F.mean('age').alias('mean_age'), 2),
    F.round(F.variance('age').alias('var_age'), 2),
    F.round(F.stddev('age').alias('std_age'), 2),
    F.sum('age').alias('sum_age'),
).show()

+-------+-------+------------------------------+----------------------------------+--------------------------------+--------+
|max_age|min_age|round(avg(age) AS mean_age, 2)|round(var_samp(age) AS var_age, 2)|round(stddev(age) AS std_age, 2)| sum_age|
+-------+-------+------------------------------+----------------------------------+--------------------------------+--------+
|     60|     18|                         38.97|                            153.52|                           12.39|14904098|
+-------+-------+------------------------------+----------------------------------+--------------------------------+--------+



In [14]:
df.agg(
    F.max('age').alias('max_age'),
    F.min('age').alias('min_age')
).show()

+-------+-------+
|max_age|min_age|
+-------+-------+
|     60|     18|
+-------+-------+



In [15]:
from pyspark.sql.types import StringType, IntegerType

In [16]:
df_with_age_range = df.withColumn(
    'age_range',
    F.when(F.col('age') < 20, '0-19')
    .when(F.col('age') < 30, '20-29')
    .when(F.col('age') < 40, '30-39')
    .when(F.col('age') < 50, '40-49')
    .when(F.col('age') < 60, '50-59')
    .otherwise('60+')
)
df_with_age_range.show()

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+---------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|  

In [17]:
df.orderBy(col('age').desc()).filter(
    col('age') == 60
).show()

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [18]:
df = df.withColumns({
    'age': F.col('age').cast(IntegerType())
})

In [19]:
df.select(
    F.coalesce(F.col('loyalty_tier'), F.lit(0)).alias('loyalty_tier'),
).show(5)

+------------+
|loyalty_tier|
+------------+
|         2.0|
|         0.0|
|         0.0|
|         0.0|
|         0.0|
+------------+
only showing top 5 rows



In [20]:
df.select(F.col('loyalty_tier'), F.isnull('loyalty_tier')).show(5)

+------------+----------------------+
|loyalty_tier|(loyalty_tier IS NULL)|
+------------+----------------------+
|         2.0|                 false|
|        NULL|                  true|
|        NULL|                  true|
|        NULL|                  true|
|        NULL|                  true|
+------------+----------------------+
only showing top 5 rows



In [21]:
df.select(
    F.col('user_id'),
    F.named_struct(
        F.lit('sex'), F.col('sex'),
        F.lit('age'), F.col('age'),
    ).alias('age_struct')
).show(5)

+----------+----------+
|   user_id|age_struct|
+----------+----------+
|****980545|   {M, 22}|
|****906406|   {M, 23}|
|****860958|   {F, 19}|
|****845150|   {O, 58}|
|****664897|   {O, 37}|
+----------+----------+
only showing top 5 rows



In [22]:
df.select(
    F.nanvl('age', 'country').alias('country_'),
    ).show(5)

+--------+
|country_|
+--------+
|    22.0|
|    23.0|
|    19.0|
|    58.0|
|    37.0|
+--------+
only showing top 5 rows



In [23]:
df.withColumns({
    'age_over_20': F.when(F.col('age') > 20, 'Over 20')
}).filter(
    F.col('age_over_20').isNotNull()
).show()

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+-----------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|

In [24]:
df.withColumn(
    'age_over_20', F.when(F.col('age') > 20, 'Over 20').otherwise('Under 20')
).show()

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+-----------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|

In [25]:
df.select(
    'country', F.expr('length(country)').alias('country_len'),
    'age', F.expr('age * 10').alias('age_times_10'),
    'joined_date', F.expr('date_format(joined_date, "yyyy-MM-dd")').alias('joined_date_str'),
    F.expr("concat(joined_date, '-2024')").alias("country_2024"),
).show(5)

+-----------+-----------+---+------------+-----------+---------------+---------------+
|    country|country_len|age|age_times_10|joined_date|joined_date_str|   country_2024|
+-----------+-----------+---+------------+-----------+---------------+---------------+
|Philippines|         11| 22|         220| 2023-02-14|     2023-02-14|2023-02-14-2024|
|Philippines|         11| 23|         230| 2024-04-19|     2024-04-19|2024-04-19-2024|
|Philippines|         11| 19|         190| 2023-03-03|     2023-03-03|2023-03-03-2024|
|Philippines|         11| 58|         580| 2024-04-18|     2024-04-18|2024-04-18-2024|
|Philippines|         11| 37|         370| 2021-09-29|     2021-09-29|2021-09-29-2024|
+-----------+-----------+---+------------+-----------+---------------+---------------+
only showing top 5 rows



In [26]:
import datetime as dt

In [27]:
df.select(
    F.sqrt(F.col('age')),
    F.abs(F.col('age')),
    F.acos(F.col('age')),
    F.ceil(F.col('age')),
    F.pow(F.col('age'), 2),
    F.floor(F.col('age')),
    F.negate(F.col('age')),
    F.negative(F.col('age')),
    F.signum(F.col('age')),
    F.round(F.col('total_purchase_after_discount'), 1),
    F.to_utc_timestamp(F.col('payment_time'), 'Asia/Hong_Kong').alias('payment_time_utc_'),
    F.col('purchased_date'),
    F.date_add(start=F.col('purchased_date'), days=1).alias('purchased_date_plus_1d'),
    F.date_sub(start=F.col('purchased_date'), days=2).alias('purchased_date_minus_1d'),
    F.dayofweek(F.col('purchased_date')).alias('purchased_date_dayofweek'),
    F.dayofyear(F.col('purchased_date')).alias('purchased_date_dayofyear'),
    F.second(F.col('purchased_date').alias('purchased_date_second')),
).show(10)

+------------------+--------+---------+---------+-------------+----------+-------------+-------------+-----------+---------------------------------------+-------------------+--------------+----------------------+-----------------------+------------------------+------------------------+-----------------------------------------------+
|         SQRT(age)|abs(age)|ACOS(age)|CEIL(age)|POWER(age, 2)|FLOOR(age)|negative(age)|negative(age)|SIGNUM(age)|round(total_purchase_after_discount, 1)|  payment_time_utc_|purchased_date|purchased_date_plus_1d|purchased_date_minus_1d|purchased_date_dayofweek|purchased_date_dayofyear|second(purchased_date AS purchased_date_second)|
+------------------+--------+---------+---------+-------------+----------+-------------+-------------+-----------+---------------------------------------+-------------------+--------------+----------------------+-----------------------+------------------------+------------------------+--------------------------------------------

# Window

In [28]:
from pyspark.sql import Window

In [29]:
window = Window.partitionBy('shipping_method').orderBy(F.col('age').desc())

In [30]:
df.withColumn(
    'rank', F.rank().over(window)
).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+----+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     pu

In [66]:
window = Window.partitionBy('sex').orderBy(F.col('age')).rangeBetween(Window.currentRow, 5)

sum_age = df.withColumn('sum', F.sum('age').over(window))

sum_age.orderBy('age', ascending=False).drop_duplicates(['age']).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     

In [71]:
window = Window.partitionBy('sex').orderBy('age').rangeBetween(20, 25)

df.withColumn('sum', F.sum('age').over(window)).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     