In [49]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import seaborn as sns
import matplotlib as plt
import altair as alt
import plotly.express as px

In [2]:
df_path = r"F:\Datasets\CSV datasets\sales.csv"

In [3]:
spark = SparkSession.builder.appName('Sales').getOrCreate()

In [4]:
df = spark.read.csv(df_path, header=True, inferSchema=True)

In [47]:
df.count()

382423

In [8]:
df.show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [11]:
from pyspark.sql.functions import isnan, when, count, col

In [12]:
df.select([count(when(col(column).isNull(), column)).alias(column) for column in df.columns]).show()

+-------+---+---+------------+-----------+-------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+------------+--------------+--------------+----------------+---------------+-------------+-------------+-----------+--------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|user_id|age|sex|phone_number|joined_date|country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|payment_time|purchased_date|purchased_time|product_category|purchase_mediu

In [14]:
df.groupBy('sex').count().show()

+---+------+
|sex| count|
+---+------+
|  F|127931|
|  M|126810|
|  O|127682|
+---+------+



In [48]:
from pyspark.sql.functions import lower, col, upper, add_months, max, min

In [25]:
df.withColumn('sex', lower(col('sex'))).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [38]:
df.withColumns({
    'sex': upper(col('sex')),
    'joined_date': add_months(col('joined_date'), 2),
    'age': col('age') + 10,
}).show(5)

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas

In [63]:
df.agg(
    F.max('age').alias('max_age'),
    F.min('age').alias('min_age'),
    F.round(F.mean('age').alias('mean_age'), 2),
    F.round(F.variance('age').alias('var_age'), 2),
    F.round(F.stddev('age').alias('std_age'), 2),
    F.sum('age').alias('sum_age'),
).show()

+-------+-------+------------------------------+----------------------------------+--------------------------------+--------+
|max_age|min_age|round(avg(age) AS mean_age, 2)|round(var_samp(age) AS var_age, 2)|round(stddev(age) AS std_age, 2)| sum_age|
+-------+-------+------------------------------+----------------------------------+--------------------------------+--------+
|     60|     18|                         38.97|                            153.52|                           12.39|14904098|
+-------+-------+------------------------------+----------------------------------+--------------------------------+--------+



In [46]:
df.orderBy(col('age').desc()).filter(
    col('age') == 60
).show()

+----------+---+---+------------+-----------+-----------+--------------+----------------------+-----------------------+------------+------------------------+------------------------+--------------------------+-------------------------+--------------+--------------+-----------------------------+--------------+--------------+------------+-------------------+--------------+-------------------+----------------+---------------+-------------+-------------+-----------+---------+-------------+-----------------------+-------------+-------------------+---------------+-------------+---------------+-------------------+
|   user_id|age|sex|phone_number|joined_date|    country|payment_method|loyalty_program_member|loyalty_points_redeemed|loyalty_tier|tier_discount_percentage|card_discount_percentage|coupon_discount_percentage|total_discount_percentage|total_purchase|total_discount|total_purchase_after_discount|transaction_id|payment_status|payment_date|       payment_time|purchased_date|     purchas