## Read and Process data

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CustomerDataProcessing").getOrCreate()
spark

In [0]:
df = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/beingabhisheksahu@gmail.com/Customers.csv")

In [0]:
df.show(5)

+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+
|CustomerKey|Prefix|FirstName|LastName| BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|
+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+
|      11000|   MR.|      JON|    YANG|04-08-1966|            M|     M|jon24@adventure-w...|    $90,000 |            2|     Bachelors|Professional|        Y|
|      11001|   MR.|   EUGENE|   HUANG| 5/14/1965|            S|     M|eugene10@adventur...|    $60,000 |            3|     Bachelors|Professional|        N|
|      11002|   MR.|    RUBEN|  TORRES|08-12-1965|            M|     M|ruben35@adventure...|    $60,000 |            3|     Bachelors|Professional|        Y|
|      11003|   MS.|  CHRISTY|     ZHU| 2/15/1968|  

In [0]:
df.printSchema()

root
 |-- CustomerKey: string (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- AnnualIncome: string (nullable = true)
 |-- TotalChildren: string (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: string (nullable = true)



In [0]:
from pyspark.sql.functions import *

df =  df.withColumn("BirthDate", to_date(col("BirthDate"), "yyyy-MM-dd"))\
       .withColumn('HomeOwner',col("HomeOwner").cast('boolean'))

In [0]:
df.printSchema()

root
 |-- CustomerKey: string (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- AnnualIncome: string (nullable = true)
 |-- TotalChildren: string (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: boolean (nullable = true)



In [0]:
df = df.fillna({'Occupation':'Unknown'})

In [0]:

df = df.withColumn("BirthYear", year(col("BirthDate")))

In [0]:
df.printSchema()
df.show(3)

root
 |-- CustomerKey: string (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- AnnualIncome: string (nullable = true)
 |-- TotalChildren: string (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = false)
 |-- HomeOwner: boolean (nullable = true)
 |-- BirthYear: integer (nullable = true)

+-----------+------+---------+--------+---------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+---------+
|CustomerKey|Prefix|FirstName|LastName|BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|BirthYear|
+-----------+------+---------+--------+---------+-------------+------+

In [0]:
unique_Occupation = df.select(countDistinct("Occupation")).collect()
unique_Occupation

Out[13]: [Row(count(DISTINCT Occupation)=5)]

In [0]:
df.groupBy("Occupation").count().orderBy(col("count")).show()

+--------------+-----+
|    Occupation|count|
+--------------+-----+
|        Manual| 2353|
|      Clerical| 2859|
|    Management| 3011|
|Skilled Manual| 4501|
|  Professional| 5424|
+--------------+-----+



In [0]:
df.groupBy("Occupation").pivot("HomeOwner").count().show()

+--------------+-----+----+
|    Occupation|false|true|
+--------------+-----+----+
|    Management|  802|2209|
|  Professional| 1794|3630|
|      Clerical|  935|1924|
|        Manual| 1109|1244|
|Skilled Manual| 1248|3253|
+--------------+-----+----+



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, dense_rank

# Define window specification
window_spec = Window.partitionBy("Occupation").orderBy(col("AnnualIncome").desc())

# Apply ranking functions
df = df.withColumn("rank", rank().over(window_spec)) \
       .withColumn("dense_rank", dense_rank().over(window_spec))


In [0]:
df.select("Occupation","AnnualIncome","rank","dense_rank").show(5)

+----------+------------+----+----------+
|Occupation|AnnualIncome|rank|dense_rank|
+----------+------------+----+----------+
|  Clerical|    $40,000 |   1|         1|
|  Clerical|    $40,000 |   1|         1|
|  Clerical|    $40,000 |   1|         1|
|  Clerical|    $40,000 |   1|         1|
|  Clerical|    $40,000 |   1|         1|
+----------+------------+----+----------+
only showing top 5 rows



In [0]:
df.select(max("AnnualIncome")).show()

+-----------------+
|max(AnnualIncome)|
+-----------------+
|         $90,000 |
+-----------------+



In [0]:
df_annual_income = df.filter(col("AnnualIncome") > lit("$ 50,000"))
df_annual_income.count()

Out[24]: 18148

In [0]:
df.groupBy("EducationLevel").agg(max("AnnualIncome").alias("Highest"),min("AnnualIncome").alias("Lowest")).show()

+-------------------+--------+--------+
|     EducationLevel| Highest|  Lowest|
+-------------------+--------+--------+
|          Bachelors|$90,000 |$10,000 |
|    Graduate Degree|$90,000 |$10,000 |
|        High School|$90,000 |$10,000 |
|    Partial College|$90,000 |$10,000 |
|Partial High School|$90,000 |$10,000 |
+-------------------+--------+--------+



In [0]:
output_path = '/FileStore/tables/processed_customers'
df.write.mode('overwrite').parquet(output_path)