In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('data_processing')
    .getOrCreate()
)

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [4]:
# Load the customers CSV: the first row is treated as the header and
# column types are inferred from the data.
df = spark.read.csv(
    "/FileStore/tables/customers1.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first five rows of the loaded data.
df.show(n=5)

In [6]:
# Small lookup table pairing each customer main type with a region code.
# Both columns are declared as strings.
# NOTE(review): 'Average Family ' carries a trailing space. If the values in
# the customers data do not, an equality join on Customer_main_type will not
# match that row -- confirm against the CSV.
region_data = spark.createDataFrame(
    [
        ('Family with grownups', 'PN'),
        ('Driven Growers', 'GJ'),
        ('Conservative families', 'DD'),
        ('Cruising Seniors', 'DL'),
        ('Average Family ', 'MN'),
        ('Living well', 'KA'),
        ('Successful hedonists', 'JH'),
        ('Retired and Religious', 'AX'),
        ('Career Loners', 'HY'),
        ('Farmers', 'JH'),
    ],
    schema=StructType().add("Customer_main_type", "string").add("Region Code", "string"),
)


In [7]:
# Show the first three lookup rows without truncating long strings.
region_data.show(n=3, truncate=False)

In [8]:
# Join the customers with the region lookup on the shared main-type column.
# No `how` is given, so Spark's default join type applies.
new_df = df.join(
    region_data,
    on='Customer_main_type',
)

In [9]:
# Number of joined rows per region code.
(
    new_df
    .groupBy("Region Code")
    .count()
    .show()
)


In [10]:
# Total Avg_Salary per customer main type, with one column per region code.
# Combinations with no data are shown as 0 instead of null.
(
    new_df
    .groupBy('Customer_main_type')
    .pivot('Region Code')
    .sum('Avg_Salary')
    .fillna(0)
    .show()
)

In [11]:
# Total Avg_Salary per customer main type, with one column per age bracket.
# Combinations with no data are shown as 0 instead of null.
# NOTE(review): the column is spelled 'AVG_age' here and 'Avg_age' elsewhere;
# this presumably relies on case-insensitive column resolution -- confirm.
(
    new_df
    .groupBy('Customer_main_type')
    .pivot('AVG_age')
    .sum('Avg_Salary')
    .fillna(0)
    .show()
)

In [12]:
def age_category(age):
  """Map an age-bracket string to a coarse age label.

  Returns "Young" for "20-30 years", "Mid Aged" for "30-40 years",
  "Old" for "40-50 years" or "50-60 years", and "Very Old" for any
  other value.
  """
  if age == "20-30 years":
    return "Young"
  if age == "30-40 years":
    return "Mid Aged"
  if age in ("40-50 years", "50-60 years"):
    return "Old"
  # Anything not matched above falls through to the catch-all label.
  return "Very Old"
  
# Wrap age_category as a Spark UDF that returns a string column.
# Use F.udf (from `import pyspark.sql.functions as F`) instead of a bare `udf`:
# nothing in the notebook imports `udf`, so the bare name only resolves when
# the hosting environment happens to inject it.
age_udf = F.udf(age_category, StringType())

# Add the derived 'age_category' column, computed from the 'Avg_age' column.
new_df = new_df.withColumn('age_category', age_udf(new_df['Avg_age']))

In [13]:
# Show the age bracket next to its derived category.
(
    new_df
    .select('Avg_age', 'age_category')
    .show()
)

In [14]:
# Total Avg_Salary per customer main type, with one column per 'label' value.
# Combinations with no data are shown as 0 instead of null.
(
    new_df
    .groupBy('Customer_main_type')
    .pivot('label')
    .sum('Avg_Salary')
    .fillna(0)
    .show()
)

In [15]:
# collect_list example: for each customer subtype, gather every
# Number_of_houses value into a list (duplicates are kept).
# See the second column of the result.
(
    new_df
    .groupby("Customer_subtype")
    .agg(F.collect_list("Number_of_houses"))
    .show()
)

In [16]:
# collect_set example: for each customer subtype, gather the distinct
# Number_of_houses values (duplicates are removed).
# See the second column of the result.
(
    new_df
    .groupby("Customer_subtype")
    .agg(F.collect_set("Number_of_houses"))
    .show()
)

In [17]:
# Mean Avg_Salary per region code, highest first.
# Up to 50 rows are shown, untruncated.
(
    new_df
    .groupBy('Region Code')
    .agg(F.avg('Avg_Salary').alias('mean_salary'))
    .orderBy('mean_salary', ascending=False)
    .show(n=50, truncate=False)
)

In [18]:
# Mean Avg_Salary per customer subtype, highest first.
# Up to 50 rows are shown, untruncated.
(
    new_df
    .groupBy('Customer_subtype')
    .agg(F.avg('Avg_Salary').alias('mean_salary'))
    .orderBy('mean_salary', ascending=False)
    .show(n=50, truncate=False)
)

In [19]:
# Rows ordered by Avg_Salary, largest first (default number of rows shown).
(
    new_df
    .sort("Avg_Salary", ascending=False)
    .show()
)

In [20]:
# Highest Avg_Salary within each customer main type.
(
    new_df
    .groupBy('Customer_main_type')
    .agg(F.max('Avg_Salary'))
    .show()
)

In [21]:
# For every column of the original frame except Avg_Salary, print a header
# and show the value counts from the joined frame, most frequent first.
# The column names come from `df`, so columns added to `new_df` afterwards
# are not included.
for col in df.columns:
  if col == 'Avg_Salary':
    continue
  print(f" Aggregation for {col}")
  (
      new_df
      .groupBy(col)
      .count()
      .orderBy('count', ascending=False)
      .show(truncate=False)
  )

In [22]:
# Number of joined rows per customer subtype.
(
    new_df
    .groupBy('Customer_subtype')
    .count()
    .show()
)