In [0]:
from pyspark.sql.functions import col, count, countDistinct, sum, mean, max, min, when
from pyspark.sql import functions as F

In [0]:
# Location of the diamonds CSV under the /databricks-datasets mount
path = '/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv'

# Read the CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(path)
)

# Option 2 (when inferSchema is not used): cast the columns explicitly
# df = df.select( col('carat').cast('float'),
#                 'cut', 'color', 'clarity',
#                 col('depth').cast('float'),
#                 col('table').cast('float'),
#                 col('price').cast('float'),
#                 col('x').cast('float'),
#                 col('y').cast('float'),
#                 col('z').cast('float')    )




In [0]:
# Render the DataFrame read from the diamonds CSV
display(df)

In [0]:
# Keep only the 'cut' column and the x, y, z columns
display(df.select(['cut', 'x', 'y', 'z']))

In [0]:
# Option 1 - withColumn: adds 'x+y' to the existing columns
display(df.withColumn('x+y', df['x'] + df['y']))

# Option 2 - select: returns only the listed columns plus the computed sum
display(
    df.select(
        'x',
        'y',
        (F.col('x') + F.col('y')).alias('x+y'),
    )
)


In [0]:
# Create the 'x+y' column first, then rename it to 'status'
display(
    df
    .withColumn('x+y', df['x'] + df['y'])
    .withColumnRenamed('x+y', 'status')
)


In [0]:
# Option 1 - drop: name the columns to remove; all other columns are kept
display(df.drop(*['x', 'y', 'z']))

# Option 2 - select: name the columns to keep
# NOTE(review): the data also has a `_c0` column; this list leaves it out while
# drop() above retains it, so the two outputs differ by that column - confirm intended.
display(
    df.select(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price'])
)

In [0]:
# Rename while selecting: alias() gives each picked column its new name.
# Only the four listed columns are returned.
display(
    df.select(
        col('carat').alias('k'),
        col('x').alias('cut_x'),
        col('y').alias('cut_y'),  # y (not z) feeds cut_y, so x/y/z each map to their own alias
        col('z').alias('cut_z'),
    )
)

In [0]:
# Rename several columns at once via an {old_name: new_name} mapping;
# columns not listed keep their names.
display(
    df.withColumnsRenamed({
        'carat': 'k',
        'x': 'cut_x',
        'y': 'cut_y',
        'z': 'cut_z',
    })
)

In [0]:
# Total number of rows in the DataFrame
df.count()

In [0]:
# Estimate the row count from a ~1% random sample, scaled back up by 100.
# The fixed seed makes the sample (and so the estimate) repeatable across re-runs
# instead of changing every time the cell is executed.
df.sample(fraction=0.01, seed=42).count() * 100

In [0]:
# Number of distinct values in the 'cut' column
df.select('cut').dropDuplicates().count()

# To list the values instead of counting them:
# df.select('cut').distinct().show()

In [0]:
# sort: rows ordered by price, lowest first
display(df.sort('price', ascending=True))

In [0]:
# orderBy: rows ordered by price, highest first
display(df.orderBy(F.desc('price')))

In [0]:
# Single-column DataFrame: 'price' cast to int
df_price_int = df.select(
    F.col('price').cast('int')
)

In [0]:
# Rows whose cut is exactly "Premium"
display(df.filter(df['cut'] == "Premium"))

In [0]:
# Combine conditions with & (logical AND); each comparison sits in its own parentheses
display(
    df.filter(
        (df['cut'] == "Premium")
        & (df['clarity'] == "VS2")
        & (df['price'] < 10000)
    )
)

In [0]:
# Cut values to keep
cuts = ['Premium', 'Very Good']

# isin(): keeps rows whose 'cut' equals any of the listed values
display(df.filter(F.col('cut').isin(*cuts)))

In [0]:
# Substring match: keeps every row whose 'cut' value contains 'Good'
display(df.filter(df['cut'].contains('Good')))

_c0,carat,cut,color,clarity,depth,table,price,x,y,z
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
10,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39
11,0.3,Good,J,SI1,64.0,55.0,339,4.25,4.28,2.73
18,0.3,Good,J,SI1,63.4,54.0,351,4.23,4.29,2.7
19,0.3,Good,J,SI1,63.8,56.0,351,4.23,4.26,2.71
20,0.3,Very Good,J,SI1,62.7,59.0,351,4.21,4.27,2.66


In [0]:
# SQL LIKE pattern: % matches any run of characters, so this keeps
# rows whose 'clarity' value contains 'VV'
display(df.filter(F.col('clarity').like('%VV%')))

_c0,carat,cut,color,clarity,depth,table,price,x,y,z
6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
26,0.23,Very Good,G,VVS2,60.4,58.0,354,3.97,4.01,2.41
66,0.28,Ideal,G,VVS2,61.4,56.0,553,4.19,4.22,2.58
67,0.32,Ideal,I,VVS1,62.0,55.3,553,4.39,4.42,2.73
70,0.24,Premium,E,VVS1,60.7,58.0,553,4.01,4.03,2.44
71,0.24,Very Good,D,VVS1,61.5,60.0,553,3.97,4.0,2.45
76,0.26,Very Good,F,VVS2,59.2,60.0,554,4.19,4.22,2.49
77,0.26,Very Good,E,VVS2,59.9,58.0,554,4.15,4.23,2.51
78,0.26,Very Good,D,VVS2,62.4,54.0,554,4.08,4.13,2.56


In [0]:
# Bounds of the price range to keep (between() includes both ends)
start = 500
end = 550

display(
    df.where(
        F.col('price').between(lowerBound=start, upperBound=end)
    )
)

In [0]:
# isNull: cast the boolean flag to 0/1 and add it up to count NULL prices
display(df.select(F.sum(F.col('price').isNull().cast('int'))))

# isnan (from pyspark.sql.functions): same counting trick for NaN prices
display(df.select(F.sum(F.isnan('price').cast('int'))))

In [0]:
# Missing values per column: one output column per input column, holding the
# number of rows where that column is missing.
# NULL is checked for every column. NaN is a floating-point value, so isnan()
# is only applied to float/double columns; other types (strings, ints, ...)
# are no longer pushed through an implicit cast to double.
display(
    df.select([
        count(
            when(
                (col(c).isNull() | F.isnan(c)) if t in ('float', 'double') else col(c).isNull(),
                c,  # any non-NULL value works here: count() only counts non-NULL results
            )
        ).alias(c)
        for c, t in df.dtypes
    ])
)

In [0]:
# Swap the value 'Very Good' for 'VG' (no subset given, so not limited to one column)
display(df.replace({'Very Good': 'VG'}))

# Swap 55 for 555, restricted to the 'table' column
display(df.replace(55, 555, subset=['table']))

In [0]:
# lit() wraps a Python constant as a column: 'new_var' is 'diamond' on every row
display(df.withColumn('new_var', F.lit('diamond')))

In [0]:
# Add an integer column in which every value is NULL
df_na = df.withColumn('new_var', F.lit(None).cast('int'))

# Replace NULLs with 10 (fillna is the DataFrame-level alias of na.fill)
display(df_na.fillna(10))

In [0]:
# Derive a label from 'cut': the first matching when() wins and
# otherwise() supplies the value for everything that matched nothing.
display(
    df.withColumn(
        'price_label',
        F.when(F.col('cut').isin('Premium', 'Very Good'), 'Expensive')
         .when(F.col('cut').isin('Good', 'Ideal'), 'Regular')
         .otherwise('Cheap'),
    )
)

In [0]:
# Per-cut row count and average price, highest average first
display(
    df
    .groupBy('cut')
    .agg(
        F.count('cut').alias('cut_ct'),      # rows per cut
        F.mean('price').alias('avg_price'),  # average price per cut
    )
    .orderBy(F.desc('avg_price'))
)

In [0]:
# Price statistics per cut (a grouped counterpart to df.describe)
display(
    df.groupBy('cut').agg(
        F.min('price').alias('min'),                  # lowest price
        F.max('price').alias('max'),                  # highest price
        F.percentile('price', 0.5).alias('median'),   # 50th percentile
        F.mean('price').alias('avg_price'),           # average price
        F.count('cut').alias('observations'),         # number of rows
    )
)

In [0]:
# Long-format alternative (one row per cut/color pair):
# display(
#     df
#     .groupBy('cut', 'color')
#     .mean('price')
# )

# Pivot table: one row per cut, one column per color, cells hold the average price
display(df.groupBy('cut').pivot('color').avg('price'))

cut,D,E,F,G,H,I,J
Premium,3631.292576419214,3538.9144201968334,4324.890175890176,4500.742134062927,5216.706779661017,5946.180672268908,6294.591584158416
Ideal,2629.0945659844742,2597.5500896746094,3374.9393622582334,3720.706388206388,3889.334831460674,4451.970377448638,4918.186383928572
Good,3405.3821752265862,3423.6441586280816,3495.7502750275025,4123.482204362801,4276.254985754986,5078.532567049809,4574.172638436482
Fair,4291.061349693252,3682.3125,3827.003205128205,4239.254777070064,5135.683168316832,4685.4457142857145,4975.655462184874
Very Good,3470.4672835426304,3214.6520833333334,3778.8202402957486,3872.75380600261,4535.390350877193,5255.879568106312,5103.513274336283
