# Data Reading

In [0]:
# Load the raw dataset from the `bronze` container.
# Parquet files embed their own schema, so no schema-inference option is
# passed: `inferSchema` is a CSV/JSON reader option and has no effect here.
df = (
    spark.read
    .format("parquet")
    .load('abfss://bronze@anshlambacardatalake.dfs.core.windows.net/rawdata')
)
        

In [0]:
# Report how many rows the loaded DataFrame contains (triggers a Spark job).
print(
    "No of datapoints in the dataframe:",
    df.count(),
)

In [0]:
# Render only the first 10 rows rather than the whole DataFrame.
df.limit(10).display()

# Data Transformation

In [0]:
# importing libraries
from pyspark.sql.functions import split, col
from pyspark.sql.functions import sum as spark_sum, when
from pyspark.sql.types import StringType

In [0]:
# Add `model_category`: the part of Model_ID that precedes the first '-'.
df = df.withColumn(
    'model_category',
    split(col('Model_ID'), '-').getItem(0),
)
df.limit(10).display()

In [0]:
# Preview Units_Sold cast to a string column.
# The result is only displayed: `df` is not reassigned here, so it keeps its
# original Units_Sold column for the cells that follow.
display(
    df.withColumn('Units_Sold', col('Units_Sold').cast(StringType()))
      .limit(10)
)

In [0]:
# Revenue per unit = Revenue / Units_Sold.
# The division is guarded with `when` so rows with zero units yield NULL.
# Without the guard, a zero divisor raises a divide-by-zero error when
# spark.sql.ansi.enabled is on; with ANSI off the result is NULL either way,
# so existing output is unchanged.
df = df.withColumn(
    'RevPerUnit',
    when(col('Units_Sold') != 0, col('Revenue') / col('Units_Sold')),
)
df.limit(10).display()

# Ad-Hoc Analysis

In [0]:
# Quick look at the first 10 rows of the transformed DataFrame.
display(df.limit(10))

In [0]:
# Total units sold per (Year, BranchName), ordered by Year ascending and then
# by Total_Units_Sold descending within each year.
# Spark's aggregate is used under the alias `spark_sum` (imported in the
# imports cell) so that Python's builtin sum() is not shadowed for the rest
# of the notebook.
display(
    df.groupBy('Year', 'BranchName')
      .agg(spark_sum('Units_Sold').alias("Total_Units_Sold"))
      .sort('Year', 'Total_Units_Sold', ascending=[True, False])
)



Databricks visualization. Run in Databricks to view.

# Data Writing


In [0]:
# Persist the transformed DataFrame as Parquet in the `silver` container,
# overwriting whatever already exists at the target path.
(
    df.write
    .format('parquet')
    .mode('overwrite')
    .save('abfss://silver@anshlambacardatalake.dfs.core.windows.net/carsales')
)

In [0]:
# Read the Parquet data back from the silver path and render it.
display(
    spark.read.parquet('abfss://silver@anshlambacardatalake.dfs.core.windows.net/carsales')
)

# Querying Silver Data

In [0]:
%sql
-- Query the silver Parquet files directly by path and return 10 rows.
SELECT
  *
FROM
  parquet.`abfss://silver@anshlambacardatalake.dfs.core.windows.net/carsales`
LIMIT 10