## Running Your First PySpark Notebook


In [0]:
# Let's start by creating a Spark session.
# In Databricks, the SparkSession is already available as 'spark'.

from pyspark.sql import SparkSession

# Outside Databricks no session is pre-created, so build one explicitly.
# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("FirstPySparkJob")
    .getOrCreate()
)

print("Spark session is active and ready to use!")

In [0]:

# Storage folder that holds the exercise files. Defined once so the
# directory listing and the CSV path below cannot drift apart.
exercise_dir = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/Exercise2/"

# List the folder first to confirm the sample file is present.
display(dbutils.fs.ls(exercise_dir))

# Small sample of the video streaming data, used to demonstrate basic
# PySpark operations.
file_path = exercise_dir + "streaming_sample_10k.csv"

# Read the CSV into a DataFrame: the first row supplies the column names
# (header) and Spark infers the column types from the data (inferSchema).
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview at most 500 rows instead of rendering the whole DataFrame.
display(df.limit(500))


In [0]:
# Peek at the first five rows in two different ways.
print("Sample data:")
first_rows = df.limit(5)
display(first_rows)  # rendered through the notebook's display() helper
df.show(n=5)  # plain-text table printed by Spark itself

In [0]:
# Basic operation: how many records does the dataset hold?
# count() is a Spark action, so this line triggers a job over the DataFrame.
total_records: int = df.count()
print(f"Total number of streaming events: {total_records}")

In [0]:
from pyspark.sql import functions as F

# Summarise streaming events per device type.
# F.count('event_id') counts only the rows whose event_id is non-null.
# Sort on the aggregated count, largest first, so the busiest device types
# lead the table; sorting on the device_type label only produced a
# reverse-alphabetical listing.
df_grouped = (
    df.groupBy('device_type')
    .agg(F.count('event_id').alias('total_event'))
    .orderBy(F.desc('total_event'))
)

# Cap the rendered output at 500 rows.
display(df_grouped.limit(500))


# # Get a summary of the different device types
# device_counts = df.groupBy("device_type").count().orderBy("count", ascending=False)
# print("Streaming events by device type:")
# display(device_counts)


In [0]:

# Expose the DataFrame to Spark SQL under a temporary view name.
df.createOrReplaceTempView('streaming_events')

# Count events per country, highest counts first.
country_counts_sql = """
  SELECT country, COUNT(*) as event_count
  FROM streaming_events
  GROUP BY country
  ORDER BY event_count DESC                         
"""
query_result = spark.sql(country_counts_sql)

print('Streaming events by country: ')
display(query_result)


# # Create a temporary view of our DataFrame for SQL
# df.createOrReplaceTempView("streaming_events")

# # Execute SQL query using spark.sql()
# query_result = spark.sql("""
# SELECT 
#   country, 
#   COUNT(*) as event_count
# FROM streaming_events
# GROUP BY country
# ORDER BY event_count DESC
# """)

# print("Streaming events by country:")
# display(query_result)

In [0]:
# Destination folder for the exported copy of the dataset.
output_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/Exercise2/Export/processed_data"

# Write as Parquet; "overwrite" replaces whatever already exists at output_path.
df.write.parquet(output_path, mode="overwrite")

# # Save the processed data to a new location
# output_path = "/pyspark/video-streaming-data/module1-intro/first_job/processed_data"

# # Save as Parquet (a columnar storage format that's efficient for analytics)
# df.write.mode("overwrite").parquet(output_path)

# print(f"Data successfully saved to {output_path}")