In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

In [2]:
# Obtain the Spark session for this notebook, registered under the
# application name "WeatherAnalysis".
session_builder = SparkSession.builder.appName("WeatherAnalysis")
spark = session_builder.getOrCreate()

# Schema for the CSV data: every column is declared as a nullable string.
# The column order below is the order the fields appear in the schema.
csv_column_names = [
    "city",
    "timestamp",
    "temperature",
    "humidity",
    "weather_description",
    "wind_speed",
    "cloudiness",
]
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in csv_column_names]
)

In [3]:
# Load the weather forecast CSV into a Spark DataFrame: the first row is
# treated as the header and the column types come from the explicit schema.
csv_file_path = 'file:///C:/WeatherMonitoringSystem/weather_forecast.csv'
csv_reader = spark.read.schema(schema).option("header", True)
weather_df = csv_reader.csv(csv_file_path)


In [4]:
# Convert temperature, humidity, wind_speed, and cloudiness columns to numeric types.
# All four casts are applied in one select() rather than one withColumn() call per
# column: every withColumn() adds another projection to the query plan, and the
# PySpark documentation recommends select() when several columns change at once.
numeric_columns = ["temperature", "humidity", "wind_speed", "cloudiness"]

# Each numeric column is looked up on weather_df up front, so a name that is
# missing from the DataFrame still raises here instead of being skipped silently.
double_casts = {
    name: weather_df[name].cast("double").alias(name) for name in numeric_columns
}

# Walk weather_df.columns so the column order is unchanged; columns without a
# cast are passed through as they are.
weather_df = weather_df.select(
    *[double_casts.get(name, weather_df[name]) for name in weather_df.columns]
)

In [5]:
# Per-city averages of temperature, humidity, wind speed and cloudiness.
# Maps each source column to the name of its averaged output column.
average_aliases = {
    "temperature": "avg_temperature",
    "humidity": "avg_humidity",
    "wind_speed": "avg_wind_speed",
    "cloudiness": "avg_cloudiness",
}
average_exprs = [
    avg(source_column).alias(output_column)
    for source_column, output_column in average_aliases.items()
]
average_stats = weather_df.groupBy("city").agg(*average_exprs)

In [6]:

# Print the per-city averages as a text table. The arguments spell out the
# defaults of show(): at most 20 rows, long values truncated, horizontal layout.
average_stats.show(n=20, truncate=True, vertical=False)



+-------------+---------------+------------+--------------+--------------+
|         city|avg_temperature|avg_humidity|avg_wind_speed|avg_cloudiness|
+-------------+---------------+------------+--------------+--------------+
|     Winnipeg|         269.63|        76.0|           7.2|         100.0|
|      Kelowna|         272.45|        88.0|          1.03|          75.0|
|       Ottawa|         272.99|        95.0|          4.12|         100.0|
|     Victoria|         279.97|        87.0|         15.43|          75.0|
|       London|          276.5|        91.0|          2.57|         100.0|
|     Montreal|         273.82|        97.0|          5.66|         100.0|
|   St. John's|         278.55|        94.0|          1.54|         100.0|
|  Fredericton|         274.46|       100.0|           3.6|         100.0|
|      Toronto|         275.62|        92.0|           3.6|         100.0|
|    Vancouver|         277.17|        94.0|          5.14|          20.0|
|     Edmonton|         2

In [7]:
# Stop the Spark session that was created at the top of the notebook, now that
# the aggregated results have been displayed.
spark.stop()