In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max as spark_max, count, desc

In [3]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("WeatherAnalysis")
    .getOrCreate()
)

In [4]:
# NOTE(review): absolute path; looks like a Colab upload location — confirm
# before running this notebook anywhere else.
file_path = "/content/Weather Dataset - CSV(in).csv"

# The first CSV line supplies the column names; column types are inferred
# from the data.
# NOTE(review): the schema printed further down reports Sunshine,
# WindGustSpeed and WindSpeed9am as string even though the previewed values
# look numeric. Presumably some rows hold a non-numeric placeholder — verify,
# and consider passing a matching `nullValue` here.
df = spark.read.csv(
    path=file_path,
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first five rows as a formatted text table.
df.show(5)

+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|MinTemp|MaxTemp|Rainfall|Evaporation|Sunshine|WindGustDir|WindGustSpeed|WindDir9am|WindDir3pm|WindSpeed9am|WindSpeed3pm|Humidity9am|Humidity3pm|Pressure9am|Pressure3pm|Cloud9am|Cloud3pm|Temp9am|Temp3pm|RainToday|RISK_MM|RainTomorrow|
+-------+-------+--------+-----------+--------+-----------+-------------+----------+----------+------------+------------+-----------+-----------+-----------+-----------+--------+--------+-------+-------+---------+-------+------------+
|    8.0|   24.3|     0.0|        3.4|     6.3|         NW|           30|        SW|        NW|           6|          20|         68|         29|     1019.7|     1015.0|       7|       7|   14.4|   23.6|       No|    3.6|         Yes|
|   14.0|   26.9|     3.6|        4.4|     9.7|        ENE| 

In [7]:
# Same five rows as a list of Row objects; unlike the table view, the repr
# shows which values are strings (quoted) and which are numeric.
df.head(5)

[Row(MinTemp=8.0, MaxTemp=24.3, Rainfall=0.0, Evaporation=3.4, Sunshine='6.3', WindGustDir='NW', WindGustSpeed='30', WindDir9am='SW', WindDir3pm='NW', WindSpeed9am='6', WindSpeed3pm=20, Humidity9am=68, Humidity3pm=29, Pressure9am=1019.7, Pressure3pm=1015.0, Cloud9am=7, Cloud3pm=7, Temp9am=14.4, Temp3pm=23.6, RainToday='No', RISK_MM=3.6, RainTomorrow='Yes'),
 Row(MinTemp=14.0, MaxTemp=26.9, Rainfall=3.6, Evaporation=4.4, Sunshine='9.7', WindGustDir='ENE', WindGustSpeed='39', WindDir9am='E', WindDir3pm='W', WindSpeed9am='4', WindSpeed3pm=17, Humidity9am=80, Humidity3pm=36, Pressure9am=1012.4, Pressure3pm=1008.4, Cloud9am=5, Cloud3pm=3, Temp9am=17.5, Temp3pm=25.7, RainToday='Yes', RISK_MM=3.6, RainTomorrow='Yes'),
 Row(MinTemp=13.7, MaxTemp=23.4, Rainfall=3.6, Evaporation=5.8, Sunshine='3.3', WindGustDir='NW', WindGustSpeed='85', WindDir9am='N', WindDir3pm='NNE', WindSpeed9am='6', WindSpeed3pm=6, Humidity9am=82, Humidity3pm=69, Pressure9am=1009.5, Pressure3pm=1007.2, Cloud9am=8, Cloud3pm=

In [10]:
# Print every column's inferred type and nullability. Worth checking before
# aggregating: any numeric-looking column reported as string here will need a
# cast.
df.printSchema()

root
 |-- MinTemp: double (nullable = true)
 |-- MaxTemp: double (nullable = true)
 |-- Rainfall: double (nullable = true)
 |-- Evaporation: double (nullable = true)
 |-- Sunshine: string (nullable = true)
 |-- WindGustDir: string (nullable = true)
 |-- WindGustSpeed: string (nullable = true)
 |-- WindDir9am: string (nullable = true)
 |-- WindDir3pm: string (nullable = true)
 |-- WindSpeed9am: string (nullable = true)
 |-- WindSpeed3pm: integer (nullable = true)
 |-- Humidity9am: integer (nullable = true)
 |-- Humidity3pm: integer (nullable = true)
 |-- Pressure9am: double (nullable = true)
 |-- Pressure3pm: double (nullable = true)
 |-- Cloud9am: integer (nullable = true)
 |-- Cloud3pm: integer (nullable = true)
 |-- Temp9am: double (nullable = true)
 |-- Temp3pm: double (nullable = true)
 |-- RainToday: string (nullable = true)
 |-- RISK_MM: double (nullable = true)
 |-- RainTomorrow: string (nullable = true)



In [11]:
# Display per-column summary statistics for the whole dataset.
df.describe().show()

+-------+-----------------+-----------------+------------------+-----------------+-----------------+-----------+------------------+----------+----------+-----------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+---------+------------------+------------+
|summary|          MinTemp|          MaxTemp|          Rainfall|      Evaporation|         Sunshine|WindGustDir|     WindGustSpeed|WindDir9am|WindDir3pm|     WindSpeed9am|     WindSpeed3pm|       Humidity9am|       Humidity3pm|       Pressure9am|       Pressure3pm|         Cloud9am|          Cloud3pm|           Temp9am|           Temp3pm|RainToday|           RISK_MM|RainTomorrow|
+-------+-----------------+-----------------+------------------+-----------------+-----------------+-----------+------------------+----------+----------+-----------------+-----------------+------------------+------------------+-------

## Task 1: Number of days when it rained the next day

In [12]:
def task1(df):
    """Count the rows of ``df`` whose ``RainTomorrow`` value is ``"Yes"``.

    Args:
        df: Spark DataFrame with a ``RainTomorrow`` column.

    Returns:
        The number of matching rows as an int.
    """
    rains_next_day = col("RainTomorrow") == "Yes"
    matching_rows = df.filter(rains_next_day)
    return matching_rows.count()

print("Task 1 → Days when it rained tomorrow:", task1(df))

Task 1 → Days when it rained tomorrow: 66


## Task 2: Average sunshine duration on days with no rainfall

In [13]:
def task2(df):
    """Return the mean ``Sunshine`` over rows whose ``Rainfall`` equals 0.

    ``Sunshine`` is reported as a string column by ``printSchema``, so it is
    cast to double explicitly instead of relying on implicit string-to-number
    coercion inside ``avg``.

    With non-ANSI cast semantics, values that cannot be parsed as a number
    become NULL, and ``avg`` skips NULLs.
    NOTE(review): with ANSI mode enabled the cast raises on unparseable
    values instead — confirm which mode this notebook runs under.

    Args:
        df: Spark DataFrame with ``Rainfall`` and ``Sunshine`` columns.

    Returns:
        The average as a float, or None when no row contributes a numeric
        ``Sunshine`` value.
    """
    no_rain_rows = df.filter(col("Rainfall") == 0)
    sunshine_numeric = col("Sunshine").cast("double")
    # A global aggregate always yields exactly one row, so indexing is safe.
    return no_rain_rows.agg(avg(sunshine_numeric)).first()[0]

print("Task 2 → Avg sunshine on no-rainfall days:", task2(df))

Task 2 → Avg sunshine on no-rainfall days: 8.472030651341


## Task 3: Maximum temperature recorded at 3 PM

In [14]:
def task3(df):
    """Return the largest ``Temp3pm`` value found in ``df``.

    Args:
        df: Spark DataFrame with a ``Temp3pm`` column.

    Returns:
        The maximum value, taken from the single row of the aggregate result.
    """
    peak_temp = spark_max(col("Temp3pm"))
    summary_row = df.agg(peak_temp).first()
    return summary_row[0]

print("Task 3 → Max Temp at 3pm:", task3(df))

Task 3 → Max Temp at 3pm: 34.5


## Task 4: Average humidity at 3 PM on days it rained the next day

In [16]:
def task4(df):
    """Return the mean ``Humidity3pm`` over rows where ``RainTomorrow`` is ``"Yes"``.

    Args:
        df: Spark DataFrame with ``RainTomorrow`` and ``Humidity3pm`` columns.

    Returns:
        The average, taken from the single row of the aggregate result.
    """
    rains_next_day = col("RainTomorrow") == "Yes"
    wet_tomorrow_rows = df.filter(rains_next_day)
    summary_row = wet_tomorrow_rows.agg(avg(col("Humidity3pm"))).first()
    return summary_row[0]

print("Task 4 → Avg Humidity at 3pm on rainy-tomorrow days:", task4(df))

Task 4 → Avg Humidity at 3pm on rainy-tomorrow days: 57.68181818181818


## Task 5: Most common wind direction at 9 AM on cloudy days (Cloud9am > 5)

In [17]:
def task5(df):
    """Return the most frequent ``WindDir9am`` among rows with ``Cloud9am > 5``.

    Differences from a plain "group, count, take first":
      * rows whose ``WindDir9am`` is NULL are excluded, so a missing value can
        never be reported as the winning direction;
      * ties on the count are broken by ``WindDir9am`` in ascending order, so
        repeated runs return the same answer;
      * when no row qualifies, None is returned instead of raising
        ``TypeError`` from indexing the None that ``first()`` gives back.

    NOTE(review): if the CSV encodes missing directions as a literal string
    rather than NULL, those rows are still counted — verify against the data.

    Args:
        df: Spark DataFrame with ``Cloud9am`` and ``WindDir9am`` columns.

    Returns:
        The winning ``WindDir9am`` value, or None if no row passes the filter.
    """
    cloudy_with_direction = (col("Cloud9am") > 5) & col("WindDir9am").isNotNull()
    top_row = (
        df.filter(cloudy_with_direction)
        .groupBy("WindDir9am")
        .count()
        # Secondary sort key makes the choice deterministic on equal counts.
        .orderBy(desc("count"), "WindDir9am")
        .first()
    )
    if top_row is None:
        return None
    return top_row[0]

print("Task 5 → Most common WindDir9am on cloudy days:", task5(df))

Task 5 → Most common WindDir9am on cloudy days: SSE
