In [1]:
pip install mrjob

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from collections import defaultdict

# Load dataset
df = pd.read_csv('weather_data.csv')

# Print columns to debug
print(df.columns)

# Ensure no leading/trailing spaces in column names
df.columns = df.columns.str.strip()

# Extract year and temperature
df['Year'] = pd.to_datetime(df['Date_Time']).dt.year
df['Temperature'] = df['Temperature_C'].astype(float)

# Mapper: Emit (year, temperature)
mapped_data = df[['Year', 'Temperature']].values

# Reducer: Aggregate by year and compute average temperature
yearly_temps = defaultdict(list)
for year, temp in mapped_data:
    yearly_temps[year].append(temp)

avg_temps = {year: sum(temps)/len(temps) for year, temps in yearly_temps.items()}

# Find hottest and coolest years
hottest_year = max(avg_temps, key=avg_temps.get)
coolest_year = min(avg_temps, key=avg_temps.get)

print(f"Hottest Year: {hottest_year} with Avg Temp: {avg_temps[hottest_year]}°C")
print(f"Coolest Year: {coolest_year} with Avg Temp: {avg_temps[coolest_year]}°C")


Index(['Location', 'Date_Time', 'Temperature_C', 'Humidity_pct',
       'Precipitation_mm', 'Wind_Speed_kmh'],
      dtype='object')
Hottest Year: 2024.0 with Avg Temp: 14.779704927042093°C
Coolest Year: 2024.0 with Avg Temp: 14.779704927042093°C


In [3]:
print(df.columns)

Index(['Location', 'Date_Time', 'Temperature_C', 'Humidity_pct',
       'Precipitation_mm', 'Wind_Speed_kmh', 'Year', 'Temperature'],
      dtype='object')
