In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Plot theme used by seaborn/matplotlib figures
sns.set(style="whitegrid")

# Load Hadoop MapReduce outputs.
# Each file is read as tab-separated text without a header row; the first
# column is named "year" and the second gets the per-file name listed below.
year_stats, max_temp, min_temp = (
    pd.read_csv(path, sep="\t", header=None, names=["year", value_col])
    for path, value_col in (
        ("year_stats.txt", "stats"),
        ("max_temp_per_year.txt", "max_info"),
        ("min_temp_per_year.txt", "min_info"),
    )
)

In [3]:
# --- Parse year_stats ---
def parse_stats(row):
    """Parse one comma-separated ``key=value`` string into a ``pd.Series``.

    Parameters
    ----------
    row : object
        The raw value to parse; it is converted with ``str()`` first.

    Returns
    -------
    pd.Series
        Built from a dict with one entry per ``key=value`` field, in order of
        first appearance. A field's value is ``float(value)`` when that
        conversion succeeds, and ``None`` when the value is ``"NA"``, empty,
        or not parseable as a float. Fields without an ``=`` are ignored.
    """
    fields = {}
    for chunk in str(row).split(","):
        # Only the first "=" separates key from value; later ones stay in the value.
        name, equals, raw = chunk.strip().partition("=")
        if not equals:
            continue  # no "=" in this field -> nothing to record

        raw = raw.strip()
        number = None
        if raw not in ("NA", ""):
            try:
                number = float(raw)
            except ValueError:
                number = None  # non-numeric text is treated like a missing value
        fields[name.strip()] = number
    return pd.Series(fields)

# --- Expand each packed "stats" string into one column per key ---
parsed = year_stats["stats"].apply(parse_stats)

# Integer year column placed side by side with the expanded statistics
stats_df = pd.concat([year_stats[["year"]].astype(int), parsed], axis=1)

# --- Parse max/min info ---
# The pattern captures the first number written with a decimal point
# (optionally negative) from each info string; the capture is cast to float.
max_temp["max_val"] = (
    max_temp["max_info"].str.extract(r"(\-?\d+\.\d+)", expand=False).astype(float)
)
min_temp["min_val"] = (
    min_temp["min_info"].str.extract(r"(\-?\d+\.\d+)", expand=False).astype(float)
)
max_temp["year"] = max_temp["year"].astype(int)
min_temp["year"] = min_temp["year"].astype(int)

# --- Combine everything into one frame keyed by year ---
# merge() is called with its default join type, so only years present in
# every input end up in df.
df = (
    stats_df
    .merge(max_temp[["year", "max_val"]], on="year")
    .merge(min_temp[["year", "min_val"]], on="year")
)

In [4]:
# Sanity check: print the first rows of the parsed per-year statistics as plain text
print(stats_df.head())

   year  avg_mean_temp  max_temp  min_temp  total_precip
0  1963           7.39      35.6     -24.4         566.4
1  1964           7.18      35.0     -22.8         723.4
2  1965           6.47      33.3     -23.9         839.2
3  1966           6.95      35.0     -26.1         713.2
4  1967           6.62      31.1     -26.7         782.1


In [16]:
# Interactive line chart: the three listed temperature columns are drawn
# against year on one shared axis.
fig = px.line(
    data_frame=df,
    x="year",
    y=["avg_mean_temp", "max_val", "min_val"],
    title="Interactive Temperature Trends per Year",
    # Axis captions: "value" labels the shared y-axis, "year" the x-axis
    labels={"value": "Temperature (°C)", "year": "Year"},
)
fig.show()


In [23]:
import plotly.subplots as sp
import plotly.graph_objects as go

# 2x2 grid of subplots, one titled panel per yearly series
fig = sp.make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        "Avg Temperature Trend",
        "Max Temp Trend",
        "Min Temp Trend",
        "Precipitation",
    ),
)

# Add the four traces in one call; rows[i]/cols[i] is the grid cell of trace i
fig.add_traces(
    [
        go.Scatter(x=df["year"], y=df["avg_mean_temp"], name="Avg Temp"),
        go.Scatter(x=df["year"], y=df["max_val"], name="Max Temp"),
        go.Scatter(x=df["year"], y=df["min_val"], name="Min Temp"),
        go.Scatter(x=df["year"], y=df["total_precip"], name="Precip"),
    ],
    rows=[1, 1, 2, 2],
    cols=[1, 2, 1, 2],
)

# Fixed canvas size and overall title, then render
fig.update_layout(
    title="Interactive Climate Dashboard",
    width=1000,
    height=700,
)
fig.show()
