<a href="https://colab.research.google.com/github/YifeiCathyYang/Redesign-Project/blob/main/Redesign_Project_Yifei_Yang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the latest version of the Berkeley Earth dataset through kagglehub.
# `path` is the local directory the download call reports back.
path = kagglehub.dataset_download(
    "berkeleyearth/climate-change-earth-surface-temperature-data"
)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/berkeleyearth/climate-change-earth-surface-temperature-data?dataset_version_number=2...


100%|██████████| 84.7M/84.7M [00:01<00:00, 62.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/berkeleyearth/climate-change-earth-surface-temperature-data/versions/2


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

!pip install chart_studio
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import colorlover as cl
from plotly.subplots import make_subplots

Collecting chart_studio
  Downloading chart_studio-1.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting retrying>=1.3.3 (from chart_studio)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: retrying, chart_studio
Successfully installed chart_studio-1.1.0 retrying-1.3.4


In [4]:
import os

# Reuse the directory returned by kagglehub.dataset_download() in the download
# cell instead of hard-coding the Colab cache location, so the notebook keeps
# working when the cache lives elsewhere or the dataset version changes.
print("file list：")
print(os.listdir(path))

file list：
['GlobalTemperatures.csv', 'GlobalLandTemperaturesByCountry.csv', 'GlobalLandTemperaturesByCity.csv', 'GlobalLandTemperaturesByMajorCity.csv', 'GlobalLandTemperaturesByState.csv']


In [5]:
# Read the global monthly temperature table from the downloaded dataset.
file_path = os.path.join(path, "GlobalTemperatures.csv")
data = pd.read_csv(file_path)

# Untouched snapshot of the raw table, kept separately from `data`.
copy = data.copy()

# Count of missing values per column (displayed as the cell output).
data.isnull().sum()

Unnamed: 0,0
dt,0
LandAverageTemperature,12
LandAverageTemperatureUncertainty,12
LandMaxTemperature,1200
LandMaxTemperatureUncertainty,1200
LandMinTemperature,1200
LandMinTemperatureUncertainty,1200
LandAndOceanAverageTemperature,1200
LandAndOceanAverageTemperatureUncertainty,1200


In [6]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


In [7]:
# Last year that still counts as 'before' in the before/after comparison.
TURNPOINT_YEAR = 1975

# Measurement columns that are averaged per calendar year.
temperature_cols = [
    'LandAverageTemperature', 'LandAverageTemperatureUncertainty',
    'LandMaxTemperature', 'LandMaxTemperatureUncertainty',
    'LandMinTemperature', 'LandMinTemperatureUncertainty',
    'LandAndOceanAverageTemperature',
    'LandAndOceanAverageTemperatureUncertainty',
]

# Keep only rows where every column is present, then parse the `dt` strings
# into a proper datetime column. assign() returns a new frame, so nothing is
# mutated in place.
data = data.dropna(axis=0).assign(Date=lambda frame: pd.to_datetime(frame['dt']))

# Same rows without the raw `dt` string, plus calendar parts of `Date`.
data2 = data.drop(columns=['dt']).assign(
    day=lambda frame: frame['Date'].dt.day,
    week=lambda frame: frame['Date'].dt.isocalendar().week,
    month=lambda frame: frame['Date'].dt.month,
    year=lambda frame: frame['Date'].dt.year,
)

# One row per year holding the mean of each measurement column.
earth_data = data2.groupby(by='year')[temperature_cols].mean().reset_index()

# Label each year as falling on or before the turning point, or after it.
earth_data['turnpoint'] = np.where(
    earth_data['year'] <= TURNPOINT_YEAR, 'before', 'after'
)

In [8]:
# 2x2 grid of horizontal box plots: one panel per temperature measure, each
# comparing the yearly values labelled 'before' with those labelled 'after'
# in earth_data['turnpoint'].
panel_titles = [
    "Land Avg Temp (Before vs After 1975)",
    "Land Min Temp",
    "Land Max Temp",
    "Land & Ocean Avg Temp",
]
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=panel_titles,
    horizontal_spacing=0.2,
    vertical_spacing=0.2,
)

# Figure-wide styling: title, fonts, backgrounds, hover behaviour and a
# horizontal legend centred below the plots.
fig.update_layout(
    title="Distribution of Global Temperatures Before and After 1975",
    title_font=dict(family="Arial", size=22, color="#333"),
    font=dict(family="Arial", size=12),
    template="plotly_white",
    plot_bgcolor="#f7f7f7",
    paper_bgcolor="#ffffff",
    hovermode="y unified",
    margin=dict(t=100, b=100),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.2,
        xanchor="center",
        x=0.5,
        font=dict(size=11),
        title=None,
    ),
    hoverlabel=dict(bgcolor="white", font_size=11, font_family="Arial"),
)

# Options shared by every box trace.
box_style = dict(
    boxpoints='outliers',
    jitter=0.4,
    pointpos=-1.8,
    orientation='h',
    boxmean=True,
    marker_size=4,
    line_width=1.2,
)

# (data column, legend name, marker colour), listed in row-major panel order.
panels = [
    ('LandAverageTemperature', 'Land Avg Temp', 'indianred'),
    ('LandMinTemperature', 'Land Min Temp', 'darkorange'),
    ('LandMaxTemperature', 'Land Max Temp', 'skyblue'),
    ('LandAndOceanAverageTemperature', 'Land & Ocean Avg', 'seagreen'),
]

for position, (column, trace_name, color) in enumerate(panels):
    # Convert the flat position 0..3 into 1-based (row, col) grid coordinates.
    row_offset, col_offset = divmod(position, 2)
    grid_row, grid_col = row_offset + 1, col_offset + 1

    fig.add_trace(
        go.Box(
            x=earth_data[column],
            y=earth_data['turnpoint'],
            name=trace_name,
            marker_color=color,
            **box_style
        ),
        row=grid_row,
        col=grid_col,
    )
    fig.update_xaxes(title_text="Temperature (°C)", row=grid_row, col=grid_col)
    fig.update_yaxes(title_text="Time Period", row=grid_row, col=grid_col)

fig.show()

In [9]:
# Load the per-city temperature records. Note that the frame is called
# `countries` even though it is read from the by-city file.
file_path2 = os.path.join(path, "GlobalLandTemperaturesByCity.csv")
countries = pd.read_csv(file_path2)

# Parse the date strings once and derive the calendar year from them.
countries['Date'] = pd.to_datetime(countries['dt'])
countries['year'] = countries['Date'].dt.year

# Numeric columns to average within each group.
numeric_cols = countries.select_dtypes(include='number').columns

# Yearly mean per city; country and coordinates are carried along as keys.
by_year = countries.groupby(
    by=['year', 'City', 'Country', 'Latitude', 'Longitude'],
    as_index=False
)[numeric_cols].mean()

In [10]:
from google.colab import drive

# Country lookup table (name, region, alpha-2 / alpha-3 codes) on the mounted Drive.
CONTINENT_CSV = "/content/drive/My Drive/Colab Notebooks/redesign/all.csv"
# Earliest year kept for the regional analysis.
START_YEAR = 1825

drive.mount('/content/drive')
continent_map = pd.read_csv(CONTINENT_CSV)

# Expose the lookup's `name` column as `Country` so it can serve as the merge
# key, and keep only the region and ISO code columns.
continent_map = continent_map.assign(Country=continent_map['name'])[
    ['Country', 'region', 'alpha-2', 'alpha-3']
]

# The merge below is a left join on the country *name*. Any name spelled
# differently in the two sources receives NaN region/ISO codes, and the
# dropna() calls in the following cells then discard those rows. Report the
# unmatched names so the loss is visible instead of silent.
unmatched = sorted(
    set(by_year['Country'].dropna()) - set(continent_map['Country'].dropna())
)
if unmatched:
    print(f"{len(unmatched)} countries have no region/ISO match:", unmatched)

# Attach region and ISO codes to the yearly city averages, then restrict the
# period to START_YEAR onwards.
data = pd.merge(left=by_year, right=continent_map, on='Country', how='left')
data = data[data['year'] >= START_YEAR]

Mounted at /content/drive


In [11]:
numeric_cols = data.select_dtypes(include='number').columns

# Drop incomplete rows once and reuse the result for all three aggregations
# instead of recomputing dropna() for each of them.
complete_rows = data.dropna(axis=0)

# Yearly mean per region.
region = complete_rows.groupby(['region', 'year'], as_index=False)[numeric_cols].mean()

# Yearly mean per country (within its region).
countries = complete_rows.groupby(
    by=['region', 'Country', 'year'], as_index=False
)[numeric_cols].mean()

# Yearly mean per city, keeping its coordinates as grouping keys.
cities = complete_rows.groupby(
    by=['region', 'Country', 'City', 'year', 'Latitude', 'Longitude'], as_index=False
)[numeric_cols].mean()
print(numeric_cols)

Index(['AverageTemperature', 'AverageTemperatureUncertainty', 'year'], dtype='object')


In [12]:
# Two panels: yearly regional temperature lines on the left, and per-region
# mean / maximum bars on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Average Temperature Trends by Continent",
        "Regional Mean and Max Temperatures",
    ),
    column_widths=[0.65, 0.35],
    horizontal_spacing=0.15,
)

fig.update_layout(
    title="Global Rise in Average Temperatures by Region",
    title_font=dict(size=22, family="Arial", color="#333"),
    template="seaborn",
    hovermode='x unified',
    legend=dict(
        x=0.01,
        y=-0.2,
        orientation="h",
        bgcolor='rgba(0,0,0,0)',
        borderwidth=0,
    ),
    margin=dict(l=50, r=50, t=80, b=80),
    plot_bgcolor="#f9f9f9",
    barmode='group',
)

# Same axis line and grid styling on every x and y axis.
axis_frame = dict(showline=True, linewidth=0.8, linecolor='gray', gridcolor='lightgray')
fig.update_xaxes(**axis_frame)
fig.update_yaxes(**axis_frame)
fig.update_yaxes(title_text="Avg Temperature (°C)", row=1, col=1)
fig.update_yaxes(title_text="Temperature (°C)", row=1, col=2)

# Line colour per region; regions missing from the mapping are drawn in gray.
region_colors = {
    'Europe': 'firebrick',
    'Americas': 'darkorange',
    'Asia': 'deepskyblue',
    'Africa': 'olivedrab',
    'Oceania': 'steelblue'
}

# Left panel: one line per region, ordered by year.
for reg in region['region'].unique():
    yearly = region.loc[region['region'] == reg].sort_values('year')
    line_trace = go.Scatter(
        x=yearly['year'],
        y=yearly['AverageTemperature'],
        mode='lines',
        name=reg,
        line=dict(color=region_colors.get(reg, 'gray'), width=2),
        showlegend=True,
    )
    fig.add_trace(line_trace, row=1, col=1)

# Dotted marker at the year 2000. Kept after the line traces: add_vline skips
# subplots that have no traces by default.
fig.add_vline(x=2000, line=dict(color='gray', dash='dot'), row=1, col=1)

# Right panel: per-region mean and maximum of the yearly regional averages,
# both ordered by ascending mean.
grouped = region.groupby('region')['AverageTemperature']
mean_temps = grouped.mean().sort_values()
max_temps = grouped.max().reindex(mean_temps.index)

bar_specs = [
    ('Mean Temp', mean_temps, 'indianred'),
    ('Max Temp', max_temps, 'sandybrown'),
]
for bar_name, series, bar_color in bar_specs:
    fig.add_trace(
        go.Bar(
            x=series.index,
            y=series.values,
            name=bar_name,
            marker_color=bar_color,
            text=np.round(series.values, 1),
            textposition='outside',
        ),
        row=1,
        col=2,
    )

fig.show()

In [22]:
# Updated figure: animated world map, one marker per country and year,
# with no colour distinction between regions.
numeric_cols = data.select_dtypes(include='number').columns

# Yearly mean per country over complete rows only; the alpha-3 code is kept
# as a grouping key because the map locates countries by it.
country_keys = ['region', 'Country', 'year', 'alpha-3']
climate_df = (
    data.dropna()
    .groupby(country_keys, as_index=False)[numeric_cols]
    .mean()
    # Marker-size column: the average temperature shifted up by 6.
    .assign(TempIndex=lambda frame: frame['AverageTemperature'] + 6)
)

fig = px.scatter_geo(
    climate_df,
    locations='alpha-3',
    hover_name='Country',
    size='TempIndex',
    size_max=18,
    opacity=0.85,
    animation_frame='year',
    projection='natural earth',
    title='Global Surface Temperature Trends Over Time',
)

# Title styling, margins and a data-source note below the map.
source_note = dict(
    text="Data Source: Berkeley Earth Climate Dataset",
    x=0.01,
    y=-0.12,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(size=11, color='gray'),
)
fig.update_layout(
    title_font=dict(size=22, family='Arial', color='darkslategray'),
    margin=dict(l=0, r=0, t=60, b=20),
    annotations=[source_note],
)

# Base map: national borders, coastlines, white land and light-blue ocean.
fig.update_geos(
    showcountries=True,
    countrycolor="gray",
    showcoastlines=True,
    coastlinecolor="lightgray",
    showland=True,
    landcolor="white",
    showocean=True,
    oceancolor="lightblue",
)

fig.show()

In [None]:
# Animated world map, one marker per country and year, coloured by region.
numeric_cols = data.select_dtypes(include='number').columns

# Group by country and year, removing missing values
climate_df = (
    data.dropna()
    .groupby(['region', 'Country', 'year', 'alpha-3'], as_index=False)[numeric_cols]
    .mean()
)

# Marker-size column: the average temperature shifted up by 6.
climate_df['TempIndex'] = climate_df['AverageTemperature'] + 6

# Explicit colour per region. A four-colour sequence is recycled once a fifth
# region appears, which makes two regions share a colour; naming the regions
# avoids that. The four original colours are kept and Oceania gets its own.
# A separate name is used so the region -> colour dict `region_colors` defined
# for the line chart is not overwritten with a different kind of object.
region_palette = {
    'Africa': '#D7263D',
    'Americas': '#1E91D6',
    'Asia': '#2BA84A',
    'Europe': '#FFB30F',
    'Oceania': '#7B2D8E',
}

# Create animated geo-scatter plot
fig = px.scatter_geo(
    climate_df,
    locations='alpha-3',
    color='region',
    color_discrete_map=region_palette,
    hover_name='Country',
    size='TempIndex',
    size_max=18,
    opacity=0.85,
    animation_frame='year',
    projection='natural earth',
    title='Global Surface Temperature Trends Over Time'
)

# Layout customization: title styling, margins and a data-source note.
fig.update_layout(
    title_font=dict(size=22, family='Arial', color='darkslategray'),
    margin=dict(l=0, r=0, t=60, b=20),
    annotations=[
        dict(
            text="Data Source: Berkeley Earth Climate Dataset",
            x=0.01, y=-0.12, xref="paper", yref="paper", showarrow=False,
            font=dict(size=11, color='gray')
        )
    ]
)

fig.show()

In [21]:
# Updated figure: animated world map coloured by average temperature, with
# text labels on a handful of key countries.
key_countries = ['China', 'United States', 'India', 'Brazil', 'Sri Lanka']
climate_df['label'] = climate_df['Country'].apply(lambda x: x if x in key_countries else "")

# Size the markers by a separate shifted column instead of adding 6 to
# 'AverageTemperature' in place. The in-place shift grew by another 6 on every
# run of the cell and leaked into every later use of the column (colour scale,
# mean/max statistics). The shift itself is presumably there to keep marker
# sizes positive - TODO confirm.
climate_df['TempIndex'] = climate_df['AverageTemperature'] + 6

fig = px.scatter_geo(
    climate_df,
    locations='alpha-3',
    color='AverageTemperature',   # unshifted temperature drives the colour scale
    hover_name="Country",
    size="TempIndex",
    size_max=15,
    opacity=0.8,
    animation_frame="year",
    projection="natural earth",
    text="label",
    color_continuous_scale="RdYlBu_r",
    title='Interactive Global Map: Average Temperature Increase by Country'
)

# Styling of the country labels.
fig.update_traces(
    textposition='top center',
    textfont=dict(
        family="Arial Black",
        size=12,
        color="black"
    )
)

fig.update_layout(
    height=1000,
    width=1300,
    geo=dict(
        showland=True,
        landcolor="rgb(243, 243, 243)",
        showcountries=True,
        countrycolor="gray",
        showframe=False,
        center={"lat": 20, "lon": 0},
        projection_scale=1
    ),
    margin={"r": 0, "t": 40, "l": 0, "b": 0},
    annotations=[
        dict(
            text="Source: Climate Change: Earth Surface Temperature Data",
            x=0.5, y=-0.1, xref="paper", yref="paper", showarrow=False
        )
    ],
    # Custom Play / Pause buttons controlling the frame animation.
    updatemenus=[{
        "buttons": [
            {
                "args": [None, {"frame": {"duration": 800, "redraw": True},
                                "fromcurrent": True, "transition": {"duration": 300}}],
                "label": "Play",
                "method": "animate"
            },
            {
                "args": [[None], {"frame": {"duration": 0, "redraw": True},
                                  "mode": "immediate", "transition": {"duration": 0}}],
                "label": "Pause",
                "method": "animate"
            }
        ],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }]
)

fig.show()

In [None]:
# Draft of the animated temperature map.
# NOTE(review): this cell previously styled the figure left over from the
# preceding cell, then built two region-coloured figures that were immediately
# overwritten. Only the figure that was actually displayed is kept, and the
# layout / label styling is now applied to it - confirm this matches the intent.
key_countries = ['China', 'United States', 'India', 'Brazil']
climate_df['label'] = climate_df['Country'].apply(lambda x: x if x in key_countries else "")

# Size the markers by a separate shifted column rather than adding 6 to
# 'AverageTemperature' in place, which compounded on every run of the cell and
# distorted the values read by later cells.
climate_df['TempIndex'] = climate_df['AverageTemperature'] + 6

fig = px.scatter_geo(
    climate_df, locations='alpha-3',
    color='AverageTemperature',
    hover_name="Country", size="TempIndex", size_max=15, opacity=0.8,
    animation_frame="year", projection="natural earth", text="label",
    color_continuous_scale="RdYlBu_r",
    title=' Interactive Global Map: Average Temperature Increase by Country'
)

# Styling of the country labels.
fig.update_traces(textposition='top center', textfont_size=10)

fig.update_layout(
    height=1000,
    width=1300,
    geo=dict(
        center={"lat": 20, "lon": 0},
        projection_scale=1,
        showland=True,
        landcolor="rgb(243, 243, 243)",
        showcountries=True,
        countrycolor="gray"
    ),
    margin={"r":0,"t":40,"l":0,"b":0},
    annotations=[
        dict(text="Source: Climate Change: Earth Surface Temperature Data",
             x=0.5, y=-0.1, xref="paper", yref="paper", showarrow=False)
    ],
    # Custom Play / Pause buttons controlling the frame animation.
    updatemenus=[{
        "buttons": [
            {
                "args": [None, {"frame": {"duration": 800, "redraw": True},
                                "fromcurrent": True, "transition": {"duration": 300}}],
                "label": "Play",
                "method": "animate"
            },
            {
                "args": [[None], {"frame": {"duration": 0, "redraw": True},
                                  "mode": "immediate", "transition": {"duration": 0}}],
                "label": "Pause",
                "method": "animate"
            }
        ],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }]
)
fig.show()

In [20]:
# Updated figure: static world map where colour shows each country's overall
# mean and marker size shows how far its maximum yearly value lies above it.
group_keys = ['region', 'Country', 'alpha-3']
per_country = climate_df.groupby(group_keys)['AverageTemperature']

mean = per_country.mean().reset_index()
maximum = per_country.max().reset_index()

# After the merge the `_x` column holds the mean and the `_y` column the maximum.
difference = mean.merge(maximum, on=group_keys)
difference['diff'] = (
    difference['AverageTemperature_y'] - difference['AverageTemperature_x']
)
difference = difference.rename(columns={
    'AverageTemperature_x': 'Overall Avg Temp',
    'AverageTemperature_y': 'Maximum Average Temperature',
})

# Colour ramp passed to the continuous colour scale.
variability_scale = ('#283747', '#2874A6', '#3498DB', '#F5B041', '#E67E22', '#A93226')

fig = px.scatter_geo(
    difference,
    locations="alpha-3",
    color="Overall Avg Temp",
    hover_name="Country",
    size="diff",
    size_max=15,
    projection="natural earth",
    opacity=0.8,
    color_continuous_scale=variability_scale,
    title='Global Temperature Variability Map: Mean vs. Maximum',
)

map_style = dict(
    showland=True,
    landcolor="rgb(243, 243, 243)",
    showcountries=True,          # show national borders
    countrycolor="gray",         # colour of the national borders
    showframe=False,
    center={"lat": 20, "lon": 0},
    projection_scale=1,
)
fig.update_layout(geo=map_style, margin={"r": 0, "t": 50, "l": 0, "b": 0})

fig.show()

In [None]:
# Calculating the difference column: per-country overall mean, maximum yearly
# value, and the gap between the two.
group_keys = ['region', 'Country', 'alpha-3']
mean = climate_df.groupby(group_keys)['AverageTemperature'].mean().reset_index()
maximum = climate_df.groupby(group_keys)['AverageTemperature'].max().reset_index()

# After the merge the `_x` column holds the mean and the `_y` column the maximum.
difference = pd.merge(left=mean, right=maximum, on=group_keys)
difference['diff'] = difference['AverageTemperature_y'] - difference['AverageTemperature_x']

# Each suffix gets its own label. Previously the `_y` key appeared twice, so
# the maximum was labelled 'Overall Avg Temp' and the mean was never renamed;
# a stray trailing character also made the cell a syntax error.
difference = difference.rename(columns={
    'AverageTemperature_y': 'Maximum Average Temperature',
    'AverageTemperature_x': 'Overall Avg Temp',
})

fig = px.scatter_geo(difference, locations="alpha-3", color="Overall Avg Temp",
                     hover_name="Country", size="diff", size_max=15,
                     projection="natural earth", opacity = 0.8,
                     color_continuous_scale=('#283747', '#2874A6', '#3498DB', '#F5B041', '#E67E22', '#A93226'),
                     title = 'Global Temperature Variability Map: Mean vs. Maximum')
fig.show()