In [6]:
import pandas as pd

# Load the per-country table from disk and preview its first rows.
heat_data = pd.read_csv("heat_index_by_country.csv")
print("First 5 rows:")
print(heat_data.head(5))

# Report (rows, columns) of the loaded table.
print(f"\nDataset shape: {heat_data.shape}")

# Keep only rows whose `difference` exceeds 6.0, then list them with the
# largest `difference` first.
extreme_changes = heat_data.loc[heat_data['difference'].gt(6.0)]
summary_columns = ['country_code', 'country_name', 'difference', 'data_points']
print(f"\nCountries with temperature changes > 6°F:")
print(extreme_changes.loc[:, summary_columns].sort_values(by='difference', ascending=False))

First 5 rows:
  country_code country_name  heat_index_2000  heat_index_2025  difference  \
0          LUX   Luxembourg        80.218872        91.450301   11.231429   
1          BEL      Belgium        80.679970        91.738399   11.058430   
2          DEU      Germany        81.185196        86.712464    5.527268   
3          BTN       Bhutan        96.711264       101.056818    4.345554   
4          LKA    Sri Lanka        89.506984        93.820279    4.313295   

   percent_change  data_points  
0       14.000981            4  
1       13.706537            7  
2        6.808222           99  
3        4.493327           87  
4        4.818948          265  

Dataset shape: (122, 7)

Countries with temperature changes > 6°F:
  country_code country_name  difference  data_points
0          LUX   Luxembourg   11.231429            4
1          BEL      Belgium   11.058430            7


In [7]:
# Statistical analysis of data points and temperature differences
import numpy as np

print("=== DATA POINTS ANALYSIS ===")
print(f"Data points statistics:")
# Pull the column once; every statistic below reads from the same Series.
points_per_country = heat_data['data_points']
print(f"Mean: {points_per_country.mean():.1f}")
print(f"Median: {points_per_country.median():.1f}")
print(f"Min: {points_per_country.min()}")
print(f"Max: {points_per_country.max()}")

# Rows with ten or fewer data points, shown with the largest `difference` first.
print(f"\nCountries with <= 10 data points:")
report_columns = ['country_code', 'country_name', 'difference', 'data_points']
low_data_countries = heat_data.loc[points_per_country.le(10)]
print(low_data_countries.loc[:, report_columns].sort_values(by='difference', ascending=False))

print(f"\n=== TEMPERATURE DIFFERENCE ANALYSIS ===")
print(f"Temperature difference statistics:")
change_per_country = heat_data['difference']
print(f"Mean: {change_per_country.mean():.2f}°F")
print(f"Median: {change_per_country.median():.2f}°F")
print(f"Standard deviation: {change_per_country.std():.2f}°F")

# IQR method: fences sit 1.5 * IQR outside the first and third quartiles.
# Only the upper fence is applied below; lower_bound is computed but not
# used in this cell.
Q1 = change_per_country.quantile(0.25)
Q3 = change_per_country.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"\nOutlier detection (IQR method):")
print(f"Q1: {Q1:.2f}°F, Q3: {Q3:.2f}°F")
print(f"Upper bound for outliers: {upper_bound:.2f}°F")

# Rows strictly above the upper fence.
outliers = heat_data.loc[change_per_country.gt(upper_bound)]
print(f"\nOutliers (> {upper_bound:.2f}°F):")
if outliers.empty:
    print("No outliers detected")
else:
    print(outliers.loc[:, report_columns])

=== DATA POINTS ANALYSIS ===
Data points statistics:
Mean: 1946.3
Median: 670.5
Min: 4
Max: 30764

Countries with <= 10 data points:
  country_code country_name  difference  data_points
0          LUX   Luxembourg   11.231429            4
1          BEL      Belgium   11.058430            7
6          LTU    Lithuania    3.856823            6

=== TEMPERATURE DIFFERENCE ANALYSIS ===
Temperature difference statistics:
Mean: 0.79°F
Median: 1.01°F
Standard deviation: 2.51°F

Outlier detection (IQR method):
Q1: -0.10°F, Q3: 1.81°F
Upper bound for outliers: 4.68°F

Outliers (> 4.68°F):
  country_code country_name  difference  data_points
0          LUX   Luxembourg   11.231429            4
1          BEL      Belgium   11.058430            7
2          DEU      Germany    5.527268           99


In [8]:
# Compare how `difference` behaves for sparsely vs. densely sampled countries
print("=== RELIABILITY ANALYSIS ===")

# Two groups by sample size: at most 10 data points, and at least 100.
# Rows with 11-99 data points belong to neither group.
sample_sizes = heat_data['data_points']
few_data = heat_data.loc[sample_sizes.le(10)]
many_data = heat_data.loc[sample_sizes.ge(100)]

few_change = few_data['difference']
print(f"Countries with ≤10 data points ({len(few_data)} countries):")
print(f"  Mean temperature change: {few_change.mean():.2f}°F")
print(f"  Std deviation: {few_change.std():.2f}°F")

many_change = many_data['difference']
print(f"\nCountries with ≥100 data points ({len(many_data)} countries):")
print(f"  Mean temperature change: {many_change.mean():.2f}°F")
print(f"  Std deviation: {many_change.std():.2f}°F")

# Correlate sample size with the magnitude (absolute value) of the change,
# over all rows of heat_data.
absolute_change = heat_data['difference'].abs()
correlation = sample_sizes.corr(absolute_change)
print(f"\nCorrelation between data points and |temperature change|: {correlation:.3f}")

# Hand-written summary: the figures quoted in these lines are fixed text,
# not values computed by this cell.
conclusion_lines = (
    f"\n=== CONCLUSION FOR LUX & BEL ===",
    "Luxembourg and Belgium show:",
    "1. Very high temperature increases (11.23°F and 11.06°F)",
    "2. Very few data points (4 and 7 respectively)",
    "3. Are statistical outliers (>4.68°F threshold)",
    "4. Likely unreliable due to insufficient spatial/temporal coverage",
    "\nRecommendation: These extreme values should be treated with caution",
    "due to low sample size and may not represent true country-wide trends.",
)
for conclusion_line in conclusion_lines:
    print(conclusion_line)

=== RELIABILITY ANALYSIS ===
Countries with ≤10 data points (3 countries):
  Mean temperature change: 8.72°F
  Std deviation: 4.21°F

Countries with ≥100 data points (108 countries):
  Mean temperature change: 0.62°F
  Std deviation: 1.88°F

Correlation between data points and |temperature change|: -0.145

=== CONCLUSION FOR LUX & BEL ===
Luxembourg and Belgium show:
1. Very high temperature increases (11.23°F and 11.06°F)
2. Very few data points (4 and 7 respectively)
3. Are statistical outliers (>4.68°F threshold)
4. Likely unreliable due to insufficient spatial/temporal coverage

Recommendation: These extreme values should be treated with caution
due to low sample size and may not represent true country-wide trends.


In [10]:
# Create a filtered dataset that blanks out countries with too few data points
print("=== CREATING FILTERED DATASET ===")

# Minimum number of data points a country needs for its values to be kept.
# Defined once so the mask and every message below always agree.
MIN_DATA_POINTS = 20

# Columns whose values are blanked for countries below the threshold.
VALUE_COLUMNS = ['heat_index_2000', 'heat_index_2025', 'difference', 'percent_change']

# Work on a copy so heat_data itself is left untouched
filtered_data = heat_data.copy()

# True for every country below the threshold
insufficient_data_mask = filtered_data['data_points'] < MIN_DATA_POINTS

print(f"Countries being marked as 'Insufficient data' (< {MIN_DATA_POINTS} data points):")
insufficient_countries = filtered_data[insufficient_data_mask][['country_code', 'country_name', 'difference', 'data_points']].sort_values('data_points')
print(insufficient_countries)

# Blank the measured values (they display as NaN afterwards) and flag the row.
# Rows that are not flagged receive no value in the new 'note' column.
filtered_data.loc[insufficient_data_mask, VALUE_COLUMNS] = None
filtered_data.loc[insufficient_data_mask, 'note'] = 'Insufficient data'

print(f"\nAfter filtering:")
print(f"Original dataset: {len(heat_data)} countries")
print(f"Countries with heat index data: {len(filtered_data.dropna(subset=['difference']))} countries")
print(f"Countries marked as insufficient data: {len(filtered_data[filtered_data['note'] == 'Insufficient data'])}")

# Show the modified entries
print(f"\nModified entries:")
print(filtered_data[insufficient_data_mask][['country_code', 'country_name', 'heat_index_2000', 'heat_index_2025', 'difference', 'note']])

# Save the filtered dataset (all rows kept; blanked rows carry the note).
# NOTE(review): a later cell in this notebook writes a row-dropped table
# without the 'note' column to this same path - confirm which version
# downstream readers are meant to load.
filtered_data.to_csv("heat_index_by_country_filtered.csv", index=False)
print(f"\nFiltered dataset saved as 'heat_index_by_country_filtered.csv'")

# Statistics over the rows that still have a `difference` value
valid_data = filtered_data.dropna(subset=['difference'])
print(f"\n=== NEW STATISTICS (excluding countries with < {MIN_DATA_POINTS} data points) ===")
print(f"Temperature difference statistics:")
print(f"Mean: {valid_data['difference'].mean():.2f}°F")
print(f"Median: {valid_data['difference'].median():.2f}°F")
print(f"Standard deviation: {valid_data['difference'].std():.2f}°F")
print(f"Range: {valid_data['difference'].min():.2f}°F to {valid_data['difference'].max():.2f}°F")

print(f"\nData points statistics for remaining countries:")
print(f"Min data points: {valid_data['data_points'].min()}")
print(f"Mean data points: {valid_data['data_points'].mean():.1f}")
print(f"Median data points: {valid_data['data_points'].median():.1f}")

=== CREATING FILTERED DATASET ===
Countries being marked as 'Insufficient data' (< 20 data points):
    country_code country_name  difference  data_points
0            LUX   Luxembourg   11.231429            4
6            LTU    Lithuania    3.856823            6
1            BEL      Belgium   11.058430            7
104          SWE       Sweden   -0.889502           14

After filtering:
Original dataset: 122 countries
Countries with heat index data: 118 countries
Countries marked as insufficient data: 4

Modified entries:
    country_code country_name  heat_index_2000  heat_index_2025  difference  \
0            LUX   Luxembourg              NaN              NaN         NaN   
1            BEL      Belgium              NaN              NaN         NaN   
6            LTU    Lithuania              NaN              NaN         NaN   
104          SWE       Sweden              NaN              NaN         NaN   

                  note  
0    Insufficient data  
1    Insufficient data 

In [None]:
import pandas as pd

# Reload the unfiltered table from disk.
heat_data = pd.read_csv("heat_index_by_country.csv")

# A row survives only if it is neither Luxembourg nor Belgium (by country
# code) and it has at least 20 data points.
excluded_country = heat_data['country_code'].isin(['LUX', 'BEL'])
enough_points = heat_data['data_points'].ge(20)
heat_data = heat_data.loc[~excluded_country & enough_points]

# Write the surviving rows out, omitting the index column.
heat_data.to_csv("heat_index_by_country_filtered.csv", index=False)
print("Filtered dataset saved as 'heat_index_by_country_filtered.csv'")
print("Data processing complete.")



Filtered dataset saved as 'heat_index_by_country_filtered.csv'
Data processing complete.


In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Number of countries listed at each end of the ranking. The section headers
# below are built from this value so they cannot disagree with the number of
# rows actually selected (the headers previously said 10 while 5 were taken).
TOP_N = 5

# Columns shown on screen and written to the export file.
EXPORT_COLUMNS = ['country_name', 'heat_index_2000', 'heat_index_2025', 'difference', 'data_points']

# Load the filtered table from disk and preview it.
heat_data = pd.read_csv("heat_index_by_country_filtered.csv")
print(heat_data.head())

# Rows with the largest values of `difference`
print(f"=== TOP {TOP_N} HIGHEST TEMPERATURE CHANGES ===")
top_changes = heat_data.nlargest(TOP_N, 'difference')
print(top_changes[EXPORT_COLUMNS])

# Rows with the smallest values of `difference`
print(f"\n=== TOP {TOP_N} LOWEST TEMPERATURE CHANGES ===")
bottom_changes = heat_data.nsmallest(TOP_N, 'difference')
print(bottom_changes[EXPORT_COLUMNS])

# Add the United States to the top changes for comparison.
# The row is appended after the top list has been printed, so it appears in
# the export only.
us_data = heat_data[heat_data['country_code'] == 'USA']
if not us_data.empty:
    # .assign() returns a new frame, so the relabel never writes into a
    # slice of heat_data (direct column assignment here raises
    # SettingWithCopyWarning on pandas versions before 3.0).
    us_data = us_data[EXPORT_COLUMNS].assign(country_name='United States')
    top_changes = pd.concat([top_changes, us_data], ignore_index=True)

# Combine top and bottom changes for export: top rows first, then bottom rows
biggest_changes = pd.concat(
    [top_changes[EXPORT_COLUMNS], bottom_changes[EXPORT_COLUMNS]],
    ignore_index=True,
)

# Export to CSV without the index column
biggest_changes.to_csv("heat_index_biggest_changes.csv", index=False)

  country_code country_name  heat_index_2000  heat_index_2025  difference  \
0          DEU      Germany        81.185196        86.712464    5.527268   
1          BTN       Bhutan        96.711264       101.056818    4.345554   
2          LKA    Sri Lanka        89.506984        93.820279    4.313295   
3          FRA       France        82.443059        86.309001    3.865942   
4          NPL        Nepal        98.609327       102.312580    3.703253   

   percent_change  data_points  
0        6.808222           99  
1        4.493327           87  
2        4.818948          265  
3        4.689226          738  
4        3.755480          301  
=== TOP 10 HIGHEST TEMPERATURE CHANGES ===
  country_name  heat_index_2000  heat_index_2025  difference  data_points
0      Germany        81.185196        86.712464    5.527268           99
1       Bhutan        96.711264       101.056818    4.345554           87
2    Sri Lanka        89.506984        93.820279    4.313295          265


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the filtered table from disk and preview its first rows.
filtered_csv_path = "heat_index_by_country_filtered.csv"
heat_data = pd.read_csv(filtered_csv_path)
print(heat_data.head())

# Average of the `difference` column over every row in the loaded file.
mean_difference = heat_data.loc[:, 'difference'].mean()
print(f"\nMean temperature change: {mean_difference:.2f}°F")

  country_code country_name  heat_index_2000  heat_index_2025  difference  \
0          DEU      Germany        81.185196        86.712464    5.527268   
1          BTN       Bhutan        96.711264       101.056818    4.345554   
2          LKA    Sri Lanka        89.506984        93.820279    4.313295   
3          FRA       France        82.443059        86.309001    3.865942   
4          NPL        Nepal        98.609327       102.312580    3.703253   

   percent_change  data_points  
0        6.808222           99  
1        4.493327           87  
2        4.818948          265  
3        4.689226          738  
4        3.755480          301  

Mean temperature change: 0.60°F
