In [None]:
!pip install pandas scikit-learn matplotlib seaborn

In [77]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [90]:
# Read the historical daily weather records from a local CSV export.
# Example source: https://meteostat.net/en/place/us/boston?s=72509&t=2020-01-01/2023-12-31 (Logan Intl Station)
weather_data = pd.read_csv(filepath_or_buffer="2020-2023-BostonMA.csv")

# Preview the top five rows as a quick sanity check on columns and values
print(weather_data.head(n=5))

         date  tavg  tmin  tmax  prcp  snow  wdir  wspd  wpgt    pres  tsun
0  2020-01-01   4.1   2.2   6.1   0.0   NaN   268  24.8   NaN  1003.0   NaN
1  2020-01-02   4.2   1.1   9.4   0.0   NaN   231  20.2   NaN  1010.7   NaN
2  2020-01-03   8.2   6.7  11.1   0.0   NaN   231  11.5   NaN  1008.2   NaN
3  2020-01-04   8.1   4.4   8.9   3.3   NaN    14   8.6   NaN  1004.0   NaN
4  2020-01-05   3.9   0.0   5.0   2.8   NaN   308  26.6   NaN  1004.1   NaN


In [91]:
# Build the modelling frame in one pass:
#   1. keep only the date and the three temperature columns,
#   2. parse 'date' so calendar features can be derived from it,
#   3. add 'month' and 'day_of_year',
#   4. discard the raw 'date' column, which is no longer needed.
weather_data = (
    weather_data.loc[:, ['date', 'tavg', 'tmin', 'tmax']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        month=lambda frame: frame['date'].dt.month,
        day_of_year=lambda frame: frame['date'].dt.dayofyear,
    )
    .drop(columns=['date'])
)

# Inspect the reshaped data
print(weather_data.head())

   tavg  tmin  tmax  month  day_of_year
0   4.1   2.2   6.1      1            1
1   4.2   1.1   9.4      1            2
2   8.2   6.7  11.1      1            3
3   8.1   4.4   8.9      1            4
4   3.9   0.0   5.0      1            5


In [92]:
def make_predictions(data=None, winter_months=(12, 1, 2), max_winter_temp=15):
  """Train a random forest on calendar features and report its test-set error.

  The model predicts ``tavg`` from ``day_of_year`` and ``month`` using an 80/20
  train/test split. The function prints the Mean Absolute Error on the test
  split, followed by every test-set prediction that falls in a winter month
  yet exceeds ``max_winter_temp``.

  Parameters
  ----------
  data : pandas.DataFrame, optional
      Frame with ``day_of_year``, ``month`` and ``tavg`` columns. When omitted,
      the notebook-level ``weather_data`` frame is used (looked up at call
      time, so later modifications to it are picked up).
  winter_months : iterable of int, optional
      Month numbers treated as winter. Defaults to December, January and
      February, so the function no longer depends on a ``winter_months``
      global being defined before the first call.
  max_winter_temp : float, optional
      Predictions above this value in a winter month are reported as
      unrealistic. Defaults to 15.

  Returns
  -------
  None
      Results are printed rather than returned.
  """
  if data is None:
    data = weather_data  # default to the notebook's shared frame

  # isin() needs a concrete collection; this also accepts generators/tuples
  winter_months = list(winter_months)

  # Prepare the data: features and target
  features = data[['day_of_year', 'month']].copy()
  target = data['tavg']

  # Split the data into training and test sets (fixed seed for reproducibility)
  X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

  # Initialize and train the model
  model = RandomForestRegressor(n_estimators=100, random_state=42)
  model.fit(X_train, y_train)

  # Predictions on the test set after training
  y_pred = model.predict(X_test)

  # The error is measured against the test labels, whatever their quality
  mae = mean_absolute_error(y_test, y_pred)
  print(f"Mean Absolute Error: {mae:.2f}")

  # Flag unrealistic predictions (winter month but warmer than the threshold).
  # y_pred is positionally aligned with X_test, so the mask is converted to a
  # plain boolean array and applied positionally to both.
  unrealistic_predictions_mask = (
    X_test['month'].isin(winter_months) & (y_pred > max_winter_temp)
  ).to_numpy()

  # Pair each flagged prediction with its day of year, keeping the original row labels
  unrealistic_predictions_df = pd.DataFrame({
    'day_of_year': X_test.loc[unrealistic_predictions_mask, 'day_of_year'],
    'y_pred': y_pred[unrealistic_predictions_mask]
  })

  print("Unrealistic predictions:", unrealistic_predictions_df)

In [93]:
# Baseline run: train and evaluate on the original, unmodified data
make_predictions();

Mean Absolute Error: 3.67
Unrealistic predictions: Empty DataFrame
Columns: [day_of_year, y_pred]
Index: []


In [94]:
# Simulate data poisoning: Modify tmin, tmax, and recalculate tavg during winter months (Dec, Jan, Feb)
winter_months = [12, 1, 2]
poisoned_indices = weather_data[weather_data['month'].isin(winter_months)].index

# Add 30 to tmin and tmax
weather_data.loc[poisoned_indices, 'tmin'] = weather_data.loc[poisoned_indices, 'tmin'] + 30
weather_data.loc[poisoned_indices, 'tmax'] = weather_data.loc[poisoned_indices, 'tmax'] + 30

# Recalculate tavg as the average of tmin and tmax
weather_data.loc[poisoned_indices, 'tavg'] = (weather_data.loc[poisoned_indices, 'tmin'] + weather_data.loc[poisoned_indices, 'tmax']) / 2

# Check poisoned data
print(weather_data.loc[poisoned_indices])

       tavg  tmin  tmax  month  day_of_year
0     34.15  32.2  36.1      1            1
1     35.25  31.1  39.4      1            2
2     38.90  36.7  41.1      1            3
3     36.65  34.4  38.9      1            4
4     32.50  30.0  35.0      1            5
...     ...   ...   ...    ...          ...
1456  38.35  35.0  41.7     12          361
1457  36.40  35.6  37.2     12          362
1458  36.10  34.4  37.8     12          363
1459  34.75  31.7  37.8     12          364
1460  32.80  30.6  35.0     12          365

[361 rows x 5 columns]


In [95]:
# Second run: retrain and evaluate after the winter rows have been poisoned
make_predictions();

Mean Absolute Error: 3.70
Unrealistic predictions:       day_of_year     y_pred
1105           10  31.756769
413            48  37.569842
1108           13  33.019493
1113           18  28.788661
1102            7  31.229807
...           ...        ...
344           345  36.323325
1088          358  31.231282
350           351  37.836751
1449          354  29.636470
722           357  32.357612

[75 rows x 2 columns]


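**Note:** the MAE barely changes between the clean and poisoned runs (3.67 vs. 3.70). The error is measured as the deviation from the held-out labels — which were poisoned along with the training data — not as real-world accuracy, so the metric cannot reveal the tampering. The model is strictly dependent on the integrity of the dataset!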