In [None]:
# 1. Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot settings: use seaborn's "whitegrid" style for the figures in this notebook
sns.set(style="whitegrid")


In [None]:
# 2. Load Dataset
file_path = '../data/Enhanced_Regression_Ready.csv'
drop_cols = ['Month_Sin', 'Month_Cos', 'Unique_Diagnoses', 'Egg_Peritonitis_Flag']

# Read the CSV and discard the columns named in drop_cols in one chained step;
# errors='ignore' skips any of those names that are not present in the file.
data = pd.read_csv(file_path).drop(columns=drop_cols, errors='ignore')

# Preview the first rows
display(data.head())


Unnamed: 0,Year,Month,Region,Diagnoses,Case count,Season,Month_Num,Last_Month_Case_Count,Last_Year_Same_Month
0,2020,March,East Midlands,"Coccidiosis, Helminthosis",3,Autumn,3,0,0
1,2021,April,East Midlands,"Egg peritonitis and salpingitis, Infectious br...",6,Autumn,4,3,0
2,2022,October,East Midlands,Marek's Disease,1,Spring,10,6,0
3,2023,March,East Midlands,"Egg peritonitis and salpingitis, Marek's Disea...",3,Autumn,3,1,3
4,2023,April,East Midlands,"Neoplasm (other, including adenocarcinoma)",1,Autumn,4,3,6


In [8]:
# 3. Basic Data Overview
# Column dtypes and non-null counts (info() writes directly to stdout)
print("Data Info:")
data.info()

# Number of null cells in each column
null_counts = data.isna().sum()
print("\nMissing values:")
print(null_counts)

# Descriptive statistics (count, mean, std, quartiles, min/max)
numeric_summary = data.describe()
print("\nSummary statistics (numerical):")
display(numeric_summary)


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274 entries, 0 to 273
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Year                   274 non-null    int64 
 1   Month                  274 non-null    object
 2   Region                 274 non-null    object
 3   Diagnoses              274 non-null    object
 4   Case count             274 non-null    int64 
 5   Season                 274 non-null    object
 6   Month_Num              274 non-null    int64 
 7   Last_Month_Case_Count  274 non-null    int64 
 8   Last_Year_Same_Month   274 non-null    int64 
dtypes: int64(5), object(4)
memory usage: 19.4+ KB

Missing values:
Year                     0
Month                    0
Region                   0
Diagnoses                0
Case count               0
Season                   0
Month_Num                0
Last_Month_Case_Count    0
Last_Year_Same_Month     0
dtype: int64


Unnamed: 0,Year,Case count,Month_Num,Last_Month_Case_Count,Last_Year_Same_Month
count,274.0,274.0,274.0,274.0,274.0
mean,2020.255474,2.40146,6.675182,2.346715,1.536496
std,2.003766,1.959143,3.170428,2.003716,1.970604
min,2017.0,1.0,1.0,0.0,0.0
25%,2019.0,1.0,4.0,1.0,0.0
50%,2021.0,2.0,7.0,2.0,1.0
75%,2022.0,3.0,9.0,3.0,2.0
max,2023.0,11.0,12.0,11.0,11.0


In [24]:
# Per-column count of null cells in the current state of `data`
print(data.isna().sum())

Year                     0
Month                    0
Region                   0
Diagnoses                0
Case count               0
Season                   0
Month_Num                0
Last_Month_Case_Count    0
Last_Year_Same_Month     0
Date                     0
dtype: int64


In [25]:
# Replace nulls with a per-column default: 0 cases, and 'Unknown' for a missing region
fill_defaults = {'Case count': 0, 'Region': 'Unknown'}
for column_name, default_value in fill_defaults.items():
    data[column_name] = data[column_name].fillna(default_value)


In [26]:
# Count rows that are exact repeats of an earlier row
duplicate_rows = data.duplicated().sum()
print("Duplicates:", duplicate_rows)


Duplicates: 0


In [29]:
# List every Region / Month-Year combination whose summed case count exceeds 1.

# Step 1: Create "Month-Year" column.
# Built with vectorised string concatenation rather than a row-wise apply(axis=1).
# The column is stored on `data` itself, so it is also part of anything written
# from `data` after this cell runs.
data['Month_Year'] = data['Month'].astype(str) + '-' + data['Year'].astype(str)

# Step 2: Group by Region and Month-Year, summing Case count.
# groupby sorts its keys, so Month_Year appears in alphabetical order
# (e.g. "April-2021" before "March-2020"), not chronological order.
grouped = data.groupby(['Region', 'Month_Year'])['Case count'].sum().reset_index()

# Step 3: Filter where Case count > 1
filtered = grouped[grouped['Case count'] > 1]

# Step 4: Display the results (column-wise zip instead of iterrows)
print("Region and Month-Year combinations with more than 1 case:\n")
for region_name, month_year, case_total in zip(
    filtered['Region'], filtered['Month_Year'], filtered['Case count']
):
    print(f"{region_name} — {month_year}: {int(case_total)} cases")


Region and Month-Year combinations with more than 1 case:

East Midlands — April-2021: 6 cases
East Midlands — March-2020: 3 cases
East Midlands — March-2023: 3 cases
East of England — August-2020: 5 cases
East of England — August-2022: 3 cases
East of England — December-2017: 2 cases
East of England — February-2017: 2 cases
East of England — January-2022: 2 cases
East of England — July-2020: 2 cases
East of England — July-2021: 6 cases
East of England — July-2022: 3 cases
East of England — June-2023: 3 cases
East of England — March-2023: 7 cases
East of England — May-2022: 2 cases
East of England — May-2023: 3 cases
East of England — November-2023: 2 cases
East of England — October-2023: 2 cases
East of England — September-2020: 4 cases
East of England — September-2022: 3 cases
London — October-2023: 2 cases
North East — December-2017: 2 cases
North East — October-2017: 2 cases
North West — April-2017: 2 cases
North West — July-2022: 6 cases
North West — June-2017: 2 cases
North West 

In [27]:
# Persist the current state of `data` to a new CSV file;
# index=False keeps the row index out of the output.
clean_path = '../data/cleaned_avian_data.csv'
data.to_csv(path_or_buf=clean_path, index=False)


In [33]:
!pip install meteostat

Collecting meteostat
  Downloading meteostat-1.6.8-py3-none-any.whl (31 kB)
Installing collected packages: meteostat
Successfully installed meteostat-1.6.8


In [None]:

# Weather enrichment: for every row of the cleaned avian data, look up the
# monthly weather for that row's region/year/month via Meteostat and store it
# in new columns, then write the result to a new CSV.
import calendar
from datetime import datetime

import numpy as np  # required for the NaN placeholders below; no earlier cell imports it
from meteostat import Point, Monthly
from tqdm import tqdm

# Load cleaned avian data
avian_df = pd.read_csv("../data/cleaned_avian_data.csv")

# Ensure Month_Num exists (or convert Month name to Month_Num).
# calendar.month_name has an empty string at index 0, so "January" maps to 1.
if "Month_Num" not in avian_df.columns:
    month_names = list(calendar.month_name)  # built once instead of once per row
    avian_df["Month_Num"] = avian_df["Month"].apply(month_names.index)

# Coordinates mapping: one Meteostat Point per region name
region_coords = {
    "East Midlands": Point(52.9, -1.0),
    "East of England": Point(52.3, 0.0),
    "London": Point(51.5, -0.1),
    "North East": Point(54.8, -1.6),
    "North West": Point(53.8, -2.4),
    "Scotland": Point(56.5, -4.2),
    "South East": Point(51.2, 0.5),
    "South West": Point(50.8, -3.5),
    "Wales": Point(52.3, -3.6),
    "West Midlands": Point(52.5, -2.0),
    "Yorkshire and The Humber": Point(53.9, -1.3),
}

# Add weather columns, initialised to NaN so rows without data are easy to spot
avian_df["Avg_Temp_C"] = np.nan
avian_df["Avg_Humidity"] = np.nan

# (region, year, month) -> (tavg, rhum); avoids repeating an identical request
weather_cache = {}
# Region values that have no entry in region_coords
unmapped_regions = set()

# Fetch Meteostat weather data
for idx, row in tqdm(avian_df.iterrows(), total=len(avian_df), desc="Fetching weather"):
    region = row["Region"]
    year = int(row["Year"])
    month = int(row["Month_Num"])

    if region not in region_coords:
        unmapped_regions.add(region)
        continue

    cache_key = (region, year, month)
    if cache_key not in weather_cache:
        start = datetime(year, month, 1)
        try:
            # start == end: request the single month beginning at `start`
            weather_data = Monthly(region_coords[region], start, start).fetch()
            if weather_data.empty:
                readings = (pd.NA, pd.NA)
            else:
                first_month = weather_data.iloc[0]
                # NOTE(review): Meteostat's Monthly data may not include an
                # "rhum" column (TODO confirm against the Meteostat docs). If it
                # does not, this lookup always yields NA and Avg_Humidity stays
                # NaN for every row, even after the mean fill below.
                readings = (
                    first_month.get("tavg", pd.NA),
                    first_month.get("rhum", pd.NA),
                )
        except Exception as e:
            # Best effort: report the failure and leave this row as NaN.
            # Failures are not cached, so a later row with the same key retries.
            print(f"Error for {region} - {month}/{year}: {e}")
            continue
        weather_cache[cache_key] = readings

    temp_val, hum_val = weather_cache[cache_key]

    if pd.notna(temp_val):
        avian_df.at[idx, "Avg_Temp_C"] = round(temp_val, 1)

    if pd.notna(hum_val):
        avian_df.at[idx, "Avg_Humidity"] = round(hum_val, 1)

# Surface rows that were skipped because their region has no coordinates;
# they receive the overall column mean in the fill step below.
if unmapped_regions:
    print(f"No coordinates for regions: {sorted(map(str, unmapped_regions))}")

# Fill missing humidity and temperature with the mean of the available values.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is not guaranteed to update avian_df under pandas Copy-on-Write.
for weather_col in ("Avg_Humidity", "Avg_Temp_C"):
    avian_df[weather_col] = avian_df[weather_col].fillna(avian_df[weather_col].mean())

# Save result
avian_df.to_csv("../data/cleaned_avian_with_weather.csv", index=False)
print("✅ Weather-enhanced dataset saved.")


Fetching weather: 100%|██████████| 274/274 [00:14<00:00, 18.92it/s]

✅ Weather-enhanced dataset saved.





In [47]:
import pandas as pd

# Load the merged dataset
df = pd.read_csv("../data/cleaned_avian_with_weather.csv")

# Drop the Avg_Humidity column.
# errors='ignore' keeps this cell re-runnable: the cell overwrites the file it
# reads, so on a second run the column is already gone and a plain drop would
# raise KeyError.
df = df.drop(columns=['Avg_Humidity'], errors='ignore')

# Save the cleaned file back (overwrites the input file in place)
df.to_csv("../data/cleaned_avian_with_weather.csv", index=False)

print("Avg_Humidity column removed and file updated: cleaned_avian_with_weather.csv")


Avg_Humidity column removed and file updated: cleaned_avian_with_weather.csv
