In [1]:
import pandas as pd
import numpy as np

### Dataset creation

In [4]:
# Load the 2023 ground-truth CSV; the path is relative to the notebook's
# working directory.
ground_truth_csv = "AirQuality/Dataset/Ground_Truth_2023_Final.csv"
gt_df = pd.read_csv(ground_truth_csv)

# Preview the first five rows.
gt_df.head(n=5)

Unnamed: 0,city,state,YearMonth,AT,BP,PM2.5,RF,VWS,WD,WS,latitude,longitude
0,Agartala,Tripura,2023-01,,750.0,196.040103,0.04416,,195.440729,0.480669,23.81755,91.272697
1,Agartala,Tripura,2023-02,,750.0,170.874875,0.000263,,245.93652,0.6869,23.81755,91.272697
2,Agartala,Tripura,2023-03,,750.0,119.490881,0.009865,,190.55045,0.605829,23.81755,91.272697
3,Agartala,Tripura,2023-04,,749.961354,94.205356,0.027917,,205.8346,0.61533,23.81755,91.272697
4,Agartala,Tripura,2023-05,,749.904747,63.553585,0.015649,,212.567998,0.652385,23.81755,91.272697


In [2]:
# Parse the YearMonth labels into timestamps once, then derive the calendar
# year and the full month name from the parsed values. The PM2.5 column is
# renamed to mark it as the ground-truth measurement.
year_month = pd.to_datetime(gt_df['YearMonth'])
gt_df = gt_df.assign(
    YearMonth=year_month,
    year=year_month.dt.year,
    month=year_month.dt.strftime('%B'),
).rename(columns={'PM2.5': 'ground_truth_pm25'})

In [3]:
# Preview the first five rows after the datetime / rename step.
gt_df.head(n=5)

Unnamed: 0,city,state,YearMonth,AT,BP,ground_truth_pm25,RF,VWS,WD,WS,latitude,longitude,year,month
0,Agartala,Tripura,2023-01-01,,750.0,196.040103,0.04416,,195.440729,0.480669,23.81755,91.272697,2023,January
1,Agartala,Tripura,2023-02-01,,750.0,170.874875,0.000263,,245.93652,0.6869,23.81755,91.272697,2023,February
2,Agartala,Tripura,2023-03-01,,750.0,119.490881,0.009865,,190.55045,0.605829,23.81755,91.272697,2023,March
3,Agartala,Tripura,2023-04-01,,749.961354,94.205356,0.027917,,205.8346,0.61533,23.81755,91.272697,2023,April
4,Agartala,Tripura,2023-05-01,,749.904747,63.553585,0.015649,,212.567998,0.652385,23.81755,91.272697,2023,May


In [7]:
# Distinct-city count per state, ordered from most to fewest cities.
cities_by_state = gt_df.groupby('state')['city']
state_city_counts = cities_by_state.nunique().sort_values(ascending=False)

# The ten states at the top of that ranking.
top_10_states = list(state_city_counts.index[:10])

# Accumulates one {'state': ..., 'city': ...} record per sampled city.
sampled_cities_list = list()

def sample_cities(df, state, n, rng=None):
    """Randomly pick up to ``n`` distinct cities that belong to ``state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``'state'`` and ``'city'``.
    state : object
        Value compared against ``df['state']`` to select the candidate rows.
    n : int
        Requested number of cities. It is capped at the number of distinct
        cities found for ``state``, so asking for more than exist returns
        all of them.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` controls the draw exactly as before.

    Returns
    -------
    numpy.ndarray
        The sampled city values, drawn without replacement.
    """
    cities = df.loc[df['state'] == state, 'city'].unique()
    n = min(n, len(cities))
    # Both the legacy module-level function and Generator/RandomState objects
    # expose a compatible ``choice(a, size=..., replace=...)`` signature.
    choose = np.random.choice if rng is None else rng.choice
    return choose(cities, size=n, replace=False)

# Seed NumPy's global random state (the source sample_cities draws from) so
# the same cities are selected on every Restart & Run All; without this the
# exported subset changes from run to run.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

# States with the most cities contribute up to three cities each.
for state in top_10_states:
    sampled = sample_cities(gt_df, state, 3)
    sampled_cities_list.extend({'state': state, 'city': city} for city in sampled)

# Every other state contributes two cities, or its single city if it has
# only one. A set gives constant-time membership checks.
top_10_lookup = set(top_10_states)
remaining_states = [s for s in state_city_counts.index if s not in top_10_lookup]
for state in remaining_states:
    city_count = state_city_counts[state]
    n_sample = 2 if city_count >= 2 else 1
    sampled = sample_cities(gt_df, state, n_sample)
    sampled_cities_list.extend({'state': state, 'city': city} for city in sampled)

# Explicit columns keep the frame well-formed (and mergeable on state/city)
# even when nothing was sampled.
sampled_cities_df = pd.DataFrame(sampled_cities_list, columns=['state', 'city'])

# Count (state, city) rows rather than unique city names: two states can
# share a city name, and counting names alone would under-report the sample.
print(f"Total sampled cities: {len(sampled_cities_df)}")
print(sampled_cities_df.head())

# Keep only the ground-truth rows whose (state, city) pair was sampled.
filtered_gt_df = pd.merge(gt_df, sampled_cities_df, how='inner', on=['state', 'city'])

print(f"Filtered ground truth data shape: {filtered_gt_df.shape}")

# Column layout used for the exported subset.
original_cols = [
    'city', 'state', 'YearMonth',
    'AT', 'BP', 'PM2.5', 'RF', 'VWS', 'WD', 'WS',
    'latitude', 'longitude',
]

filtered_gt_df = (
    filtered_gt_df
    # Restore the 'PM2.5' name if the column is called 'ground_truth_pm25'.
    .rename(columns={'ground_truth_pm25': 'PM2.5'})
    # Normalise YearMonth to 'YYYY-MM' text, whether it currently holds
    # strings or timestamps.
    .assign(YearMonth=lambda frame: pd.to_datetime(frame['YearMonth']).dt.strftime('%Y-%m'))
    # Select and order the export columns.
    [original_cols]
)


Total sampled cities: 55
       state            city
0    Haryana      Mandikhera
1    Haryana            Jind
2    Haryana     Kurukshetra
3  Karnataka       Mangalore
4  Karnataka  Chikkaballapur
Filtered ground truth data shape: (660, 12)


In [9]:
# Preview the first five rows of the sampled subset.
filtered_gt_df.head(n=5)

Unnamed: 0,city,state,YearMonth,AT,BP,PM2.5,RF,VWS,WD,WS,latitude,longitude
0,Agartala,Tripura,2023-01,,750.0,196.040103,0.04416,,195.440729,0.480669,23.81755,91.272697
1,Agartala,Tripura,2023-02,,750.0,170.874875,0.000263,,245.93652,0.6869,23.81755,91.272697
2,Agartala,Tripura,2023-03,,750.0,119.490881,0.009865,,190.55045,0.605829,23.81755,91.272697
3,Agartala,Tripura,2023-04,,749.961354,94.205356,0.027917,,205.8346,0.61533,23.81755,91.272697
4,Agartala,Tripura,2023-05,,749.904747,63.553585,0.015649,,212.567998,0.652385,23.81755,91.272697


In [10]:
# Write the sampled subset to disk; index=False leaves the row index out of
# the file.
subset_csv_path = "CitySubset_GroundTruth.csv"
filtered_gt_df.to_csv(subset_csv_path, index=False)