### Do cities with Wikipedia or news presence receive more accurate predictions?

In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
import numpy as np
from scipy.stats import spearmanr

In [2]:
# Per-city metadata: Wikipedia presence/length and matched news coverage counts.
wiki_news_df = pd.read_csv("AirQuality/RQ2/Dataset/Wiki_News_Data.csv")
# Cell output: the column names, as a quick schema check.
wiki_news_df.columns

Index(['city', 'state', 'has_wikipedia', 'wiki_len', 'matched_news_city',
       'media_count'],
      dtype='object')

In [3]:
# Render the metadata frame (the notebook truncates the middle rows).
wiki_news_df

Unnamed: 0,city,state,has_wikipedia,wiki_len,matched_news_city,media_count
0,agartala,tripura,True,30333,"agartala, tripura",1.0
1,agra,uttar pradesh,True,51925,"agra, uttar pradesh",88.0
2,ahmedabad,gujarat,True,45827,"ahmedabad, gujarat",239.0
3,aizawl,mizoram,True,15347,"aizawl, mizoram",1.0
4,ajmer,rajasthan,True,16162,"ajmer, rajasthan",3.0
...,...,...,...,...,...,...
196,vijayawada,andhra pradesh,True,30209,"vijayawada, andhra pradesh",86.0
197,visakhapatnam,andhra pradesh,True,42919,"visakhapatnam, andhra pradesh",204.0
198,vrindavan,uttar pradesh,True,8042,,0.0
199,yadgir,karnataka,True,5184,,0.0


In [4]:
# Observed measurements, one row per city and YearMonth; PM2.5 is the target column.
gt_df = pd.read_csv("AirQuality/Dataset/Ground_Truth_2023_Final.csv")
# Preview the first rows.
gt_df.head()

Unnamed: 0,city,state,YearMonth,AT,BP,PM2.5,RF,VWS,WD,WS,latitude,longitude
0,Agartala,Tripura,2023-01,,750.0,196.040103,0.04416,,195.440729,0.480669,23.81755,91.272697
1,Agartala,Tripura,2023-02,,750.0,170.874875,0.000263,,245.93652,0.6869,23.81755,91.272697
2,Agartala,Tripura,2023-03,,750.0,119.490881,0.009865,,190.55045,0.605829,23.81755,91.272697
3,Agartala,Tripura,2023-04,,749.961354,94.205356,0.027917,,205.8346,0.61533,23.81755,91.272697
4,Agartala,Tripura,2023-05,,749.904747,63.553585,0.015649,,212.567998,0.652385,23.81755,91.272697


In [5]:
# Parse the "YYYY-MM" strings once and derive both calendar fields from the
# same parsed series, so they line up with the `year` / `month` columns of the
# prediction files.
year_month = pd.to_datetime(gt_df['YearMonth'])

# Keep only the merge keys and the target, renamed so it cannot be confused
# with the models' own `pm2.5` column after merging.
# Note: `YearMonth` is dropped here, so this cell cannot be re-run without
# reloading `gt_df` first.
gt_df = (
    gt_df
    .assign(year=year_month.dt.year, month=year_month.dt.month_name())
    .loc[:, ['city', 'state', 'year', 'month', 'PM2.5']]
    .rename(columns={'PM2.5': 'ground_truth_pm25'})
)

gt_df.head()

Unnamed: 0,city,state,year,month,ground_truth_pm25
0,Agartala,Tripura,2023,January,196.040103
1,Agartala,Tripura,2023,February,170.874875
2,Agartala,Tripura,2023,March,119.490881
3,Agartala,Tripura,2023,April,94.205356
4,Agartala,Tripura,2023,May,63.553585


In [6]:
# One predictions file per model.
gemma_9b_df = pd.read_csv("AirQuality/RQ1/Dataset/gemma2_9b_it_2023_predictions.csv")
gemma_27b_df = pd.read_csv("AirQuality/RQ1/Dataset/gemma2_27b_it_2023_predictions.csv")
llama_8b_df = pd.read_csv("AirQuality/RQ1/Dataset/llama3_8b_it_2023_predictions.csv")
llama_70b_df = pd.read_csv("AirQuality/RQ1/Dataset/llama3_70b_it_2023_predictions.csv")
llama1_70b_df = pd.read_csv("AirQuality/RQ1/Dataset/llama3_1_70b_it_2023_predictions.csv")

# Print every frame's columns to confirm they all share one schema.
for predictions in (gemma_9b_df, gemma_27b_df, llama_8b_df, llama_70b_df, llama1_70b_df):
    print(predictions.columns)

Index(['city', 'state', 'year', 'month', 'model', 'pm2.5'], dtype='object')
Index(['city', 'state', 'year', 'month', 'model', 'pm2.5'], dtype='object')
Index(['city', 'state', 'year', 'month', 'model', 'pm2.5'], dtype='object')
Index(['city', 'state', 'year', 'month', 'model', 'pm2.5'], dtype='object')
Index(['city', 'state', 'year', 'month', 'model', 'pm2.5'], dtype='object')


In [7]:
# Preview one predictions frame; `pm2.5` holds the model's predicted value.
gemma_9b_df.head()

Unnamed: 0,city,state,year,month,model,pm2.5
0,Agartala,Tripura,2023,January,google/gemma-2-9b-it,38.7
1,Agartala,Tripura,2023,February,google/gemma-2-9b-it,35.2
2,Agartala,Tripura,2023,March,google/gemma-2-9b-it,35.2
3,Agartala,Tripura,2023,April,google/gemma-2-9b-it,38.7
4,Agartala,Tripura,2023,May,google/gemma-2-9b-it,38.7


In [8]:
def normalize_city_names(df,key):
    """Trim and lower-case the string column ``key`` of ``df``.

    The column is overwritten on ``df`` itself (the caller's frame is
    modified) and that same frame is returned, so the call can be used either
    for its side effect or in an assignment. Missing values stay missing.
    """
    trimmed = df[key].str.strip()
    df[key] = trimmed.str.lower()
    return df

# Display label -> predictions frame for every model being compared.
# NOTE(review): 'llama-3.1-8b' and 'llama-3.3-70b' are backed by frames loaded
# from llama3_8b_it / llama3_70b_it files; confirm against each frame's `model`
# column that these labels name the right model versions.
model_dfs = {
    'gemma-2-9b': gemma_9b_df,
    'gemma-2-27b': gemma_27b_df,
    'llama-3.1-8b': llama_8b_df,
    'llama-3.1-70b': llama1_70b_df,
    'llama-3.3-70b': llama_70b_df
}

# Trim and lower-case the join keys in every frame so the later merges on
# ['city', 'state'] are not defeated by casing or stray whitespace.
for join_col in ('city', 'state'):
    wiki_news_df = normalize_city_names(wiki_news_df, join_col)
    gt_df = normalize_city_names(gt_df, join_col)

for label, frame in model_dfs.items():
    for join_col in ('city', 'state'):
        frame = normalize_city_names(frame, join_col)
    # Re-binding an existing key is safe while iterating over the dict.
    model_dfs[label] = frame

In [9]:
# Wikipedia-length buckets. `pd.cut` intervals are closed on the right, so
# (-1, 0] isolates the zero-length "No Wiki" cities and a length of exactly
# 5000 belongs to '1001-5000'. The last edge is 70000: any longer article would
# be left unbinned (NaN) and silently drop out of the counts below.
bins = [-1, 0, 1000, 5000, 10000, 30000, 70000]
labels = ['No Wiki (0)', '1-1000', '1001-5000', '5001-10000', '10001-30000', '30001-70000']

# Attach the bucket label to every city row.
wiki_news_df['wiki_len_bin'] = pd.cut(wiki_news_df['wiki_len'], bins=bins, labels=labels)

# Distinct cities per bucket; observed=False keeps empty buckets in the table.
bin_counts = (
    wiki_news_df
    .groupby('wiki_len_bin', observed=False)['city']
    .nunique()
    .reset_index(name='city_count')
)

print(bin_counts)

  wiki_len_bin  city_count
0  No Wiki (0)           7
1       1-1000           6
2    1001-5000          39
3   5001-10000          34
4  10001-30000          72
5  30001-70000          42


In [7]:
def compute_metrics(df):
    """Summarise PM2.5 prediction error for one group of samples.

    Rows where either the observation or the prediction is missing are
    ignored, so every returned figure is based on the same set of rows
    (previously such rows were skipped by the means but still counted in
    ``Samples``, ``Cities`` and the percentage denominators).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``ground_truth_pm25`` (observed value), ``pm2.5``
        (predicted value) and ``city``. An empty frame is accepted.

    Returns
    -------
    dict
        ``MAE``       mean absolute error over all scored rows.
        ``MAEtol``    mean absolute error over only the rows whose error
                      exceeds 10% of the ground-truth value.
        ``MAEunder``  mean absolute error over rows predicted below the truth.
        ``MAEover``   mean absolute error over rows predicted above the truth.
        ``Over %`` / ``Under %``  share of scored rows that were over- /
                      under-predicted (exact matches count towards neither).
        ``Samples``   number of scored rows.
        ``Cities``    number of distinct ``city`` values among scored rows.

        When nothing can be scored, the six error/percentage entries are NaN
        and both counts are 0. A sub-metric is also NaN when no row falls
        into its subset.
    """
    metric_names = ('MAE', 'MAEtol', 'MAEunder', 'MAEover', 'Over %', 'Under %')

    # Guard first: a completely empty frame may not even have the columns.
    if not df.empty:
        df = df.dropna(subset=['ground_truth_pm25', 'pm2.5'])

    if df.empty:
        empty_result = {name: np.nan for name in metric_names}
        empty_result['Samples'] = 0
        empty_result['Cities'] = 0
        return empty_result

    gt = df['ground_truth_pm25']
    pred = df['pm2.5']

    abs_error = (gt - pred).abs()
    # Errors within 10% of the observed value are treated as "close enough"
    # and excluded from MAEtol.
    tolerance = 0.10 * gt

    under_mask = pred < gt
    over_mask = pred > gt
    total = len(df)

    return {
        'MAE': abs_error.mean(),
        'MAEtol': abs_error[abs_error > tolerance].mean(),
        'MAEunder': abs_error[under_mask].mean(),
        'MAEover': abs_error[over_mask].mean(),
        'Over %': 100 * over_mask.sum() / total,
        'Under %': 100 * under_mask.sum() / total,
        'Samples': total,
        'Cities': df['city'].nunique()
    }

In [8]:
rows = []

for model_name, model_df in model_dfs.items():
    # Pair every prediction with its observation, attach the city metadata,
    # and keep only rows where both values are present.
    merged = (
        model_df
        .merge(gt_df, on=['city', 'state', 'year', 'month'], how='inner')
        .merge(wiki_news_df, on=['city', 'state'], how='left')
        .dropna(subset=['ground_truth_pm25', 'pm2.5'])
    )

    print(merged.shape)

    # Anything that is not explicitly True (False, or NaN for a city with no
    # metadata row after the left merge) counts as "without Wikipedia".
    has_wiki = merged['has_wikipedia'] == True
    lacks_wiki = ~has_wiki

    print("Wiki-present sample count:", has_wiki.sum())
    print("Wiki-absent sample count:", lacks_wiki.sum())

    wiki_df = merged[has_wiki]
    non_wiki_df = merged[lacks_wiki]

    wiki_metrics = compute_metrics(wiki_df)
    non_wiki_metrics = compute_metrics(non_wiki_df)

    # One wide row per model: both groups' metrics side by side.
    row = {'Model': model_name}
    row.update({f'With Wiki: {k}': v for k, v in wiki_metrics.items()})
    row.update({f'Without Wiki: {k}': v for k, v in non_wiki_metrics.items()})
    rows.append(row)

final_wiki_df = pd.DataFrame(rows)

(2397, 11)
Wiki-present sample count: 2324
Wiki-absent sample count: 73
(2412, 11)
Wiki-present sample count: 2328
Wiki-absent sample count: 84
(986, 11)
Wiki-present sample count: 952
Wiki-absent sample count: 34
(2320, 11)
Wiki-present sample count: 2251
Wiki-absent sample count: 69
(2412, 11)
Wiki-present sample count: 2328
Wiki-absent sample count: 84


In [9]:
# One row per model: metrics for cities with vs. without a Wikipedia page.
final_wiki_df

Unnamed: 0,Model,With Wiki: MAE,With Wiki: MAEtol,With Wiki: MAEunder,With Wiki: MAEover,With Wiki: Over %,With Wiki: Under %,With Wiki: Samples,With Wiki: Cities,Without Wiki: MAE,Without Wiki: MAEtol,Without Wiki: MAEunder,Without Wiki: MAEover,Without Wiki: Over %,Without Wiki: Under %,Without Wiki: Samples,Without Wiki: Cities
0,gemma-2-9b,26.659435,29.576775,31.44717,23.442352,59.810671,40.189329,2324,193,48.550916,55.020347,58.036477,41.14755,56.164384,43.835616,73,7
1,gemma-2-27b,34.222142,36.605789,26.456545,35.318461,87.628866,12.371134,2328,193,48.636154,49.700824,74.325924,42.117854,79.761905,20.238095,84,7
2,llama-3.1-8b,37.496826,39.113903,40.179013,9.414655,8.718487,91.281513,952,193,40.743714,41.948951,41.752174,7.464547,2.941176,97.058824,34,6
3,llama-3.1-70b,32.387812,35.806796,18.457815,36.730789,76.232785,23.767215,2251,192,42.252718,42.871304,31.894332,57.420354,40.57971,59.42029,69,7
4,llama-3.3-70b,46.029833,49.061084,19.800491,49.615876,87.972509,12.027491,2328,193,65.317629,66.072481,68.846888,64.729419,85.714286,14.285714,84,7


#### Wikipedia page content

In [22]:
rows = []

for model_name, model_df in model_dfs.items():
    # Predictions joined to observations and to each city's Wikipedia length.
    # Cities without a metadata row are treated as length 0.
    merged = (
        model_df
        .merge(gt_df, on=['city', 'state', 'year', 'month'], how='inner')
        .merge(wiki_news_df[['city', 'state', 'wiki_len']], on=['city', 'state'], how='left')
        .dropna(subset=['ground_truth_pm25', 'pm2.5'])
        .assign(wiki_len=lambda frame: frame['wiki_len'].fillna(0))
    )

    # NOTE(review): a length of exactly 5000 lands in the '>=5000' group here,
    # whereas the earlier `pd.cut` buckets put 5000 in '1001-5000'.
    is_short = merged['wiki_len'] < 5000
    is_long = merged['wiki_len'] >= 5000

    print(f"=== {model_name} ===")
    print("Group <5000 sample count:", is_short.sum())
    print("Group >=5000 sample count:", is_long.sum())

    low_wiki_df = merged[is_short]
    high_wiki_df = merged[is_long]

    low_metrics = compute_metrics(low_wiki_df)
    high_metrics = compute_metrics(high_wiki_df)

    # One wide row per model: short-page and long-page metrics side by side.
    row = {'Model': model_name}
    row.update({f'Wiki <5000: {k}': v for k, v in low_metrics.items()})
    row.update({f'Wiki >=5000: {k}': v for k, v in high_metrics.items()})
    rows.append(row)

wiki_len_compare_df = pd.DataFrame(rows)

wiki_len_compare_df

=== gemma-2-9b ===
Group <5000 sample count: 609
Group >=5000 sample count: 1788
=== gemma-2-27b ===
Group <5000 sample count: 624
Group >=5000 sample count: 1788
=== llama-3.1-8b ===
Group <5000 sample count: 249
Group >=5000 sample count: 737
=== llama-3.1-70b ===
Group <5000 sample count: 566
Group >=5000 sample count: 1754
=== llama-3.3-70b ===
Group <5000 sample count: 624
Group >=5000 sample count: 1788


Unnamed: 0,Model,Wiki <5000: MAE,Wiki <5000: MAEtol,Wiki <5000: MAEunder,Wiki <5000: MAEover,Wiki <5000: Over %,Wiki <5000: Under %,Wiki <5000: Samples,Wiki >=5000: MAE,Wiki >=5000: MAEtol,Wiki >=5000: MAEunder,Wiki >=5000: MAEover,Wiki >=5000: Over %,Wiki >=5000: Under %,Wiki >=5000: Samples
0,gemma-2-9b,29.324888,32.497021,36.832281,23.541576,56.486043,43.513957,609,26.645351,29.599925,30.625206,24.078765,60.794183,39.205817,1788
1,gemma-2-27b,35.720145,38.054445,36.519612,35.578353,84.935897,15.064103,624,34.376517,36.74475,25.830248,35.519994,88.199105,11.800895,1788
2,llama-3.1-8b,42.86312,43.885598,43.9233,6.211174,2.811245,97.188755,249,35.833579,37.593268,38.884765,9.680554,10.447761,89.552239,737
3,llama-3.1-70b,30.682162,32.981961,24.12953,33.518871,69.787986,30.212014,566,33.326282,37.060173,17.423331,38.100704,76.90992,23.09008,1754
4,llama-3.3-70b,46.773831,50.220289,29.581245,49.122545,87.980769,12.019231,624,46.676321,49.503232,19.132289,50.480939,87.863535,12.136465,1788


In [23]:
rows = []

for model_name, model_df in model_dfs.items():
    # Predictions joined to observations and to each city's news count.
    # Cities without a metadata row are treated as having zero articles.
    merged = (
        model_df
        .merge(gt_df, on=['city', 'state', 'year', 'month'], how='inner')
        .merge(wiki_news_df[['city', 'state', 'media_count']], on=['city', 'state'], how='left')
        .assign(media_count=lambda frame: frame['media_count'].fillna(0))
        .dropna(subset=['ground_truth_pm25', 'pm2.5'])
    )

    # NOTE(review): the split is at media_count > 1, so a city with exactly one
    # matched article is reported under "Without News" -- confirm that `> 1`
    # rather than `> 0` is the intended threshold.
    has_news = merged['media_count'] > 1
    lacks_news = merged['media_count'] <= 1

    with_news_df = merged[has_news]
    without_news_df = merged[lacks_news]

    with_news_metrics = compute_metrics(with_news_df)
    without_news_metrics = compute_metrics(without_news_df)

    # One wide row per model: both groups' metrics side by side.
    row = {'Model': model_name}
    row.update({f'With News: {k}': v for k, v in with_news_metrics.items()})
    row.update({f'Without News: {k}': v for k, v in without_news_metrics.items()})
    rows.append(row)

final_media_df = pd.DataFrame(rows)

In [24]:
# One row per model: metrics for the media_count > 1 group vs. the rest.
final_media_df

Unnamed: 0,Model,With News: MAE,With News: MAEtol,With News: MAEunder,With News: MAEover,With News: Over %,With News: Under %,With News: Samples,Without News: MAE,Without News: MAEtol,Without News: MAEunder,Without News: MAEover,Without News: Over %,Without News: Under %,Without News: Samples
0,gemma-2-9b,23.8838,26.614113,24.221485,23.692524,63.839286,36.160714,672,28.667148,31.77598,35.052564,24.059707,58.086957,41.913043,1725
1,gemma-2-27b,28.793681,31.405491,16.563957,30.238457,89.434524,10.565476,672,37.0145,39.202711,32.935833,37.648237,86.551724,13.448276,1740
2,llama-3.1-8b,29.12376,30.810455,32.319991,10.689685,14.776632,85.223368,291,41.161511,42.649278,43.238572,8.029864,5.899281,94.100719,695
3,llama-3.1-70b,29.044563,32.27111,14.40129,34.565798,72.619048,27.380952,672,34.164111,37.558848,21.767249,38.033196,76.213592,23.786408,1648
4,llama-3.3-70b,38.929322,42.038674,12.140372,43.448501,85.565476,14.434524,672,49.703234,52.572559,26.629149,52.615497,88.793103,11.206897,1740
