In [8]:
import wikipedia
import pandas as pd
from tqdm import tqdm
from rapidfuzz import process, fuzz
from scipy.stats import spearmanr

In [2]:
df = pd.read_csv("AirQuality/Dataset/Ground_Truth_2023_Final.csv")
df.columns

Index(['city', 'state', 'YearMonth', 'AT', 'BP', 'PM2.5', 'RF', 'VWS', 'WD',
       'WS', 'latitude', 'longitude'],
      dtype='object')

### Wikipedia

In [3]:
unique_cities = df[["city", "state"]].drop_duplicates().reset_index(drop=True)

wikipedia.set_lang("en")

def city_has_wikipedia_page(city, state):
    search_terms = [city.strip(), f"{city.strip()}, {state.strip()}"]
    
    for term in search_terms:
        try:
            page = wikipedia.page(term, auto_suggest=False)
            return True
        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
            continue
        except:
            continue
    return False

tqdm.pandas()
unique_cities["has_wikipedia"] = unique_cities.progress_apply(
    lambda row: city_has_wikipedia_page(row["city"], row["state"]),
    axis=1
)




  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 201/201 [03:49<00:00,  1.14s/it]


In [4]:
unique_cities

Unnamed: 0,city,state,has_wikipedia
0,Agartala,Tripura,True
1,Agra,Uttar Pradesh,True
2,Ahmedabad,Gujarat,True
3,Aizawl,Mizoram,True
4,Ajmer,Rajasthan,True
...,...,...,...
196,Vijayawada,Andhra Pradesh,True
197,Visakhapatnam,Andhra Pradesh,True
198,Vrindavan,Uttar Pradesh,True
199,Yadgir,Karnataka,True


In [5]:
unique_cities.loc[
    (unique_cities["city"].str.lower() == "puducherry".lower()) &
    (unique_cities["state"].str.lower() == "puducherry".lower()),
    "has_wikipedia"
] = True

In [6]:
missing_wiki_cities = unique_cities[unique_cities["has_wikipedia"] == False][["city", "state"]].drop_duplicates().reset_index(drop=True)

missing_wiki_cities

Unnamed: 0,city,state
0,Byrnihat,Assam
1,Chhal,Chhattisgarh
2,Kunjemura,Chhattisgarh
3,Mandikhera,Haryana
4,Manguraha,Bihar
5,Suakati,Odisha
6,Tumidih,Chhattisgarh


#### Wiki page length

In [11]:
unique_cities = df[["city", "state"]].drop_duplicates().reset_index(drop=True)

wikipedia.set_lang("en")

from rapidfuzz import process, fuzz

def get_wikipedia_info(city, state):
    search_terms = [city.strip(), f"{city.strip()}, {state.strip()}"]
    
    # Step 1: Try direct page fetch
    for term in search_terms:
        try:
            page = wikipedia.page(term, auto_suggest=False)
            return pd.Series([True, len(page.content)])
        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
            continue
        except:
            continue
    
    # Step 2: Try fuzzy matching from search results
    try:
        search_results = wikipedia.search(city)
        if search_results:
            # Pick the most similar result to city name
            best_match, score, _ = process.extractOne(city, search_results, scorer=fuzz.token_sort_ratio)
            if score > 70:  # threshold can be tuned
                try:
                    page = wikipedia.page(best_match, auto_suggest=True)
                    return pd.Series([True, len(page.content)])
                except:
                    pass
    except:
        pass

    return pd.Series([False, 0])


# Apply function
tqdm.pandas()
unique_cities[["has_wikipedia", "wiki_len"]] = unique_cities.progress_apply(
    lambda row: get_wikipedia_info(row["city"], row["state"]),
    axis=1
)



  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 201/201 [07:18<00:00,  2.18s/it]


In [14]:
missing_wiki_cities = unique_cities[unique_cities["has_wikipedia"] == False][["city", "state"]].drop_duplicates().reset_index(drop=True)

missing_wiki_cities

Unnamed: 0,city,state
0,Byrnihat,Assam
1,Chhal,Chhattisgarh
2,Kunjemura,Chhattisgarh
3,Mandikhera,Haryana
4,Manguraha,Bihar
5,Suakati,Odisha
6,Tumidih,Chhattisgarh


### News Article

In [16]:
file_path = "AirQuality/RQ2/Dataset/News_articles_dataset.csv.gz"
news_df = pd.read_csv(file_path)

print(news_df.columns)
print(news_df.shape)

Index(['media', 'date', 'url', 'heading', 'content', 'other.author',
       'other.top_image', 'other.category', 'city', 'year', 'state',
       'district', 'matches'],
      dtype='object')
(17374, 13)


In [20]:
news_df.columns

Index(['media', 'date', 'url', 'heading', 'content', 'other.author',
       'other.top_image', 'other.category', 'city', 'year', 'state',
       'district', 'matches'],
      dtype='object')

In [24]:
news_df['city'] = news_df['city'].str.lower().str.strip()
unique_cities['city'] = unique_cities['city'].str.lower().str.strip()

news_city_counts = news_df['city'].value_counts().to_dict()
unique_news_cities = list(news_city_counts.keys())

def get_best_match(city_name):
    result = process.extractOne(city_name, unique_news_cities, score_cutoff=80)
    if result is not None:
        match, score, _ = result  
        return match, news_city_counts.get(match, 0)
    else:
        return None, 0

unique_cities[['matched_news_city', 'media_count']] = unique_cities['city'].apply(
    lambda x: pd.Series(get_best_match(x))
)


In [22]:
news_df['city'] = news_df['city'].str.lower().str.strip()
news_df['state'] = news_df['state'].str.lower().str.strip()
unique_cities['city'] = unique_cities['city'].str.lower().str.strip()
unique_cities['state'] = unique_cities['state'].str.lower().str.strip()

news_city_state_counts = (
    news_df.groupby(['city', 'state'])
    .size()
    .to_dict()
)

available_city_state_pairs = list(news_city_state_counts.keys())

def get_best_match(city, state):
    result = process.extractOne(
        (city, state), 
        available_city_state_pairs, 
        scorer=fuzz.token_sort_ratio, 
        score_cutoff=80
    )
    if result:
        match, score, _ = result
        return f"{match[0]}, {match[1]}", news_city_state_counts.get(match, 0)
    else:
        return None, 0

unique_cities[['matched_news_city', 'media_count']] = unique_cities.apply(
    lambda row: pd.Series(get_best_match(row['city'], row['state'])),
    axis=1
)

In [23]:
unique_cities.columns

Index(['city', 'state', 'has_wikipedia', 'wiki_len', 'matched_news_city',
       'media_count'],
      dtype='object')

In [24]:
unique_cities.head()

Unnamed: 0,city,state,has_wikipedia,wiki_len,matched_news_city,media_count
0,agartala,tripura,True,30333,"agartala, tripura",1.0
1,agra,uttar pradesh,True,51925,"agra, uttar pradesh",88.0
2,ahmedabad,gujarat,True,45827,"ahmedabad, gujarat",239.0
3,aizawl,mizoram,True,15347,"aizawl, mizoram",1.0
4,ajmer,rajasthan,True,16162,"ajmer, rajasthan",3.0


In [28]:
unique_cities.to_csv("Wiki_News_Data.csv",index=False)

In [26]:
df_unique_cities = df.drop_duplicates(subset=["city", "state"])

media_count_stats = unique_cities["media_count"].value_counts().reset_index()
media_count_stats.columns = ["media_count", "num_cities"]
media_count_stats = media_count_stats.sort_values("media_count").reset_index(drop=True)
print(media_count_stats)


    media_count  num_cities
0           0.0         133
1           1.0          12
2           2.0           7
3           3.0           3
4           4.0           1
5           5.0           4
6           7.0           2
7          12.0           1
8          21.0           1
9          26.0           1
10         30.0           1
11         32.0           1
12         39.0           2
13         41.0           2
14         43.0           1
15         44.0           1
16         46.0           1
17         51.0           2
18         56.0           1
19         68.0           1
20         74.0           1
21         86.0           2
22         88.0           1
23         94.0           1
24        121.0           1
25        160.0           1
26        204.0           1
27        205.0           1
28        239.0           1
29        317.0           1
30        320.0           1
31        368.0           1
32        386.0           1
33        396.0           1
34        471.0     

In [27]:
print(f"Min: {unique_cities['media_count'].min()}, Max: {unique_cities['media_count'].max()}")

Min: 0.0, Max: 5285.0


In [27]:
unique_cities.to_csv("Wiki_News.csv",index=False)