In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input tree,
# then print them one per line in the order os.walk yields them.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/air-quality/global_air_quality_data_10000_date_year_2022.csv
/kaggle/input/country-data/Countries.csv


In [18]:
import pandas as pd
import numpy as np

# Input CSV with the air-quality readings (read into `air` by the load cell).
AIR_PATH = "/kaggle/input/air-quality/global_air_quality_data_10000_date_year_2022.csv"
# Input CSV with the per-country, per-year indicators (read into `cty` by the load cell).
CTY_PATH = "/kaggle/input/country-data/Countries.csv"
# Kaggle's writable working directory; files written here are kept as notebook output.
OUT_DIR = "/kaggle/working/"


In [4]:
# Read both raw inputs.
air = pd.read_csv(AIR_PATH)
cty = pd.read_csv(CTY_PATH)

# Report the (rows, columns) shape of each frame.
for label, frame in (("AIR:", air), ("COUNTRIES:", cty)):
    print(label, frame.shape)

# Air-quality table: total number of missing cells and of fully duplicated rows.
air_null_cells = air.isna().sum().sum()
air_dup_rows = air.duplicated().sum()
print("\nAIR nulls:", air_null_cells)
print("AIR duplicates:", air_dup_rows)

# Countries table: rows that repeat an earlier (Country Name, Year) pair.
country_year_key = ["Country Name", "Year"]
key_repeats = cty.duplicated(subset=country_year_key).sum()
print("\nCOUNTRIES key duplicates (Country Name, Year):", key_repeats)

# Cell output: the ten countries columns with the most missing values.
cty_null_counts = cty.isna().sum()
cty_null_counts.sort_values(ascending=False).head(10)


AIR: (10000, 12)
COUNTRIES: (5106, 25)

AIR nulls: 0
AIR duplicates: 0

COUNTRIES key duplicates (Country Name, Year): 161


R&D                           1518
Unemployment                   667
Ease of Doing Business         598
Health Expenditure (% GDP)     575
Health Expenditure             575
Inflation Rate                 552
Export (% GDP)                 460
Net Trade                      460
Import                         460
Export                         460
dtype: int64

In [6]:
# Trim stray whitespace from the headers and from the two text key columns.
air.columns = [name.strip() for name in air.columns]
for text_col in ("City", "Country"):
    air[text_col] = air[text_col].astype(str).str.strip()

# Parse dates; unparseable values become NaT, and the assert stops the
# notebook if any row failed to parse.
air["Date"] = pd.to_datetime(air["Date"], errors="coerce")
unparsed_dates = air["Date"].isna().sum()
assert unparsed_dates == 0, "Date parsing failed for some rows"

# Calendar parts derived from the parsed date (nullable integers, plus the month name).
date_parts = air["Date"].dt
air["Year"] = date_parts.year.astype("Int64")
air["Month"] = date_parts.month.astype("Int64")
air["MonthName"] = date_parts.month_name()
air["Day"] = date_parts.day.astype("Int64")

def flag_out_of_range(series, lo=None, hi=None):
    """Return a boolean Series marking values that fall outside ``[lo, hi]``.

    Parameters
    ----------
    series : pandas.Series or array-like
        Values to check. They are converted with ``pd.to_numeric(...,
        errors="coerce")``, so anything that cannot be parsed as a number
        becomes missing.
    lo : number, optional
        Lower bound; values strictly below it are flagged. ``None`` disables
        the lower check.
    hi : number, optional
        Upper bound; values strictly above it are flagged. ``None`` disables
        the upper check.

    Returns
    -------
    pandas.Series
        Plain ``bool`` Series on the input's index (a default integer index
        when ``series`` is not a Series). Missing or unparseable values are
        never flagged, and the result never contains ``<NA>``.
    """
    # Wrapping in pd.Series lets lists/arrays through and guarantees an index.
    s = pd.to_numeric(pd.Series(series), errors="coerce")
    flag = pd.Series(False, index=s.index)
    if lo is not None:
        # Nullable dtypes compare to <NA>; treat "unknown" as "not out of range".
        flag = flag | (s < lo).fillna(False).astype(bool)
    if hi is not None:
        flag = flag | (s > hi).fillna(False).astype(bool)
    return flag

# Humidity is checked against the closed range [0, 100]; wind speed only has
# a lower bound of 0.
humidity_flag = flag_out_of_range(air["Humidity"], lo=0, hi=100)
wind_flag = flag_out_of_range(air["Wind Speed"], lo=0, hi=None)
air["Flag_Humidity_Range"] = humidity_flag
air["Flag_Wind_Negative"] = wind_flag

# Report how many rows each check flagged.
print("Humidity out of range:", int(humidity_flag.sum()))
print("Negative wind speed:", int(wind_flag.sum()))



Humidity out of range: 0
Negative wind speed: 0


In [7]:
# Per-city 90th percentile of PM2.5, indexed by city name.
p90 = air.groupby("City")["PM2.5"].quantile(0.90).rename("PM25_P90_City")

# Attach the threshold by lookup rather than merge: re-running the cell then
# simply overwrites the column. A repeated merge would produce suffixed
# PM25_P90_City_x / PM25_P90_City_y columns and break the comparison below.
air["PM25_P90_City"] = air["City"].map(p90)

# A reading is a "spike" when it is strictly above its own city's 90th percentile.
air["PM25_Spike"] = air["PM2.5"] > air["PM25_P90_City"]

air[["City", "PM2.5", "PM25_P90_City", "PM25_Spike"]].head()


Unnamed: 0,City,PM2.5,PM25_P90_City,PM25_Spike
0,Bangkok,86.57,138.142,False
1,Istanbul,50.63,136.585,False
2,Rio de Janeiro,130.21,134.43,False
3,Mumbai,119.7,138.613,False
4,Paris,55.2,130.136,False


In [8]:
# Trim stray whitespace from the headers.
cty.columns = [name.strip() for name in cty.columns]

# Trim the text columns but keep missing values missing. A bare astype(str)
# would turn NaN into the literal string "nan", hiding those cells from the
# per-row null count below and from any later isna() check.
# NOTE(review): pandas' default CSV parsing reads the literal text "NA" as
# missing; if any Country Code is "NA", confirm how the file was loaded.
for text_col in ("Country Name", "Country Code", "Continent Name"):
    present = cty[text_col].notna()
    cty[text_col] = cty[text_col].astype(str).str.strip().where(present)

# Year as a nullable integer; values that cannot be parsed become <NA>.
cty["Year"] = pd.to_numeric(cty["Year"], errors="coerce").astype("Int64")

# keep best row per (Country, Year): fewest nulls
# Sorting by the null count within each key puts the most complete row first,
# so keep="first" retains it. The helper column is dropped afterwards.
cty["_nulls"] = cty.isna().sum(axis=1)
cty = (cty.sort_values(["Country Name", "Year", "_nulls"])
          .drop_duplicates(subset=["Country Name", "Year"], keep="first")
          .drop(columns=["_nulls"]))

# The key must now be unique.
assert cty.duplicated(subset=["Country Name", "Year"]).sum() == 0

# missing flags (useful in report + Tableau)
# One boolean "Missing_<column>" flag per indicator, with spaces in the
# column name replaced by underscores.
for col in ["R&D", "Unemployment", "Inflation Rate", "Ease of Doing Business"]:
    if col in cty.columns:
        cty[f"Missing_{col.replace(' ', '_')}"] = cty[col].isna()

# Cell output: the ten columns with the most missing values after de-duplication.
cty.isna().sum().sort_values(ascending=False).head(10)


R&D                           1518
Unemployment                   667
Ease of Doing Business         598
Health Expenditure             575
Health Expenditure (% GDP)     575
Inflation Rate                 552
Import                         460
Import (% GDP)                 460
Net Trade                      460
Export                         460
dtype: int64