In [1]:
import os
os.getcwd()

'/Users/cy/Desktop/IS-477-Project/code'

In [2]:
import pandas as pd

# ===== LOAD DATA =====
df = pd.read_csv("/Users/cy/Desktop/IS-477-Project/data/processed/chicago_pm25_weather_daily.csv")

# ===== BASIC INFO =====
print("Shape:", df.shape)
print("\n--- INFO ---")
df.info()

print("\n--- MISSING VALUES ---")
print(df.isna().sum())

# ===== DUPLICATES =====
print("\n--- DUPLICATE ROWS ---")
dup_all = df[df.duplicated(keep=False)]
display(dup_all)

print("\n--- DUPLICATE DATES ---")
dup_dates = df[df.duplicated(subset=["date"], keep=False)]
display(dup_dates)

# ===== LOGICAL CHECKS =====

# Convert date
df["date"] = pd.to_datetime(df["date"], errors="coerce")
bad_dates = df[df["date"].isna()]
print("\n--- BAD DATES ---")
display(bad_dates)

# Humidity 0–100
humidity_bad = df[(df["humidity_mean"] < 0) | (df["humidity_mean"] > 100)]
print("\n--- BAD HUMIDITY ---")
display(humidity_bad)

# Wind direction 0–360
wind_bad = df[(df["wind_dir_dominant"] < 0) | (df["wind_dir_dominant"] > 360)]
print("\n--- BAD WIND DIRECTION ---")
display(wind_bad)

# PM2.5 ordering
pm25_bad = df[
    ~((df["pm25_min"] <= df["pm25_mean"]) & (df["pm25_mean"] <= df["pm25_max"]))
]
print("\n--- BAD PM25 ORDERING ---")
display(pm25_bad)

# Temperature ordering
temp_bad = df[df["temp_min"] > df["temp_max"]]
print("\n--- BAD TEMPERATURE (min > max) ---")
display(temp_bad)

# Negative values in columns that must be >= 0
cols_no_negative = ["pm25_min","pm25_mean","pm25_max","precip_sum","n_measurements"]
negative_issues = {}

for col in cols_no_negative:
    bad = df[df[col] < 0] if col in df.columns else pd.DataFrame()
    negative_issues[col] = len(bad)
    if not bad.empty:
        print(f"\n--- NEGATIVE VALUES IN {col} ---")
        display(bad)

# ===== SUMMARY =====
summary = {
    "missing_total": df.isna().sum().sum(),
    "bad_dates": len(bad_dates),
    "bad_humidity": len(humidity_bad),
    "bad_wind_direction": len(wind_bad),
    "bad_pm25_ordering": len(pm25_bad),
    "bad_temperature_order": len(temp_bad),
    "negative_values": negative_issues,
    "duplicate_rows": len(dup_all),
    "duplicate_dates": len(dup_dates),
}

print("\n=== DATA QUALITY SUMMARY ===")
summary


Shape: (128, 12)

--- INFO ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               128 non-null    object 
 1   pm25_mean          128 non-null    float64
 2   pm25_min           128 non-null    float64
 3   pm25_max           128 non-null    float64
 4   n_measurements     128 non-null    int64  
 5   temp_max           128 non-null    float64
 6   temp_min           128 non-null    float64
 7   precip_sum         128 non-null    float64
 8   shortwave_rad_sum  128 non-null    float64
 9   humidity_mean      128 non-null    int64  
 10  wind_speed_max     128 non-null    float64
 11  wind_dir_dominant  128 non-null    int64  
dtypes: float64(8), int64(3), object(1)
memory usage: 12.1+ KB

--- MISSING VALUES ---
date                 0
pm25_mean            0
pm25_min             0
pm25_max             0
n_measurement

Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



--- DUPLICATE DATES ---


Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



--- BAD DATES ---


Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



--- BAD HUMIDITY ---


Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



--- BAD WIND DIRECTION ---


Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



--- BAD PM25 ORDERING ---


Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



--- BAD TEMPERATURE (min > max) ---


Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant



=== DATA QUALITY SUMMARY ===


{'missing_total': np.int64(0),
 'bad_dates': 0,
 'bad_humidity': 0,
 'bad_wind_direction': 0,
 'bad_pm25_ordering': 0,
 'bad_temperature_order': 0,
 'negative_values': {'pm25_min': 0,
  'pm25_mean': 0,
  'pm25_max': 0,
  'precip_sum': 0,
  'n_measurements': 0},
 'duplicate_rows': 0,
 'duplicate_dates': 0}

In [3]:
import pandas as pd

# ===== LOAD RAW DATA =====
df = pd.read_csv("/Users/cy/Desktop/IS-477-Project/data/processed/chicago_pm25_weather_daily.csv")

# Make a copy for cleaning
df_clean = df.copy()

# ===== FIX COLUMN TYPES =====
df_clean["date"] = pd.to_datetime(df_clean["date"], errors="coerce")

# ===== REMOVE IMPOSSIBLE VALUES =====

# 1. Drop bad humidity (must be 0–100)
df_clean = df_clean[df_clean["humidity_mean"].between(0, 100)]

# 2. Drop bad wind direction (must be 0–360)
df_clean = df_clean[df_clean["wind_dir_dominant"].between(0, 360)]

# 3. Set negative values in these columns to NaN
cols_no_negative = ["pm25_min", "pm25_mean", "pm25_max", "precip_sum", "n_measurements"]
for col in cols_no_negative:
    df_clean.loc[df_clean[col] < 0, col] = pd.NA

# ===== FIX PM2.5 ORDERING =====
# If pm25_min > pm25_mean or pm25_mean > pm25_max → set group to NA
pm25_bad = ~(
    (df_clean["pm25_min"] <= df_clean["pm25_mean"]) &
    (df_clean["pm25_mean"] <= df_clean["pm25_max"])
)
df_clean.loc[pm25_bad, ["pm25_min", "pm25_mean", "pm25_max"]] = pd.NA

# ===== FIX TEMPERATURE ORDERING =====
# If min > max → swap
temp_swap = df_clean["temp_min"] > df_clean["temp_max"]
df_clean.loc[temp_swap, ["temp_min", "temp_max"]] = df_clean.loc[
    temp_swap, ["temp_max", "temp_min"]
].to_numpy()

# ===== REMOVE DUPLICATE DATES =====
df_clean = df_clean.drop_duplicates(subset=["date"], keep="first")

# ===== RESET INDEX =====
df_clean = df_clean.reset_index(drop=True)

# ===== SAVE CLEANED CSV =====
output_path = "/Users/cy/Desktop/IS-477-Project/data/clean/chicago_pm25_weather_daily_clean.csv"
df_clean.to_csv(output_path, index=False)

output_path


'/Users/cy/Desktop/IS-477-Project/data/clean/chicago_pm25_weather_daily_clean.csv'

In [4]:
df_clean

Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant
0,2023-01-03,6.700,6.5,6.9,2.0,12.7,4.2,13.1,2.13,98,23.6,112
1,2023-01-06,9.625,8.2,11.2,4.0,0.9,-3.7,0.0,5.71,79,18.5,264
2,2023-01-09,20.250,20.0,20.5,2.0,5.7,-4.9,0.0,8.79,80,20.2,215
3,2023-01-12,11.225,10.7,11.6,4.0,6.4,1.4,0.3,2.64,87,37.9,351
4,2023-01-15,7.850,7.8,7.9,2.0,5.1,-3.7,0.0,7.99,69,30.5,166
...,...,...,...,...,...,...,...,...,...,...,...,...
123,2023-12-19,6.000,6.0,6.0,1.0,0.4,-7.6,0.0,7.98,51,22.8,220
124,2023-12-20,10.400,9.1,11.6,4.0,5.5,-2.5,0.0,7.91,68,18.7,170
125,2023-12-23,16.450,14.8,18.1,2.0,8.9,6.2,0.2,2.34,98,11.4,121
126,2023-12-26,4.275,3.7,5.3,4.0,11.3,0.4,0.3,2.87,84,22.7,210


In [5]:
df

Unnamed: 0,date,pm25_mean,pm25_min,pm25_max,n_measurements,temp_max,temp_min,precip_sum,shortwave_rad_sum,humidity_mean,wind_speed_max,wind_dir_dominant
0,2023-01-03,6.700,6.5,6.9,2,12.7,4.2,13.1,2.13,98,23.6,112
1,2023-01-06,9.625,8.2,11.2,4,0.9,-3.7,0.0,5.71,79,18.5,264
2,2023-01-09,20.250,20.0,20.5,2,5.7,-4.9,0.0,8.79,80,20.2,215
3,2023-01-12,11.225,10.7,11.6,4,6.4,1.4,0.3,2.64,87,37.9,351
4,2023-01-15,7.850,7.8,7.9,2,5.1,-3.7,0.0,7.99,69,30.5,166
...,...,...,...,...,...,...,...,...,...,...,...,...
123,2023-12-19,6.000,6.0,6.0,1,0.4,-7.6,0.0,7.98,51,22.8,220
124,2023-12-20,10.400,9.1,11.6,4,5.5,-2.5,0.0,7.91,68,18.7,170
125,2023-12-23,16.450,14.8,18.1,2,8.9,6.2,0.2,2.34,98,11.4,121
126,2023-12-26,4.275,3.7,5.3,4,11.3,0.4,0.3,2.87,84,22.7,210
