In [None]:
import pandas as pd

# Clean the raw Air Quality CSV: build a single timestamp column, coerce the
# sensor columns to numbers, normalise the column names, fill gaps by linear
# interpolation and write the result to a new CSV. The cleaned frame is left in `df`.

# --- Configuration ----------------------------------------------------------
RAW_CSV_PATH = 'C:/airqual-project/AirQualityUCI.csv'
CLEAN_CSV_PATH = 'C:/airqual-project/AirQualityUCI_cleaned.csv'
MISSING_SENTINEL = -200   # value treated as "missing" in the raw readings
MIN_NON_NULL_CELLS = 5    # a row is kept only if it has MORE than this many non-null cells
COLUMN_NAMES = {
    'CO(GT)': 'co_gt',
    'PT08.S1(CO)': 'pt08_s1_co',
    'C6H6(GT)': 'c6h6_gt',
    'PT08.S2(NMHC)': 'pt08_s2_nmhc',
    'NOx(GT)': 'nox_gt',
    'PT08.S3(NOx)': 'pt08_s3_nox',
    'NO2(GT)': 'no2_gt',
    'PT08.S4(NO2)': 'pt08_s4_no2',
    'PT08.S5(O3)': 'pt08_s5_o3',
    'T': 't', 'RH': 'rh', 'AH': 'ah',
}

# --- Load -------------------------------------------------------------------
# usecols keeps only the first 15 columns, which drops the unnamed trailing ones.
raw = pd.read_csv(RAW_CSV_PATH, sep=';', decimal=',', usecols=range(15))

# Rows that are empty in every column carry no information. They are removed
# before interpolation so that limit_direction='both' cannot fill them with
# invented readings.
raw = raw.dropna(how='all')

# --- Convert date & time to datetime ----------------------------------------
# The explicit format fully determines day/month order, so dayfirst is not needed.
timestamps = pd.to_datetime(
    raw['Date'] + ' ' + raw['Time'], format='%d/%m/%Y %H.%M.%S'
).rename('Datetime')

# --- Standardize types, handle missing values, standardize names -------------
# NMHC(GT) is dropped due to its high number of missing values.
sensor_cols = [name for name in raw.columns if name not in ('Date', 'Time', 'NMHC(GT)')]
readings = (
    raw[sensor_cols]
    .apply(pd.to_numeric, errors='coerce')
    # A float NaN keeps every column float64. Replacing with pd.NA turns the
    # columns into object dtype, which linear interpolation cannot fill.
    .replace(MISSING_SENTINEL, float('nan'))
    .interpolate(method='linear', limit_direction='both')
    .rename(columns=COLUMN_NAMES)
)

# Datetime first, followed by the sensor columns in their original order.
df = pd.concat([timestamps, readings], axis=1)
df = df[df.notna().sum(axis=1) > MIN_NON_NULL_CELLS]

# --- Save and inspect -------------------------------------------------------
df.to_csv(CLEAN_CSV_PATH, index=False)

df.info()
df.head()  # last expression of the cell so the preview is actually rendered

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Datetime      9471 non-null   datetime64[ns]
 1   co_gt         7674 non-null   object        
 2   pt08_s1_co    8991 non-null   object        
 3   c6h6_gt       8991 non-null   object        
 4   pt08_s2_nmhc  8991 non-null   object        
 5   nox_gt        7718 non-null   object        
 6   pt08_s3_nox   8991 non-null   object        
 7   no2_gt        7715 non-null   object        
 8   pt08_s4_no2   8991 non-null   object        
 9   pt08_s5_o3    8991 non-null   object        
 10  t             8991 non-null   object        
 11  rh            8991 non-null   object        
 12  ah            8991 non-null   object        
dtypes: datetime64[ns](1), object(12)
memory usage: 962.0+ KB


  df.interpolate(method='linear', limit_direction='both', inplace=True)
