*Made By [Adnan](https://linktr.ee/adnaaaen)*

# ***Data Preprocessing***

In [1]:
import pandas as pd
from category_encoders import BinaryEncoder
from sklearn.preprocessing import StandardScaler
import joblib

from warnings import filterwarnings
import os

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# imported libraries — confirm that blanket suppression is intended.
filterwarnings("ignore")

In [2]:
# Single source of truth for the input path (it was previously repeated).
cleaned_csv_path = "../data/cleaned/cleaned.csv"

# Fail early with a specific, readable error when the input is missing.
# FileNotFoundError is a subclass of Exception, so anything that caught the
# previous generic Exception keeps working.
if not os.path.exists(cleaned_csv_path):
    raise FileNotFoundError(
        f"'{cleaned_csv_path}' not found: you need to download the dataset first, "
        "see the data/ folder in the project root"
    )

# `datetime` is parsed while reading because the feature-engineering cells
# use the `.dt` accessor on it.
df = pd.read_csv(cleaned_csv_path, parse_dates=["datetime"])
df.head()

Unnamed: 0,city,datetime,pm2.5,pm10,no,no2,n_ox,nh3,co,so2,o3,benzene,toluene,aqi
0,Ahmedabad,2015-01-29 09:00:00,4.400235,4.532815,0.883768,3.131137,3.075005,2.824944,0.883768,3.504355,3.865141,0.0,0.0,5.666427
1,Ahmedabad,2015-01-29 10:00:00,4.486387,4.532815,1.108563,3.038313,3.145014,2.824944,1.108563,4.381401,3.130263,0.0,0.0,5.204007
2,Ahmedabad,2015-01-29 11:00:00,4.516667,4.532815,1.000632,3.049747,3.140698,2.824944,1.000632,4.573267,4.229312,0.0,0.0,5.209486
3,Ahmedabad,2015-01-29 12:00:00,4.437107,4.532815,0.559616,2.853593,2.861057,2.824944,0.559616,3.826683,4.49892,0.0,0.0,5.209486
4,Ahmedabad,2015-01-29 13:00:00,4.239887,4.532815,0.457425,2.689886,2.60269,2.824944,0.457425,3.571503,4.768564,0.0,0.0,5.192957


### ***Feature Engineering***

**Split the `datetime` variable into `day`, `month`, `year`, `day_of_week` and `hour`**

In [3]:
"""extract from exisitng feature"""
df["day"] = df["datetime"].dt.day
df["month"] = df["datetime"].dt.month
df["year"] = df["datetime"].dt.year
df["day_of_week"] = df["datetime"].dt.weekday
df["hour"] = df["datetime"].dt.hour

In [4]:
""" new feature
5: sat
6: sun
if the day is 5 or 6 : that is a weekend
"""
df["is_weekend"] = df["day_of_week"].isin([5,6]).astype(int)

In [5]:
""" new feature

- Winter (December - February) : 0
- Summer (March - May)         : 1
- Monsoon (June - September)   : 2
- Autumn (October - November)  : 3
"""

def set_season(month: int):
    if month in [12, 1, 2]:
        return 0  # Winter
    elif month in [3, 4, 5]:
        return 1  # Summer
    elif month in [6, 7, 8, 9]:
        return 2  # Monsoon
    elif month in [10, 11]:
        return 3  # Autumn

# Pass every month number through set_season to obtain its season code.
df["season"] = df["month"].map(set_season)

In [6]:
# The raw timestamp is no longer needed once its components have been extracted.
df = df.drop(columns=["datetime"])

In [7]:
# Preview the frame after feature engineering: the new calendar columns are
# present and the raw `datetime` column is gone.
df.head()

Unnamed: 0,city,pm2.5,pm10,no,no2,n_ox,nh3,co,so2,o3,benzene,toluene,aqi,day,month,year,day_of_week,hour,is_weekend,season
0,Ahmedabad,4.400235,4.532815,0.883768,3.131137,3.075005,2.824944,0.883768,3.504355,3.865141,0.0,0.0,5.666427,29,1,2015,3,9,0,0
1,Ahmedabad,4.486387,4.532815,1.108563,3.038313,3.145014,2.824944,1.108563,4.381401,3.130263,0.0,0.0,5.204007,29,1,2015,3,10,0,0
2,Ahmedabad,4.516667,4.532815,1.000632,3.049747,3.140698,2.824944,1.000632,4.573267,4.229312,0.0,0.0,5.209486,29,1,2015,3,11,0,0
3,Ahmedabad,4.437107,4.532815,0.559616,2.853593,2.861057,2.824944,0.559616,3.826683,4.49892,0.0,0.0,5.209486,29,1,2015,3,12,0,0
4,Ahmedabad,4.239887,4.532815,0.457425,2.689886,2.60269,2.824944,0.457425,3.571503,4.768564,0.0,0.0,5.192957,29,1,2015,3,13,0,0


### ***Categorical Encoding (Binary)***

In [8]:
# Report how many distinct cities the data contains.
print(f"Cardinality of 'city' : {df['city'].nunique()}")
# Author's rule of thumb: a cardinality between 10 and 50 is handled with binary encoding.

Cardinality of 'city' : 26


In [9]:
"""binary-encode the `city` column; the remaining columns are carried over as they are"""
# NOTE(review): the encoder is fitted on the full frame, target `aqi` included.
# Presumably the saved encoder will expect the same set of columns at
# prediction time — confirm against the inference code.
binary_encoder = BinaryEncoder(cols=["city"])
encoded_df = binary_encoder.fit_transform(df)
encoded_df.head()

Unnamed: 0,city_0,city_1,city_2,city_3,city_4,pm2.5,pm10,no,no2,n_ox,...,benzene,toluene,aqi,day,month,year,day_of_week,hour,is_weekend,season
0,0,0,0,0,1,4.400235,4.532815,0.883768,3.131137,3.075005,...,0.0,0.0,5.666427,29,1,2015,3,9,0,0
1,0,0,0,0,1,4.486387,4.532815,1.108563,3.038313,3.145014,...,0.0,0.0,5.204007,29,1,2015,3,10,0,0
2,0,0,0,0,1,4.516667,4.532815,1.000632,3.049747,3.140698,...,0.0,0.0,5.209486,29,1,2015,3,11,0,0
3,0,0,0,0,1,4.437107,4.532815,0.559616,2.853593,2.861057,...,0.0,0.0,5.209486,29,1,2015,3,12,0,0
4,0,0,0,0,1,4.239887,4.532815,0.457425,2.689886,2.60269,...,0.0,0.0,5.192957,29,1,2015,3,13,0,0


In [10]:
# List the columns after encoding: `city` is replaced by `city_0` ... `city_4`.
encoded_df.columns

Index(['city_0', 'city_1', 'city_2', 'city_3', 'city_4', 'pm2.5', 'pm10', 'no',
       'no2', 'n_ox', 'nh3', 'co', 'so2', 'o3', 'benzene', 'toluene', 'aqi',
       'day', 'month', 'year', 'day_of_week', 'hour', 'is_weekend', 'season'],
      dtype='object')

### ***Standardization***

In [11]:
"""feature : target split"""

X = encoded_df.drop(["aqi"], axis=1)
Y = encoded_df["aqi"]

In [12]:
"""standardizing over min max scaling data bcz , all feature follow a normal distribution"""

standard_scaler = StandardScaler()
standardized_x = pd.DataFrame(standard_scaler.fit_transform(X), columns=X.columns)
standardized_x.head()

Unnamed: 0,city_0,city_1,city_2,city_3,city_4,pm2.5,pm10,no,no2,n_ox,...,o3,benzene,toluene,day,month,year,day_of_week,hour,is_weekend,season
0,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.644657,0.050355,-1.393542,0.08605,0.033354,...,0.701922,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,-0.361599,-0.632988,-1.329244
1,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.746076,0.050355,-1.170187,-0.022855,0.105121,...,-0.127273,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,-0.216941,-0.632988,-1.329244
2,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.781722,0.050355,-1.277426,-0.00944,0.100697,...,1.112832,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,-0.072283,-0.632988,-1.329244
3,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.688063,0.050355,-1.715616,-0.239575,-0.185967,...,1.417043,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,0.072375,-0.632988,-1.329244
4,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.455894,0.050355,-1.817152,-0.43164,-0.450823,...,1.721293,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,0.217033,-0.632988,-1.329244


In [13]:
"""concat the standardized feature + target variable"""

preprocessed_df = pd.concat([standardized_x, Y], axis=1)
preprocessed_df.head()

Unnamed: 0,city_0,city_1,city_2,city_3,city_4,pm2.5,pm10,no,no2,n_ox,...,benzene,toluene,day,month,year,day_of_week,hour,is_weekend,season,aqi
0,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.644657,0.050355,-1.393542,0.08605,0.033354,...,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,-0.361599,-0.632988,-1.329244,5.666427
1,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.746076,0.050355,-1.170187,-0.022855,0.105121,...,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,-0.216941,-0.632988,-1.329244,5.204007
2,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.781722,0.050355,-1.277426,-0.00944,0.100697,...,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,-0.072283,-0.632988,-1.329244,5.209486
3,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.688063,0.050355,-1.715616,-0.239575,-0.185967,...,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,0.072375,-0.632988,-1.329244,5.209486
4,-0.815813,-0.925448,-0.990663,-0.847216,0.76235,0.455894,0.050355,-1.817152,-0.43164,-0.450823,...,-1.117163,-1.479028,1.501441,-1.527203,-2.093452,-0.002554,0.217033,-0.632988,-1.329244,5.192957


In [14]:
"""save label-encoder & standard-scaler for future prediction"""

if not os.path.exists("../model/encoders/"):
    os.mkdir("../model/encoders")
joblib.dump(binary_encoder, "../model/encoders/binary_encoder.joblib")
joblib.dump(standard_scaler, "../model/encoders/standard_scaler.joblib")

['../model/encoders/standard_scaler.joblib']

In [15]:
"""save preprocessed dataset"""

if not os.path.exists("../data/preprocessed"):
    os.mkdir("../data/preprocessed")
    
preprocessed_df.to_csv("../data/preprocessed/preprocessed.csv", index=False)
print("cleaned dataset saved successfully")

cleaned dataset saved successfully


### ***Next: Model Building***