#  **Urban Air Pollution**

## Data Understanding 


Load Libraries 

In [242]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, recall_score, precision_recall_fscore_support, f1_score, mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE

from sklearn.decomposition import PCA 

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [243]:
def read_data(path):

    """A simple function to load in the dataset"""

    data = pd.read_csv(path)
    return data

df = read_data(r"C:\Users\user\Documents\urban air pollution\Train.csv")
df.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,010Q650 X 2020-01-04,2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96,16.4,33.400002,...,49.839714,-78.342701,34.296977,,,,,,,
3,010Q650 X 2020-01-05,2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,...,29.181258,-73.896588,30.545446,,,,,,,
4,010Q650 X 2020-01-06,2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,...,0.797294,-68.61248,26.899694,,,,,,,


In [244]:
def check_info_shape(data):

    """A simple function to check the information and shape of the dataset"""

    print(f"The dataset has {data.shape[0]} rows")
    print("****************************************************************************")
    print("****************************************************************************")
    print(f"The dataset has {data.shape[1]} columns")
    print("****************************************************************************")
    print("****************************************************************************")
    print(data.info())

check_info_shape(df)    

The dataset has 30557 rows
****************************************************************************
****************************************************************************
The dataset has 82 columns
****************************************************************************
****************************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30557 entries, 0 to 30556
Data columns (total 82 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Place_ID X Date                                      30557 non-null  object 
 1   Date                                                 30557 non-null  object 
 2   Place_ID                                             30557 non-null  object 
 3   target                                               30557 non-null  float64
 4   target_min                      

In [245]:
def check_columns(data):

    """A simple function to check the columns of the dataset"""

    columns = data.columns
    return columns

check_columns(df)

Index(['Place_ID X Date', 'Date', 'Place_ID', 'target', 'target_min',
       'target_max', 'target_variance', 'target_count',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground',
       'L3_NO2_NO2_column_number_density',
       'L3_NO2_NO2_slant_column_number_density',
       'L3_NO2_absorbing_aerosol_index', 'L3_NO2_cloud_fraction',
       'L3_NO2_sensor_altitude', 'L3_NO2_sensor_azimuth_angle',
       'L3_NO2_sensor_zenith_angle', 'L3_NO2_solar_azimuth_angle',
       'L3_NO2_solar_zenith_angle',
       'L3_NO2_stratospheric_NO2_column_number_density',
       'L3_NO2_tropopause_pressure',
       'L3_NO2_tropospheric_NO2_column_number_density',
       'L3_O3_O3_column_number_density', 'L3_O3_O3_effective_temperature',
       'L3_O3_cloud_fraction', 'L3_O3_sensor_azimuth_angle',
   

In [246]:
def statistical_analysis(data):

    """A simple function to check the statistical distribution of the numeric column of the dataset """

    analysis = data.describe()
    return analysis

statistical_analysis(df)

Unnamed: 0,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
count,30557.0,30557.0,30557.0,30557.0,30557.0,30557.0,30557.0,30557.0,30557.0,30557.0,...,23320.0,23320.0,23320.0,5792.0,5792.0,5792.0,5792.0,5792.0,5792.0,5792.0
mean,61.148045,29.025866,117.992234,7983.756,125.831135,15.302326,70.552747,0.006004,9.321342,0.416886,...,35.590916,-123.697777,46.533951,923.231949,1711.793613,0.016227,1.254703,13.84904,-69.098594,23.10063
std,46.861309,33.119775,100.417713,48630.9,146.581856,10.688573,18.807884,0.003787,9.343226,2.70799,...,18.955228,71.916036,14.594267,929.633988,1741.299304,0.027016,55.10125,18.004375,84.702355,24.78635
min,1.0,1.0,1.0,0.0,2.0,0.420044,5.128572,0.000139,-34.647879,-15.559646,...,0.0,-179.88063,0.0,0.0,0.0,0.0,-105.367363,0.0,-179.947422,0.0
25%,25.0,5.0,60.0,1064.92,44.0,7.666667,58.600002,0.003403,3.123071,-1.097864,...,19.451524,-165.882624,36.693094,0.0,0.0,0.0,0.0,0.0,-161.726937,0.0
50%,50.0,15.0,91.0,2395.35,72.0,12.2,74.099998,0.004912,8.478424,0.222092,...,37.918838,-156.637162,47.44501,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80.0,44.0,155.0,5882.55,150.0,19.9,85.450001,0.007562,16.201563,1.772925,...,52.270055,-118.453598,57.438181,1861.674119,3393.541633,0.023829,62.245728,27.412303,0.0,47.090635
max,815.0,438.0,999.0,1841490.0,1552.0,72.599998,100.0,0.021615,37.437921,17.955124,...,66.111289,179.776125,79.631711,2112.522949,6478.550544,0.210483,77.355232,59.97271,179.813344,69.992363


In [247]:
def data_types(data):

    """A simple function to check the the column data types"""

    print(f"The dataset has {len(data.select_dtypes(include='number').columns)} numeric columns")
    print(f"and {len(data.select_dtypes(include='object').columns)} categorical columns")
    print("********************************************************************")
    print("*********************************************************************")
    print(f"Categorical Columns {data.select_dtypes(include='object').columns}")
    print("********************************************************************")
    print("*********************************************************************")
    print(f"Numerical Columns{ data.select_dtypes(include='number').columns}")
    

data_types(df)
    

The dataset has 79 numeric columns
and 3 categorical columns
********************************************************************
*********************************************************************
Categorical Columns Index(['Place_ID X Date', 'Date', 'Place_ID'], dtype='object')
********************************************************************
*********************************************************************
Numerical ColumnsIndex(['target', 'target_min', 'target_max', 'target_variance', 'target_count',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground',
       'L3_NO2_NO2_column_number_density',
       'L3_NO2_NO2_slant_column_number_density',
       'L3_NO2_absorbing_aerosol_index', 'L3_NO2_cloud_fraction',
       'L3_NO2_sensor_altitude', 'L3_NO2_sensor_azimuth_angle',

## Data Cleaning 

Duplicates

In [248]:
duplicates = []

def check_duplicates(data):

    """A function to check the percentage of duplicates in the dataset """

    for i in data.duplicated():
        duplicates.append(i)
    duplicates_set = set(duplicates)
    if(len(duplicates_set) == 1):
        print("The Dataset Has No Duplicates")

    else:
        duplicates_percenatge = np.round(((sum(duplicates)/len(data)) * 100), 2)

check_duplicates(df)

The Dataset Has No Duplicates


Missing Values

In [249]:
def missing_values(data):

    """A function that displays a DataFrame for Missing Values"""

    missing_values = data.isnull().sum().sort_values(ascending=False)
    missing_val_percent = ((data.isnull().sum()/len(data)).sort_values(ascending=False) * 100)

    missing_df = pd.DataFrame({"missing_values" : missing_values, "missing values(%)" : missing_val_percent})
    return missing_df[missing_df['missing values(%)'] > 0]

missing_values(df)

Unnamed: 0,missing_values,missing values(%)
L3_CH4_solar_zenith_angle,24765,81.045260
L3_CH4_solar_azimuth_angle,24765,81.045260
L3_CH4_sensor_zenith_angle,24765,81.045260
L3_CH4_sensor_azimuth_angle,24765,81.045260
L3_CH4_aerosol_optical_depth,24765,81.045260
...,...,...
L3_AER_AI_sensor_azimuth_angle,197,0.644697
L3_AER_AI_absorbing_aerosol_index,197,0.644697
L3_AER_AI_solar_zenith_angle,197,0.644697
L3_AER_AI_solar_azimuth_angle,197,0.644697


Outliers

In [250]:
def check_outliers(data):

    """A function to check for outliers in the numeric columns using Z-Score"""

    df_num = df.select_dtypes(include='number')
    mean = np.mean(df_num)
    std = np.std(df_num)

    threshold = 3
    outliers = {}
    for col in df_num:
        z_scores = (df_num[col] - mean[col]) / std[col]
        num_outliers = len(z_scores[np.abs(z_scores) > threshold])
        if num_outliers > 0:
            outliers[col] = num_outliers
        

    return outliers 

outliers = check_outliers(df)
print("The Number of Outliers in Each Column:")
for col, num_outliers in outliers.items():
    print(col, ":", num_outliers)

The Number of Outliers in Each Column:
target : 255
target_min : 702
target_max : 584
target_variance : 130
target_count : 619
precipitable_water_entire_atmosphere : 466
relative_humidity_2m_above_ground : 67
specific_humidity_2m_above_ground : 346
temperature_2m_above_ground : 143
u_component_of_wind_10m_above_ground : 398
v_component_of_wind_10m_above_ground : 320
L3_NO2_NO2_column_number_density : 354
L3_NO2_NO2_slant_column_number_density : 389
L3_NO2_absorbing_aerosol_index : 101
L3_NO2_sensor_altitude : 2611
L3_NO2_solar_azimuth_angle : 280
L3_NO2_tropospheric_NO2_column_number_density : 321
L3_O3_O3_column_number_density : 655
L3_O3_O3_effective_temperature : 653
L3_O3_solar_azimuth_angle : 79
L3_O3_solar_zenith_angle : 653
L3_CO_CO_column_number_density : 654
L3_CO_H2O_column_number_density : 414
L3_CO_sensor_altitude : 330
L3_CO_solar_azimuth_angle : 634
L3_CO_solar_zenith_angle : 330
L3_HCHO_HCHO_slant_column_number_density : 322
L3_HCHO_solar_azimuth_angle : 506
L3_HCHO_sola

If you study the columns keenly from when we started data understanding, you would notice that this is a time series dataset. Previewing the Date column further makes this argument valid. I will then go ahead and proceed with treating the missing values with a forward fill, treating outliers and then changing the data column to a date data type


Treatment for Missing Values

In [251]:
def missing_values_treatment(data):

    """A simple function to forward fill the missing values"""

    data = data.ffill(inplace=True)
    return data

missing_values_treatment(df)

>>> The foward fill method by default only works on the column. Therefore if we have missing values at the beginning of a row on a column this means that the next row values will be filled with the None values instead of filling them with a data type. Let's exam  if the dataset is fully rid of misssing values 

In [252]:
""" calling the missing values function"""

missing_values(df)

Unnamed: 0,missing_values,missing values(%)
L3_CLOUD_surface_albedo,1,0.003273
L3_CLOUD_cloud_top_pressure,1,0.003273
L3_CLOUD_cloud_top_height,1,0.003273
L3_CLOUD_cloud_optical_depth,1,0.003273
L3_CLOUD_cloud_base_pressure,1,0.003273
L3_CLOUD_cloud_base_height,1,0.003273


See! There is still some missing values on some of the rows, but then these are just few missing values that are less than 10% of the dataset. I will go ahead and remove these rows.

In [253]:
def drop_rows(data, columns):
    
    """A simple function to remove the rows of columns that still have missing values """
    
    new_data = data.dropna(subset=columns, inplace=True)
    return new_data

col = ['L3_CLOUD_surface_albedo', "L3_CLOUD_cloud_top_pressure", "L3_CLOUD_cloud_top_height", "L3_CLOUD_cloud_optical_depth", "L3_CLOUD_cloud_base_pressure", "L3_CLOUD_cloud_base_height"]
drop_rows(df, col)
    

In [254]:
def missing_values_count(data):

    """A simple fuction that checks if the dataset is rid of missing values"""

    miss_val_count = data.isnull().any().any()
    return miss_val_count

missing_values_count(df)

False

Great ! No missing values 

In [255]:
def check_shape(data):

    """A simple function to check the shape of the dataset"""

    shape = data.shape
    return shape

check_shape(df)

(30556, 82)

Treatment for Outliers

In [256]:
def impute_outliers_with_mean(data):

    """A function that removes the outliers on the dataset"""

    outliers = check_outliers(data)

    for col in outliers:
        col_mean = np.mean(data[col])
        data.loc[np.abs((data[col] - np.mean(data[col])) / np.std(data[col])) <=3]
    return data

imputed_data = impute_outliers_with_mean(df)

In [257]:
check_info_shape(imputed_data)

The dataset has 30556 rows
****************************************************************************
****************************************************************************
The dataset has 82 columns
****************************************************************************
****************************************************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30556 entries, 1 to 30556
Data columns (total 82 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Place_ID X Date                                      30556 non-null  object 
 1   Date                                                 30556 non-null  object 
 2   Place_ID                                             30556 non-null  object 
 3   target                                               30556 non-null  float64
 4   target_min                      

*Note:If you decide to remove the outliers you will remain with only 16,000 rows. That kind of data is not sufficient for modelling*

Dropping categorical columns

In [258]:
def drop_column(columns):
    drop_col = imputed_data.drop(columns=columns, inplace=True)
    return drop_col

categorical_columns = ['Place_ID X Date','Place_ID']
drop_column(categorical_columns)

In [259]:
check_shape(imputed_data)

(30556, 80)

Treating whitespaces, changing columns to lower case 

In [260]:
def lower_case_remove_whitespace(data):
    cleaned_col = [col.lower().strip() for col in data.columns]
    cleaned_data = data.copy()
    cleaned_data.columns = cleaned_col
    
    return cleaned_data

cleaned_data = lower_case_remove_whitespace(imputed_data) 
cleaned_data.head()

Unnamed: 0,date,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,...,l3_so2_sensor_zenith_angle,l3_so2_solar_azimuth_angle,l3_so2_solar_zenith_angle,l3_ch4_ch4_column_volume_mixing_ratio_dry_air,l3_ch4_aerosol_height,l3_ch4_aerosol_optical_depth,l3_ch4_sensor_azimuth_angle,l3_ch4_sensor_zenith_angle,l3_ch4_solar_azimuth_angle,l3_ch4_solar_zenith_angle
1,2020-01-03,39.0,25.0,63.0,1319.85,91,14.6,48.799999,0.00839,22.546533,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,2020-01-04,24.0,8.0,56.0,1181.96,96,16.4,33.400002,0.0075,27.03103,...,49.839714,-78.342701,34.296977,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
3,2020-01-05,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,0.00391,23.971857,...,29.181258,-73.896588,30.545446,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
4,2020-01-06,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,0.00535,16.816309,...,0.797294,-68.61248,26.899694,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
5,2020-01-07,28.0,10.0,52.0,1053.22,94,14.6,42.200001,0.005862,19.17489,...,30.605176,-62.134264,23.419991,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652


In [261]:
index_col = cleaned_data['date']
type(index_col)

pandas.core.series.Series

In [262]:
"""The above function will be used later on """
#cleaned_data['date'] = pd.to_datetime(cleaned_data['date'])
#cleaned_data.set_index('date', inplace=True)
#cleaned_data.head()

'The above function will be used later on '

## Preparing Data for PCA 

In [263]:
prep_df = cleaned_data.copy()

In [264]:
def drop_column(columns):
    drop_col = prep_df.drop(columns=columns, inplace=True)
    return drop_col

date_col = ['date']
drop_column(date_col)

In [267]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(prep_df)
data_scaled = pd.DataFrame(data_scaled, columns=prep_df.columns)

## Performing PCA on the Dataset

In [268]:
pca_transformer = PCA(n_components = 0.95, random_state=42)
principalComponents = pca_transformer.fit_transform(data_scaled)
principalComponents

array([[ 2.95037457,  4.01134177, -2.90791247, ...,  0.16221596,
         0.53122644, -0.005717  ],
       [ 1.8184507 ,  4.0420532 , -1.07876123, ...,  0.13606242,
         0.13538183, -0.12207662],
       [ 1.95306171,  2.52696938,  0.40799857, ...,  0.07220045,
         0.03876442,  0.44349021],
       ...,
       [-0.02541203, -3.38933324,  2.56544887, ...,  0.29896557,
        -0.48575703, -0.03360722],
       [ 1.02139422, -4.42679147, -0.40670544, ...,  0.32559509,
        -0.45220972,  0.40104491],
       [ 0.88072227, -4.3328982 , -0.30985829, ...,  0.32188109,
        -0.25149154,  0.23410508]])

In [269]:
def check_shape_array(array):
    array_shape = array.shape
    return array_shape

check_shape_array(principalComponents)

(30556, 37)

Creating a new DataFrame with only the 36 most important components

In [271]:
def dataframe_from_principal_components(array):
    principalComponents_df = pd.DataFrame(data = principalComponents,
                          columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8',
                                     'PC9', 'PC10', 'PC11', 'PC12','PC13', 'PC14', 'PC15', 'PC16',
                                     'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24',
                                     'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32',
                                     'PC33', 'PC34', 'PC35', 'PC36', 'PC37'])
    


    principalcomponents_df = principalComponents_df
    return principalcomponents_df

dataframe_from_principal_components(principalComponents)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37
0,2.950375,4.011342,-2.907912,-0.931740,-2.850972,-0.074208,-0.571760,-2.364258,-0.349347,1.147705,...,-0.424894,0.786062,-0.554931,0.484752,0.340740,0.145227,0.053892,0.162216,0.531226,-0.005717
1,1.818451,4.042053,-1.078761,-3.487418,-1.905837,-1.177331,-0.109734,-2.947998,-0.781784,1.316660,...,-0.477013,0.539662,-0.438008,0.783537,0.329707,0.421217,0.149385,0.136062,0.135382,-0.122077
2,1.953062,2.526969,0.407999,-4.086601,-2.111408,-1.478389,-0.499246,-2.532457,0.526319,2.006824,...,0.017240,0.266871,-0.210868,0.587509,0.098352,0.844194,0.138015,0.072200,0.038764,0.443490
3,4.089103,-0.094329,2.783056,1.307712,-2.941191,-2.047277,1.263416,-1.290520,2.502486,0.859979,...,1.733241,-0.513405,-0.156258,-0.107853,0.877014,0.699365,-0.879798,-0.323122,0.541954,-0.082859
4,4.069051,1.318132,0.188144,2.880350,-2.879077,-1.803864,0.727421,-2.236287,0.627682,1.472639,...,0.775111,0.140368,0.239823,0.289540,0.675361,0.553914,-0.500853,-0.067592,0.327989,0.147392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30551,-1.314822,3.803900,-1.055498,-0.689257,-4.626329,0.739933,-0.886405,-2.397353,-0.092888,0.519778,...,-0.308754,-1.162371,0.382701,0.572415,-1.172555,-0.387747,-0.097571,0.208134,-0.238933,-0.329629
30552,-0.918464,0.438689,-0.383119,-3.444514,-3.319896,-0.536779,-0.683769,-1.123447,-0.908702,0.242400,...,0.123470,0.504037,-0.525551,0.089177,0.489587,0.153840,-0.549268,-0.047903,-0.335556,-0.214437
30553,-0.025412,-3.389333,2.565449,-2.856501,-0.602996,-3.922372,3.923365,-1.267542,-0.340958,-1.462445,...,0.496372,0.532909,0.276490,-0.109827,0.661875,0.290244,-0.048875,0.298966,-0.485757,-0.033607
30554,1.021394,-4.426791,-0.406705,0.635815,-2.544622,-0.515667,-2.125935,-1.075841,0.443567,0.071633,...,-0.859592,0.552867,-0.020428,0.459876,0.415279,0.615338,-0.261487,0.325595,-0.452210,0.401045
