# Layer.ai Air Quality Prediction Challenge
# By Mohamed Eltayeb & Azer Ksouri

# Setup

In [None]:
!nvidia-smi

Fri Jul 15 08:33:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%capture
# Install Layer
!pip install -U layer -q
# Install Catboost
!pip install catboost --quiet

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error

# Layer package
import layer
from layer.decorators import dataset,model, pip_requirements

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', None)

# Login & register 

In [None]:
layer.login()

Please open the following link in your web browser. Once logged in, copy the code and paste it here.
https://app.layer.ai/oauth/authorize?response_type=code&code_challenge=nNGt1O8Nr6qcEmaFLq_8409194HI8IbfEHwOYFNUjbQ&code_challenge_method=S256&client_id=0STDdcnpK48P8A429EAAn93WNuLmViLR&redirect_uri=https://app.layer.ai/oauth/code&scope=offline_access&audience=https://app.layer.ai


In [None]:
layer.init("Air_Quality_Prediction_Challenge_Exp_Base")

## Define Functions

In [None]:
#Group Time Series Split
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

class GroupTimeSeriesSplit(_BaseKFold):
    """Time-series cross-validator that keeps every group in a single fold.

    Groups are ordered by their first appearance in ``groups``. Each split
    tests on one contiguous run of groups and trains on all the groups that
    come before that run, so no group is shared between train and test.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits; the groups are cut into ``n_splits + 1`` folds.
    max_train_size : int, default=None
        If set, only the last ``max_train_size`` training indices are kept.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    @staticmethod
    def _gather_indices(group_dict, group_labels):
        """Return the sorted sample indices belonging to ``group_labels``."""
        chunks = [group_dict[label] for label in group_labels]
        if not chunks:
            return np.empty(0, dtype=int)
        # Every sample index lives in exactly one group, so a single
        # concatenate + sort yields the sorted, duplicate-free index array.
        return np.sort(np.concatenate(chunks))

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : list of int
            The training set indices for that split, in ascending order.
        test : list of int
            The testing set indices for that split, in ascending order.
        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer groups than
            ``n_splits + 1`` folds.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1

        # Unique groups in order of first appearance (np.unique alone sorts).
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Map each group label to the positions of its samples.
        group_dict = {}
        for idx in range(n_samples):
            group_dict.setdefault(groups[idx], []).append(idx)

        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = self._gather_indices(
                group_dict, unique_groups[:group_test_start])
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                # Keep only the most recent max_train_size training samples.
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            test_array = self._gather_indices(
                group_dict,
                unique_groups[group_test_start:
                              group_test_start + group_test_size])
            yield train_array.tolist(), test_array.tolist()

In [None]:
#Plot the LGBM Features Importances
def plotImp(model, X , num = 20, fig_size = (40, 20)):
    """Plot, save and show a bar chart of the model's top feature importances.

    model    : fitted estimator exposing ``feature_importances_``.
    X        : frame whose ``columns`` name the features, in the same order.
    num      : how many of the highest-ranked features to draw.
    fig_size : matplotlib figure size.

    The figure is written to 'lgbm_importances-01.png' in the working directory.
    """
    importance_table = pd.DataFrame(
        {'Value': model.feature_importances_, 'Feature': X.columns}
    )
    plt.figure(figsize=fig_size)
    sns.set(font_scale = 5)  # NOTE: changes seaborn's global font scale
    ranked = importance_table.sort_values(by="Value", ascending=False)
    sns.barplot(x="Value", y="Feature", data=ranked[0:num])
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()

In [None]:
#Label Encoder
def label_enc(train_df, test_df, features):
    """Integer-encode the given columns consistently across train and test.

    For each column in ``features`` the encoder is fitted on the values of
    both frames stacked together, then applied to each frame. Both frames
    are modified in place and also returned as ``(train_df, test_df)``.
    The name of each column is printed as it is processed.
    """
    # Snapshot the raw values of all requested columns before any is replaced.
    combined = pd.concat([train_df[features], test_df[features]], axis=0)
    encoder = LabelEncoder()
    for col in features:
        print(col)
        encoder.fit(combined[col].values)
        for frame in (train_df, test_df):
            frame[col] = encoder.transform(frame[col])
    return train_df, test_df

# Read the training and testing data


In [None]:
train_df  = layer.get_dataset("zindi/air-quality/datasets/train").to_pandas()
test_df  = layer.get_dataset("zindi/air-quality/datasets/test").to_pandas()
sample_submission = layer.get_dataset("zindi/air-quality/datasets/sample_submission").to_pandas()

# Add The Time Features

In [None]:
# Put both frames in a deterministic (date, device) order before deriving
# any calendar or order-dependent feature.
train_df = train_df.sort_values(['date','device']).reset_index(drop=True) 
test_df = test_df.sort_values(['date','device']).reset_index(drop=True)

# The loop variable is deliberately not called `dataset`: that name is also
# imported from layer.decorators at the top of the notebook.
for frame in (train_df,test_df):
    frame['date'] = pd.to_datetime(frame['date'])
    frame['Day'] = frame.date.dt.day
    frame['Month'] = frame.date.dt.month
    frame['Year'] = frame.date.dt.year
    frame['DayOfWeek'] = frame.date.dt.dayofweek
    frame['DayOfYear'] = frame.date.dt.dayofyear
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the replacement (cast from UInt32 to a plain int).
    frame['Week'] = frame.date.dt.isocalendar().week.astype('int64')
    # From here on both frames are indexed by the parsed date.
    frame.set_index('date', inplace=True)

In [None]:
ID = test_df['ID']
test_df.drop('ID',inplace=True,axis=1)
train_df.drop('ID',inplace=True,axis=1)

# Exploratory data analysis

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
# Number of distinct values in every column of the training frame
# (all columns are listed, not only categorical ones).
cat_cols = train_df.columns
for col in cat_cols:
    print(col, train_df[col].nunique())

In [None]:
# Number of distinct values in every column of the testing frame
# (all columns are listed, not only categorical ones).
cat_cols = test_df.columns
for col in cat_cols:
    print(col, test_df[col].nunique())

# Data Preprocessing

# Missing Data 

In [None]:
#missing data percentage (Training)
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(60)

In [None]:
#missing data percentage (Testing)
total = test_df.isnull().sum().sort_values(ascending=False)
percent_1 = test_df.isnull().sum()/test_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

In [None]:
# Fill gaps with the next available row's value, then forward-fill whatever
# is still missing at the end of the frame.
# NOTE(review): the frames are ordered by (date, device) and no per-device
# grouping is applied here, so a filled value can come from a different
# device and from a later date — confirm this is intended.
train_df = train_df.bfill().ffill()
test_df = test_df.bfill().ffill()

# Feature Engineering

## - Lag Features

In [None]:
def LAG(data,LagFeature,shift=1,NewFeatures=None) :
    """Add a lagged copy of a column and its difference to that lag, in place.

    data        : DataFrame to modify.
    LagFeature  : name of the source column.
    shift       : number of rows to shift by.
    NewFeatures : ``[diff_column_name, lag_column_name]``. When omitted (or
                  empty) the names default to ``'<LagFeature>_diff_Lag<shift>'``
                  and ``'<LagFeature>_Lag<shift>'``.

    Returns None; the two columns are written into ``data``. The first
    ``shift`` rows of both new columns are NaN.
    """
    # A mutable list default would be shared between calls, and the old empty
    # default made the function unusable without an explicit argument.
    if not NewFeatures:
        NewFeatures = [f'{LagFeature}_diff_Lag{shift}', f'{LagFeature}_Lag{shift}']
    data[NewFeatures[0]]   = data[LagFeature]  - data[LagFeature].shift(shift)
    data[NewFeatures[1]]   = data[LagFeature].shift(shift)

# Columns that do not get lag features: calendar fields, the target, the
# weather summaries, the site coordinates and the device identifier.
_lag_excluded = ['Week','DayOfYear','DayOfWeek','Year','Month','Day','pm2_5','temp_mean','humidity','site_longitude','site_latitude','device']
num_feats = train_df.columns.drop(_lag_excluded)

# One-row lag and one-row difference for every remaining column, on both frames.
for feature in num_feats:
    _lag_names = [f'{feature}_diff_Lag1', f'{feature}_Lag1']
    LAG(train_df, LagFeature=f'{feature}', shift=1, NewFeatures=_lag_names)
    LAG(test_df, LagFeature=f'{feature}', shift=1, NewFeatures=_lag_names)

## - Combination Between Time Features

In [None]:
# Build 'Y-M', 'Y-W' and 'Y-M-D' text keys on both frames, then replace them
# with integer codes shared between train and test.
for dataset in (train_df,test_df):
    _year_txt = dataset['Year'].astype(str)
    _month_txt = dataset['Month'].astype(str)
    dataset['Year_Month'] = _year_txt + '-' + _month_txt
    dataset['Year_Week'] = _year_txt + '-' + dataset['Week'].astype(str)
    dataset['Year_Month_Day'] = _year_txt + '-' + _month_txt + '-' + dataset['Day'].astype(str)

feats = ['Year_Month','Year_Week','Year_Month_Day']
train_df,test_df = label_enc(train_df,test_df,feats)

## - Aggregation Features

In [None]:
# Per-device summary statistics of the target, computed on the training
# frame only and then attached to both frames through the device id.
_pm25_by_device = train_df.groupby('device')['pm2_5']
DevicePM2_5Mean = dict(_pm25_by_device.mean())
DevicePM2_5Std = dict(_pm25_by_device.std())
DevicePM2_5Min = dict(_pm25_by_device.min())
DevicePM2_5Max = dict(_pm25_by_device.max())

_device_lookups = {
    'DevicePM2_5Mean': DevicePM2_5Mean,
    'DevicePM2_5Std': DevicePM2_5Std,
    'DevicePM2_5Min': DevicePM2_5Min,
    'DevicePM2_5Max': DevicePM2_5Max,
}
for dataset in (train_df,test_df):
    for _column, _lookup in _device_lookups.items():
        dataset[_column] = dataset['device'].map(_lookup)

In [None]:
def Agg(Feature):
    """Attach month/week/day mean, std, min and max of ``Feature`` to both frames.

    Works on the notebook-level ``train_df`` and ``test_df`` in place. Each
    frame is aggregated on its own rows. The month statistics are grouped by
    'Month', the week statistics by 'Year_Week' and the day statistics by
    'Year_Month_Day'.
    """
    # (label used in the new column name, column to group by)
    groupings = (('Month', 'Month'), ('Week', 'Year_Week'), ('Day', 'Year_Month_Day'))
    for dataset in (train_df,test_df):
        for stat in ('mean', 'std', 'min', 'max'):
            for label, key in groupings:
                # The mean columns use the 'Per<period>' naming; the other
                # statistics use '<period>_<stat>'.
                if stat == 'mean':
                    column = f'{Feature}Per{label}'
                else:
                    column = f'{Feature}{label}_{stat}'
                per_group = getattr(dataset.groupby(key)[Feature], stat)()
                dataset[column] = dataset[key].map(dict(per_group))

Agg('temp_mean')
Agg('humidity')

In [None]:
train_df.drop(['Year_Month','Year_Week','Year_Month_Day'],inplace=True,axis=1)
test_df.drop(['Year_Month','Year_Week','Year_Month_Day'],inplace=True,axis=1)

## - Rolling Features

In [None]:
def Rolling(feature):
    """Attach trailing-window statistics of ``feature`` to both frames.

    Works on the notebook-level ``train_df`` and ``test_df`` in place and adds:
      * ``<feature>_Rolling_3`` / ``<feature>_Rolling_5`` : rolling means
      * ``<feature>_rolling_{mean,max,min}_{60,30,10}``   : rolling mean/max/min

    Windows are counted in rows; the first ``window - 1`` rows of each new
    column are NaN.
    """
    for dataset in (train_df,test_df):
        # Roll over the single column instead of the whole frame: the result
        # is the same, but it avoids aggregating every other column and does
        # not fail when the frame still holds non-numeric columns.
        series = dataset[feature]

        dataset[f'{feature}_Rolling_3'] = series.rolling(3).mean()
        dataset[f'{feature}_Rolling_5'] = series.rolling(5).mean()

        for window in (60, 30, 10):
            windowed = series.rolling(window)
            dataset[f"{feature}_rolling_mean_{window}"] = windowed.mean()
            dataset[f"{feature}_rolling_max_{window}"] = windowed.max()
            dataset[f"{feature}_rolling_min_{window}"] = windowed.min()

Rolling('temp_mean')
Rolling('humidity')

## - Polar Coordinates

In [None]:
def Polar(X,y, a = 0, b = 0): # a and b represent the center
    """Convert Cartesian coordinates to polar coordinates around ``(a, b)``.

    X, y : scalars or array-likes holding the two coordinates.
    a, b : centre of the polar system; ``a`` is subtracted from ``X`` and
           ``b`` from ``y``.

    Returns ``(r, phi)``: the distance from the centre and the angle in
    radians as given by ``np.arctan2``.
    """
    dx = X - a
    dy = y - b
    r = np.sqrt(dx**2 + dy**2)
    # The angle must use the same offsets as the radius; the offsets were
    # previously swapped here, which only went unnoticed for a == b.
    phi = np.arctan2(dy, dx)
    return r, phi

# Polar coordinates of each site around the default centre (0, 0);
# latitude is passed as the first coordinate and longitude as the second.
train_df['R'], train_df['Phi'] = Polar(train_df["site_latitude"],train_df["site_longitude"])
test_df['R'], test_df['Phi'] = Polar(test_df["site_latitude"],test_df["site_longitude"])

## - Fourier Frequencies and Amplitudes For Features That Contain Seasonality

In [None]:
def _second_third_fourier_peaks(values):
    """Return ``(freq_2, freq_3, amplitude_2, amplitude_3)`` for a 1-D signal.

    The magnitude spectrum of ``values`` is searched for its four largest
    entries, removing each one once found. The largest is treated as the
    intercept and the next one is ignored; the frequency and magnitude of the
    following two are returned. Signals with fewer than four samples cannot
    provide four peaks and yield NaN for every output.
    """
    if len(values) < 4:
        return np.nan, np.nan, np.nan, np.nan
    spectrum = np.abs(np.fft.fft(values))
    frequencies = np.fft.fftfreq(len(spectrum), 1)
    peaks = []
    for _ in range(4):
        top = np.argmax(spectrum)
        peaks.append((frequencies[top], spectrum[top]))
        spectrum = np.delete(spectrum, top)
        frequencies = np.delete(frequencies, top)
    (freq_2, amplitude_2), (freq_3, amplitude_3) = peaks[2], peaks[3]
    return freq_2, freq_3, amplitude_2, amplitude_3


# For every calendar field and every weather feature, describe the weather
# signal inside each calendar value (e.g. each month) by two Fourier peaks
# estimated on the training frame, then attach them to both frames.
for feat_1 in ('Year','Month','Day'):
    for feat_2 in ('temp_mean', 'humidity'):
        freq2_dict_no_log = dict()
        freq3_dict_no_log = dict()
        amp2_dict_no_log = dict()
        amp3_dict_no_log = dict()

        # Iterating over the groups that actually exist avoids running an
        # FFT on an empty slice when a calendar value has no rows.
        for i, a_sales in train_df.groupby(feat_1)[feat_2]:
            freq_2, freq_3, amplitude_2, amplitude_3 = _second_third_fourier_peaks(a_sales.values)
            freq2_dict_no_log[i] = freq_2
            freq3_dict_no_log[i] = freq_3
            amp2_dict_no_log[i] = amplitude_2
            amp3_dict_no_log[i] = amplitude_3

        # Map the per-value results onto the rows instead of rebuilding the
        # training frame from per-value slices: rebuilding reordered the rows
        # by calendar value, which broke the chronological order that the
        # later row-order-dependent features and the time-series split rely on.
        # Calendar values unseen in training map to NaN.
        for frame in (train_df, test_df):
            frame[f'Frequency_2_{feat_1}_{feat_2}'] = frame[feat_1].map(freq2_dict_no_log)
            frame[f'Frequency_3_{feat_1}_{feat_2}'] = frame[feat_1].map(freq3_dict_no_log)
            frame[f'Amplitude_2_{feat_1}_{feat_2}'] = frame[feat_1].map(amp2_dict_no_log)
            frame[f'Amplitude_3_{feat_1}_{feat_2}'] = frame[feat_1].map(amp3_dict_no_log)

## - Percentage change in Temperature and Humidity 

In [None]:
# Relative change of temperature and humidity versus the row `period`
# positions earlier, added to both frames.
periods = [1, 3, 7, 14]
for period in periods:
    for _frame in (train_df, test_df):
        _frame.loc[:, f"PctChangeTemp_{period}"] = _frame["temp_mean"].pct_change(period)
        _frame.loc[:, f"PctChangeHumi_{period}"] = _frame["humidity"].pct_change(period)

## - Historic Volatility

In [None]:
# Rolling standard deviation of the row-to-row differences of temperature
# and humidity, added to both frames for each window length.
periods = [3, 7, 14]
for period in periods:
    for _source in ("temp_mean", "humidity"):
        for _frame in (train_df, test_df):
            _steps = _frame[_source].diff()
            _frame.loc[:, f"volatility_{_source}_{period}"] = _steps.rolling(period).std()

# Encoding

In [None]:
train_df, test_df = label_enc(train_df,test_df,['device'])

# Modeling

## Validate our model ; Layer.ai 

In [None]:
@model("air_model")
@pip_requirements(packages=["catboost","seaborn"])
def train():
    """Cross-validate the CatBoost model, log the results to Layer and return a fitted model.

    Reads the notebook-level ``train_df`` (indexed by date, with the target in
    'pm2_5'). A log-target CatBoost regressor is evaluated with a 4-split
    ``GroupTimeSeriesSplit`` grouped by date; per-fold MAE / MSE / RMSE and the
    hyper-parameters are logged with ``layer.log``, and the running mean and
    std of the fold MAEs are printed and compared with the previous values.

    Returns
    -------
    CatBoostRegressor
        The plain regressor fitted on all rows with the untransformed target.
        NOTE(review): this is not the log-target wrapper that the folds
        evaluate — confirm that returning it is intended.
    """
    from catboost import CatBoostRegressor
    from sklearn import metrics
    from sklearn.compose import TransformedTargetRegressor
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Starting reference values for the Better/Worse/Same printout below
    # (presumably the CV mean and std of an earlier experiment — TODO confirm).
    Means = [20.772597188397544]
    STDs = [2.2423317130434035]

    params = {'n_estimators': 2064, 'learning_rate': 0.03196897706232692, 'depth': 5, 'reg_lambda': 12.680808984686983,}
    cb = CatBoostRegressor(**params,verbose=0, random_state=42)
    # Fit on log1p(target) and map predictions back with expm1.
    LOGcb = TransformedTargetRegressor(cb, func = np.log1p, inverse_func = np.expm1)

    feature_frame = train_df.drop('pm2_5',axis=1)
    X = feature_frame.values
    y = train_df['pm2_5'].values
    # The frame is indexed by date, so the index is the grouping key;
    # no copy of the frame is needed to obtain it.
    groups = train_df.index.values

    layer.log({"model_params":params})

    scores = []
    splitter = GroupTimeSeriesSplit(n_splits=4)
    for fold_ , (train_index, test_index) in enumerate(splitter.split(X, y, groups=groups)):
        X_Train, X_Test = X[train_index], X[test_index]
        y_Train, y_Test = y[train_index], y[test_index]
        LOGcb.fit(X_Train,y_Train)
        y_pred = LOGcb.predict(X_Test)

        # Use the locally imported metrics module throughout so the function
        # does not depend on names imported elsewhere in the notebook.
        fold_mae = metrics.mean_absolute_error(y_Test, y_pred)
        fold_mse = metrics.mean_squared_error(y_Test, y_pred)
        scores.append(fold_mae)

        layer.log({f'Validation Step {fold_} <==> Mean Absolute Error':fold_mae})
        layer.log({f'Validation Step {fold_} <==> Mean Squarred Error': fold_mse})
        layer.log({f'Validation Step {fold_} <==> Root Mean Squared Error': np.sqrt(fold_mse)})
        print(scores[-1])

        # Running statistics over the folds seen so far, each compared with
        # the previous entry (the reference value on the first fold).
        print("\nMean:",np.mean(scores),"\nSTD: ", np.std(scores))
        Means.append(np.mean(scores))
        STDs.append(np.std(scores))

        if (Means[-1] < Means[-2]):
            print('Better')
        elif (Means[-1] > Means[-2]):
            print('Worse')
        else:
            print('Same')

        if   (STDs[-1] > STDs[-2]):
            print('Worse')
        elif (STDs[-1] < STDs[-2]):
            print('Better')
        else:
            print('Same')

    #Plot the Catboost Features Importances
    # The importances come from a single fit on all rows, made here.
    cb.fit(X,y)
    feature_imp = pd.DataFrame({'Value':cb.feature_importances_,'Feature':feature_frame.columns})
    plt.figure(figsize=(40, 20))
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", 
                                                        ascending=False)[0:20])
    plt.title('Feature Imprtances (avg over folds)')
    plt.tight_layout()
    layer.log({"Feature importance": plt.gcf()})

    return cb

In [None]:
layer.run([train])

## Train On Different Seeds

In [None]:
#Averaging the predictions of the same model with different seeds to get more consistent results
X = train_df.drop('pm2_5',axis=1)
y = train_df['pm2_5']

# Feed the test features in exactly the training column order, so the model
# never sees misaligned or extra columns.
test_features = test_df[X.columns]

Predictions = pd.DataFrame()

# Same model and hyper-parameters, one fit per random seed.
for seed in range(20,46):
    print(f'Seed: {seed}')
    params = {'n_estimators': 2064, 'learning_rate': 0.03196897706232692, 'depth': 5, 'reg_lambda': 12.680808984686983}
    CB = CatBoostRegressor(**params,verbose=0, random_state=seed, task_type = 'GPU')
    # Fit on log1p(target) and map predictions back with expm1.
    LogCB = TransformedTargetRegressor(CB, func = np.log1p, inverse_func = np.expm1)  
    LogCB.fit(X, y)

    #A Correction Factor of 0.975
    Predictions[f'Target_{seed}'] = LogCB.predict(test_features) * 0.975
    
#Averaging the Results
# Each mean is taken over the per-seed columns only. Computing them on the
# growing frame would fold 'Mean' into 'HMean', and both into 'GMean'.
seed_predictions = Predictions[list(Predictions.columns)]
Predictions['Mean'] = seed_predictions.mean(axis=1)
Predictions['HMean'] = seed_predictions.apply(stats.hmean, axis=1)
Predictions['GMean'] = seed_predictions.apply(stats.gmean, axis=1)

#Averaging the Second Results
FinalPred = Predictions[['Mean','HMean','GMean']].apply(stats.hmean,axis=1)

#Making the submission file
submission = pd.DataFrame({"Id": ID ,"pm2_5": FinalPred.values})
submission.to_csv('AirQualityPrediction.csv',index=False)