In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import datetime

import warnings
warnings.simplefilter(action='ignore')

In [150]:
#Label Encoder
def label_enc(train_df, test_df, features):
    lbl_enc = LabelEncoder()
    full_data = pd.concat([train_df[features], test_df[features]],axis=0)
    for col in (features):
        print(col)
        lbl_enc.fit(full_data[col].values)
        train_df[col] = lbl_enc.transform(train_df[col])
        test_df[col] = lbl_enc.transform(test_df[col])
    return train_df, test_df

In [465]:
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")
sub = pd.read_csv("SampleSubmission.csv")

In [466]:
# train_df.columns[train_df.columns.str.find('cloud') > 1]

In [467]:
# train_df.corr()['pm2_5'][train_df.corr()['pm2_5'] > 0.1].index

In [468]:
col1 = ['ID','date']
col2 = ['device']
col3 = ['humidity', 'temp_mean']
location = ['site_latitude', 'site_longitude']
chem = ['SulphurDioxide_SO2_column_number_density',
        'SulphurDioxide_SO2_column_number_density_amf',
        'SulphurDioxide_SO2_column_number_density_15km',
        'CarbonMonoxide_CO_column_number_density',
        'NitrogenDioxide_NO2_column_number_density',
        'UvAerosolIndex_absorbing_aerosol_index',
        'Ozone_O3_column_number_density',
        'Cloud_cloud_fraction', 
        'Cloud_cloud_top_pressure']
target = ['pm2_5']
feat = col1 + col2 + col3 + location + chem
train_df = train_df[feat + target]
test_df = test_df[feat]

In [469]:
train_df.shape, test_df.shape

((9923, 17), (4254, 16))

# Add The Time Features

In [470]:
train_df = train_df.sort_values(['date', 'device']).reset_index(drop=True) 
test_df = test_df.sort_values(['date', 'device']).reset_index(drop=True)

for dataset in (train_df,test_df):
    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset['Day'] = dataset.date.dt.day
    dataset['Month'] = dataset.date.dt.month
    dataset['Year'] = dataset.date.dt.year
    dataset['DayOfWeek'] = dataset.date.dt.dayofweek
    dataset['DayOfYear'] = dataset.date.dt.dayofyear
    dataset['Week'] = dataset.date.dt.weekofyear
    dataset.set_index('date', inplace=True)

In [471]:
ID = test_df['ID']
test_df.drop('ID',inplace=True,axis=1)
train_df.drop('ID',inplace=True,axis=1)

# Exploratory data analysis

In [472]:
num_col = train_df.select_dtypes(exclude='O').columns.difference(['Month', 'pm2_5', 'site_latitude', 'site_longitude', 'humidity', 'temp_mean', 'Day', 'DayOfWeek', 'DayOfYear', 'Year', 'Week'])
train_df.temp_mean = train_df.temp_mean.fillna(train_df.temp_mean.median())
for data in (train_df, test_df):
    for feat in num_col:
        data[feat] = data[feat].bfill()

In [473]:
def LAG(data,LagFeature,shift=1,NewFeatures=[]) :
    data[NewFeatures[0]]   = data[LagFeature]  - data[LagFeature].shift(shift)
    data[NewFeatures[1]]   = data[LagFeature].shift(shift)

num_feats = train_df.columns
num_feats = num_feats.drop(['DayOfWeek','Month','Day','pm2_5','temp_mean','humidity','site_longitude', 'site_latitude','device', 'Year', 'DayOfYear', 'Week'])

for feature in num_feats:
    LAG(train_df,LagFeature=f'{feature}',shift=-1,NewFeatures=[f'{feature}_diff_Lag1',f'{feature}_Lag1'])
    LAG(test_df,LagFeature=f'{feature}',shift=1,NewFeatures=[f'{feature}_diff_Lag1',f'{feature}_Lag1'])

In [474]:
for dataset in (train_df,test_df):
    dataset['Year_Week'] = dataset['Year'].astype(str) + '-' + dataset['Week'].astype(str)
    dataset['Month_Day'] = dataset['Month'].astype(str) + '-' + dataset['Day'].astype(str)
    # dataset['lat_lon'] = dataset['site_latitude'].astype(str) + '_' + dataset['site_longitude'].astype(str)
    
feats = ['Year_Week','Month_Day', 'device']
train_df,test_df = label_enc(train_df,test_df,feats)

Year_Week
Month_Day
device


In [475]:
test_df.shape, train_df.shape

((4254, 40), (9923, 41))

## - Aggregations Features

In [476]:
DevicePM2_5Mean = dict(train_df.groupby('device')['pm2_5'].mean())
DevicePM2_5Std = dict(train_df.groupby('device')['pm2_5'].std())
DevicePM2_5Min = dict(train_df.groupby('device')['pm2_5'].min())
DevicePM2_5Max = dict(train_df.groupby('device')['pm2_5'].max())

for dataset in (train_df,test_df):
    dataset['DevicePM2_5Mean'] = dataset['device'].map(DevicePM2_5Mean)
    dataset['DevicePM2_5Std'] = dataset['device'].map(DevicePM2_5Std)
    dataset['DevicePM2_5Min'] = dataset['device'].map(DevicePM2_5Min)
    dataset['DevicePM2_5Max'] = dataset['device'].map(DevicePM2_5Max)

In [477]:
data = pd.concat([train_df, test_df], axis = 0)
def Agg(Features):
    for dataset in (train_df,test_df):
        for Feature in Features:
            dataset[f'{Feature}_PerMonth'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].mean()))
            dataset[f'{Feature}_PerWeek'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].mean()))
            dataset[f'{Feature}_PerDay'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].mean()))

            dataset[f'{Feature}_Month_std'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].std()))
            dataset[f'{Feature}_Week_std'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].std()))
            dataset[f'{Feature}_Day_std'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].std()))

            dataset[f'{Feature}_Month_min'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].min()))
            dataset[f'{Feature}_Week_min'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].min()))
            dataset[f'{Feature}_Day_min'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].min()))

            dataset[f'{Feature}_Month_max'] = dataset['Month'].map(dict(data.groupby('Month')[Feature].max()))
            dataset[f'{Feature}_Week_max'] = dataset['Year_Week'].map(dict(data.groupby('Year_Week')[Feature].max()))
            dataset[f'{Feature}_Day_max'] = dataset['Month_Day'].map(dict(data.groupby('Month_Day')[Feature].max()))
        
Agg(['temp_mean', 'humidity'])

In [478]:
train_df.shape, test_df.shape

((9923, 69), (4254, 68))

In [479]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [480]:
# pca = PCA(random_state = 101, n_components = 3)
# full_data = pd.concat([train_df[chem], test_df[chem]],axis=0)
# pca.fit(full_data)
# train_df[['pca_1', 'pca_2', 'pca_3']] = pca.transform(train_df[chem])
# test_df[['pca_1', 'pca_2', 'pca_3']] = pca.transform(test_df[chem])

In [481]:
# corr_matrix = train_df.corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
# drop = train_df.corr()['pm2_5'].abs()[train_df.corr()['pm2_5'].abs() < 0.03].index.to_list()
# drop_to_drop = [value for value in to_drop if value in drop]
# len(drop_to_drop), len(drop), len(to_drop)

In [482]:
# train_df = train_df.drop(columns = drop)
# test_df = test_df.drop(columns = drop)

In [483]:
# train_df.corr()['pm2_5'].abs()

In [484]:
train_df.drop(['Year_Week', 'Month_Day', 'site_longitude', 'site_latitude'],inplace=True,axis=1)
test_df.drop(['Year_Week', 'Month_Day', 'site_longitude', 'site_latitude'],inplace=True,axis=1)

# Encoding

In [485]:
#Averaging the predictions of the same model with different seeds to get more consistent results
X = train_df.drop('pm2_5', axis = 1)
y = train_df.pm2_5

In [486]:
X.shape, y.shape, test_df.shape

((9923, 64), (9923,), (4254, 64))

In [487]:
# X_train, y_train = X[:9000], y[:9000]
# X_test, y_test = X[9000:], y[9000:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, shuffle = True, random_state = 42)

# Modeling

In [488]:
cb = CatBoostRegressor(n_estimators = 6864, learning_rate = 0.028926897706232692, depth = 8, verbose = 0, random_state = 42)
model = TransformedTargetRegressor(cb, func = np.log1p, inverse_func = np.expm1)
model.fit(X_train, y_train)
pred_1 = model.predict(X_test)
mae(y_test, pred_1)

6.209303976965303

In [489]:
# xg = XGBRegressor(objective = 'reg:squarederror', eval_metric = 'mae', n_estimators = 6000, max_depth = 8, learning_rate = 0.02484, max_leaves = 200, random_state = 42)
# model_ = TransformedTargetRegressor(xg, func = np.log1p, inverse_func = np.expm1)
# model_.fit(X_train, y_train)
# pred_2 = model_.predict(X_test)
# mae(y_test, pred_2)

In [490]:
pred = model.predict(test_df)
submission = pd.DataFrame({"Id": ID ,"pm2_5": pred})
submission.to_csv('drip__.csv', index = False)
submission.head(5)

Unnamed: 0_level_0,Id,pm2_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-08-20,ID_M1T3S50WVB,67.975422
2020-08-20,ID_S9OZLWWLTX,65.308804
2020-08-20,ID_OC21YTIKX2,73.866973
2020-08-20,ID_BNZBP8KDYD,79.361904
2020-08-20,ID_ZS2RAN8HZT,64.582965
