<h1><b> Import Modules

In [1111]:
import time

In [1112]:
start=time.time()

In [1113]:
import pandas as pd                                    # for data
import numpy as np                                     # for math
from lightgbm import LGBMRegressor                     # Validation model
from sklearn.metrics import mean_squared_error         # Regressortion metric
from sklearn.model_selection import GroupKFold,KFold   # for validation
from sklearn.preprocessing import LabelEncoder         # for encoding
import sklearn.manifold._t_sne as tsne                 # for t_sne
import seaborn as sns                                  # for plotting
import matplotlib.pyplot as plt                         # for plotting

<h1><b>Define Functions

In [1114]:
def post_process(test_data,y,feature_to_mean='date_month') -> int:
    """runs the post processing

    Keyword arguments:
    test_data -- the testing data
    y -- the target
    feature_to_mean -- feature to apply the post processing by
    Return: predictions after processing
    """
    test_data['target'] = y
    test_data['target_month'] = test_data[feature_to_mean].map(test_data[[feature_to_mean,'target']].groupby(feature_to_mean)['target'].mean())
    test_data.drop(columns='target',inplace=True)
    return test_data['target_month']

<h1><b> Read Data

In [1115]:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

In [1116]:
train.columns

Index(['id', 'site_id', 'site_latitude', 'site_longitude', 'city', 'country',
       'date', 'hour', 'sulphurdioxide_so2_column_number_density',
       'sulphurdioxide_so2_column_number_density_amf',
       'sulphurdioxide_so2_slant_column_number_density',
       'sulphurdioxide_cloud_fraction', 'sulphurdioxide_sensor_azimuth_angle',
       'sulphurdioxide_sensor_zenith_angle',
       'sulphurdioxide_solar_azimuth_angle',
       'sulphurdioxide_solar_zenith_angle',
       'sulphurdioxide_so2_column_number_density_15km', 'month',
       'carbonmonoxide_co_column_number_density',
       'carbonmonoxide_h2o_column_number_density',
       'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude',
       'carbonmonoxide_sensor_azimuth_angle',
       'carbonmonoxide_sensor_zenith_angle',
       'carbonmonoxide_solar_azimuth_angle',
       'carbonmonoxide_solar_zenith_angle',
       'nitrogendioxide_no2_column_number_density',
       'nitrogendioxide_tropospheric_no2_column_number_densi

In [1117]:
pd.options.display.max_columns = 200

<h1><b> Data Preprocessing

In [1118]:
train_feats = train.notna().sum()[train.notna().sum() > .15*len(train)].index
test_feats = test.notna().sum()[test.notna().sum() > .15*len(test)].index
train = train[train_feats]

In [1119]:
test = test[train_feats[:-1]]

In [1120]:
train.head()

Unnamed: 0,id,site_id,site_latitude,site_longitude,city,country,date,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,sulphurdioxide_sensor_zenith_angle,sulphurdioxide_solar_azimuth_angle,sulphurdioxide_solar_zenith_angle,sulphurdioxide_so2_column_number_density_15km,month,carbonmonoxide_co_column_number_density,carbonmonoxide_h2o_column_number_density,carbonmonoxide_cloud_height,carbonmonoxide_sensor_altitude,carbonmonoxide_sensor_azimuth_angle,carbonmonoxide_sensor_zenith_angle,carbonmonoxide_solar_azimuth_angle,carbonmonoxide_solar_zenith_angle,nitrogendioxide_no2_column_number_density,nitrogendioxide_tropospheric_no2_column_number_density,nitrogendioxide_stratospheric_no2_column_number_density,nitrogendioxide_no2_slant_column_number_density,nitrogendioxide_tropopause_pressure,nitrogendioxide_absorbing_aerosol_index,nitrogendioxide_cloud_fraction,nitrogendioxide_sensor_altitude,nitrogendioxide_sensor_azimuth_angle,nitrogendioxide_sensor_zenith_angle,nitrogendioxide_solar_azimuth_angle,nitrogendioxide_solar_zenith_angle,formaldehyde_tropospheric_hcho_column_number_density,formaldehyde_tropospheric_hcho_column_number_density_amf,formaldehyde_hcho_slant_column_number_density,formaldehyde_cloud_fraction,formaldehyde_solar_zenith_angle,formaldehyde_solar_azimuth_angle,formaldehyde_sensor_zenith_angle,formaldehyde_sensor_azimuth_angle,uvaerosolindex_absorbing_aerosol_index,uvaerosolindex_sensor_altitude,uvaerosolindex_sensor_azimuth_angle,uvaerosolindex_sensor_zenith_angle,uvaerosolindex_solar_azimuth_angle,uvaerosolindex_solar_zenith_angle,ozone_o3_column_number_density,ozone_o3_column_number_density_amf,ozone_o3_slant_column_number_density,ozone_o3_effective_temperature,ozone_cloud_fraction,ozone_sensor_azimuth_angle,ozone_sensor_zenith_angle,ozone_solar_azimuth_angle,ozone_solar_zenith_angle,cloud_cloud_fraction,cloud_cloud_top_pressure,cloud_cloud_top_height,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle,pm2_5
0,id_vjcx08sz91,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-10-25,13,,,,,,,,,,10.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.052301,828817.9375,-100.805145,21.720518,-123.523796,33.745914,0.122055,2.301404,0.285803,230.693756,0.906039,-100.805145,21.720518,-123.523796,33.745914,,,,,,,,,,,,12.015
1,id_bkg215syli,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-02,12,,,,,,,,,,11.0,0.045475,3771.02721,3399.756845,828569.623806,69.245351,59.159695,-143.370575,26.566997,,,,,,,,,,,,,0.000214,1.46239,0.00024,0.35915,26.525513,-143.480164,59.220097,70.875954,-0.315206,828578.625,70.875954,59.220097,-143.480164,26.525513,0.116975,3.049902,0.362203,228.260193,0.364713,70.875954,59.220097,-143.480164,26.525513,,,,,,,,,,,,42.2672
2,id_oui2pot3qd,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-03,13,,,,,,,,,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.097816,828878.6875,-96.411942,61.04501,-121.307121,41.898113,0.117559,3.248703,0.384168,224.102463,0.754163,-96.411942,61.04501,-121.307121,41.898113,0.756392,45185.49959,6791.682888,51171.802486,5791.682829,11.816715,0.192757,-96.41189,61.045123,-121.307414,41.898269,39.450741
3,id_9aandqzy4n,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-08,14,,,,,,,,,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.591543,828920.6875,-95.863083,65.508858,-122.218231,43.923038,0.118412,3.505708,0.416643,226.529633,1.0,-95.863083,65.508858,-122.218231,43.923038,,,,,,,,,,,,10.5376
4,id_ali5x2m4iw,6531a46a89b3300013914a36,6.53257,3.39936,Lagos,Nigeria,2023-11-09,13,0.000267,0.774656,0.000207,0.223403,-97.811241,49.513344,-126.064468,40.167336,8.3e-05,11.0,0.049045,3514.042054,1678.370478,828902.79045,-96.099639,49.432796,-126.070747,40.144183,9.7e-05,6.3e-05,3.5e-05,0.000175,9582.712645,0.069667,0.142913,828919.0,-97.811241,49.513344,-126.064468,40.167336,0.0001,1.298433,6.8e-05,0.223403,40.167336,-126.064468,49.513344,-97.811241,0.069669,828919.0,-97.811241,49.513344,-126.064468,40.167336,0.117899,2.796072,0.334256,226.368851,0.22615,-97.811241,49.513344,-126.064468,40.167336,0.226141,86197.53125,1451.050659,96215.90625,451.050598,10.521009,0.153114,-97.811241,49.513439,-126.064453,40.167355,19.431731


In [1121]:
test.head()

Unnamed: 0,id,site_id,site_latitude,site_longitude,city,country,date,hour,sulphurdioxide_so2_column_number_density,sulphurdioxide_so2_column_number_density_amf,sulphurdioxide_so2_slant_column_number_density,sulphurdioxide_cloud_fraction,sulphurdioxide_sensor_azimuth_angle,sulphurdioxide_sensor_zenith_angle,sulphurdioxide_solar_azimuth_angle,sulphurdioxide_solar_zenith_angle,sulphurdioxide_so2_column_number_density_15km,month,carbonmonoxide_co_column_number_density,carbonmonoxide_h2o_column_number_density,carbonmonoxide_cloud_height,carbonmonoxide_sensor_altitude,carbonmonoxide_sensor_azimuth_angle,carbonmonoxide_sensor_zenith_angle,carbonmonoxide_solar_azimuth_angle,carbonmonoxide_solar_zenith_angle,nitrogendioxide_no2_column_number_density,nitrogendioxide_tropospheric_no2_column_number_density,nitrogendioxide_stratospheric_no2_column_number_density,nitrogendioxide_no2_slant_column_number_density,nitrogendioxide_tropopause_pressure,nitrogendioxide_absorbing_aerosol_index,nitrogendioxide_cloud_fraction,nitrogendioxide_sensor_altitude,nitrogendioxide_sensor_azimuth_angle,nitrogendioxide_sensor_zenith_angle,nitrogendioxide_solar_azimuth_angle,nitrogendioxide_solar_zenith_angle,formaldehyde_tropospheric_hcho_column_number_density,formaldehyde_tropospheric_hcho_column_number_density_amf,formaldehyde_hcho_slant_column_number_density,formaldehyde_cloud_fraction,formaldehyde_solar_zenith_angle,formaldehyde_solar_azimuth_angle,formaldehyde_sensor_zenith_angle,formaldehyde_sensor_azimuth_angle,uvaerosolindex_absorbing_aerosol_index,uvaerosolindex_sensor_altitude,uvaerosolindex_sensor_azimuth_angle,uvaerosolindex_sensor_zenith_angle,uvaerosolindex_solar_azimuth_angle,uvaerosolindex_solar_zenith_angle,ozone_o3_column_number_density,ozone_o3_column_number_density_amf,ozone_o3_slant_column_number_density,ozone_o3_effective_temperature,ozone_cloud_fraction,ozone_sensor_azimuth_angle,ozone_sensor_zenith_angle,ozone_solar_azimuth_angle,ozone_solar_zenith_angle,cloud_cloud_fraction,cloud_cloud_top_pressure,cloud_cloud_top_height,cloud_cloud_base_pressure,cloud_cloud_base_height,cloud_cloud_optical_depth,cloud_surface_albedo,cloud_sensor_azimuth_angle,cloud_sensor_zenith_angle,cloud_solar_azimuth_angle,cloud_solar_zenith_angle
0,id_ihxgrbq8bw,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-06,13,-7.2e-05,0.762543,-5.5e-05,0.079645,-100.330299,26.92642,-86.879776,25.512329,-2.8e-05,9.0,0.043537,2825.323242,1.0,829406.9375,-97.787621,26.897718,-86.884308,25.509418,5.5e-05,1.6e-05,4e-05,0.000109,9582.745678,-0.258421,0.019749,829420.3125,-100.330299,26.92642,-86.879776,25.512329,0.000243,1.11224,0.000213,0.079645,25.512329,-86.879776,26.92642,-100.330299,-0.258422,829420.3125,-100.330299,26.92642,-86.879776,25.512329,0.126056,2.243713,0.286278,230.244171,0.079324,-100.330299,26.92642,-86.879776,25.512329,0.043065,74217.403083,2710.544562,83569.504246,1710.544483,3.063105,0.263193,-100.317077,27.059646,-86.88567,25.530511
1,id_dg6s4fhiwe,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-07,13,,,,,,,,,,9.0,0.036341,2604.78833,1584.809692,829328.625,70.75309,11.428769,-87.764297,20.935318,,,,,,,,,,,,,,,,,,,,,-0.408162,829340.75,76.400116,11.171532,-87.726566,20.95156,0.126282,2.1364,0.273757,230.184377,0.613266,76.400116,11.171532,-87.726566,20.95156,,,,,,,,,,,
2,id_f7hwwtmuzp,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-08,13,-5.1e-05,1.004265,-5.1e-05,0.16316,73.117264,43.112466,-89.089083,16.417355,-2.3e-05,9.0,0.037453,3046.314001,90.699029,829194.5625,71.272375,43.172868,-88.964294,16.440554,6.2e-05,2.7e-05,3.6e-05,0.000134,9582.743944,-0.1152,0.163328,829204.480816,73.117264,43.112466,-89.089083,16.417355,0.000145,1.34584,0.000127,0.16316,16.417355,-89.089083,43.112466,73.117264,-0.115169,829204.480816,73.117264,43.112466,-89.089083,16.417355,0.126162,2.428528,0.311882,237.772241,0.16437,73.117264,43.112466,-89.089083,16.417355,,,,,,,,,,,
3,id_ioese5awdg,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-09,12,,,,,,,,,,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.116044,829067.3125,70.680077,61.874397,-90.875526,11.865146,0.121339,3.074277,0.37889,225.704056,0.84591,70.680077,61.874397,-90.875526,11.865146,0.845911,46052.449219,6585.034668,52160.980469,5585.034668,29.145922,0.314945,70.680077,61.874222,-90.875603,11.865201
4,id_hdw320zpls,64f9d17ab9e98d001ac9e882,5.61252,-0.22955,Accra,Ghana,2023-09-20,12,-0.000634,0.632173,-0.000401,0.0,70.066956,66.014107,-111.396515,12.22621,-0.000147,9.0,,,,,,,,,4.6e-05,1.3e-05,3.3e-05,0.000132,8614.471169,-0.21131,0.072076,828876.5625,70.066956,66.014107,-111.396515,12.22621,-0.000193,1.285203,-0.000316,0.0,12.22621,-111.396515,66.014107,70.066956,-0.211314,828876.5625,70.066956,66.014107,-111.396515,12.22621,0.127799,3.335475,0.432085,237.175858,0.0,70.066956,66.014107,-111.396515,12.22621,,,,,,,,,,,


<h2><b>Add Time-related Features

In [1122]:
for df in (train,test):
    df['date'] = pd.to_datetime(df['date'])
    df['date_month'] = df['date'].dt.day_of_year
    df['DayOfWeek'] =  df['date'].dt.dayofweek
    df['Day'] =  df['date'].dt.day
    df['Year'] =  df['date'].dt.year
    df.drop(columns=['site_id','date'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_month'] = df['date'].dt.day_of_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DayOfWeek'] =  df['date'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

<h3><b> Grouping

In [1123]:
groups = train['city']

- groups is a variable that stores the city column from the train DataFrame.
- This is used later for grouping the data during cross-validation.

- In this context, groups represent the different cities in the train dataset.
- By using groups, the cross-validation process can be performed within each city,
- ensuring that the model is trained and evaluated on data from the same city.
- the model might not perform as well because it would not have been exposed to the same distribution of data.
- By using groups, the model is trained and evaluated on data from the same city,
- which helps to ensure that the model is able to generalize well to unseen data.


<h2><b> Encoding

<h4><b>Label Encoding

In [1124]:
le = LabelEncoder()
for column in ['city','country','id']:
    letrans = le.fit_transform(pd.concat([train,test])[column])
    train[column] = letrans[:len(train)]
    test[column] = letrans[len(train):]

<h1><b>Modeling

In [1125]:
model =  LGBMRegressor(n_estimators=200,random_state=42,max_depth=10)
n_splits = 4
cv = GroupKFold(n_splits=n_splits)


<h1><b>Validation

In [1126]:
train_set = pd.read_csv('Train.csv')
test_set = pd.read_csv('Test.csv')

def validate(trainset,testset,t,origin):
    model.fit(trainset.drop(columns=t),trainset[t])
    pred = model.predict(np.array(testset.drop(columns=t)))
    print('std: ', testset[t].std())
    # pred = post_process(origin,pred,'date_month')

    origin['pm_5'] = pred
    origin['date'] = pd.to_datetime(origin['date'])
    origin['date_day'] = origin['date'].dt.month
    pred = origin['date_day'].map(origin[['date_day','pm_5']].groupby('date_day')['pm_5'].mean())
    stds.append(testset[t].std())
    score = mean_squared_error(pred,testset[t],squared=False)
    print('score:', score)
    return score
stds = []
rmse = []
for v_train,v_test in cv.split(train.drop(columns='pm2_5'),train['pm2_5'],groups=groups):
    train_v, test_v= train.iloc[v_train],train.iloc[v_test]
    origin = train_set.iloc[v_test]
    rmse.append(validate(train_v,test_v,'pm2_5',origin))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16640
[LightGBM] [Info] Number of data points in the train set: 2475, number of used features: 74
[LightGBM] [Info] Start training from score 28.876230
std:  14.34682858151784
score: 14.213006742282468
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16748
[LightGBM] [Info] Number of data points in the train set: 6571, number of used features: 74
[LightGBM] [Info] Start training from score 25.954156


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.month


std:  24.982218437683237
score: 25.11749634778974


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.month


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16717
[LightGBM] [Info] Number of data points in the train set: 7219, number of used features: 74
[LightGBM] [Info] Start training from score 22.093092
std:  52.84388334184576
score: 52.690842609120686


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.month


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16760
[LightGBM] [Info] Number of data points in the train set: 7948, number of used features: 74
[LightGBM] [Info] Start training from score 24.545522
std:  22.35338406057941
score: 18.313434647724534


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['pm_5'] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date'] = pd.to_datetime(origin['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  origin['date_day'] = origin['date'].dt.month


In [1127]:
np.array(rmse).mean()

27.583695086729357

<h1><b>Inference

In [1128]:
model.fit(train.drop(columns='pm2_5'),train['pm2_5'])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002153 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16775
[LightGBM] [Info] Number of data points in the train set: 8071, number of used features: 74
[LightGBM] [Info] Start training from score 24.639296


In [1129]:
y = model.predict(test)



<h1><b> Make The Submission File

In [1130]:
smaple = pd.read_csv('SampleSubmission.csv')
smaple['pm2_5'] = post_process(test,y)
smaple.to_csv('submission2.csv',index=False)

In [1131]:
sample_plus=pd.read_csv('/content/submission 16.9.csv')

In [1132]:
from sklearn.metrics import mean_absolute_error

In [1133]:
MAE_score = mean_absolute_error(smaple['pm2_5'],sample_plus['pm2_5'])

In [1134]:
print(MAE_score)

1.5635650784920747


In [1135]:
end=time.time()

In [1136]:
excution=end-start

In [1137]:
print(excution)

14.417007207870483
