# Traitement des données

In [1]:
#! pip install verstack

In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import datetime as dt

import pre_traitement
import traitement_forecast

## Chargement des données

In [2]:
# unzip file

#! tar -xf ./data_meteonet/train/X_forecast/2016/2D_arome_2016.tar.gz -C ./data_meteonet/train/X_forecast/2D_arome_train/
#! tar -xf ./data_meteonet/train/X_forecast/2017/2D_arome_2017.tar.gz  -C ./data_meteonet/train/X_forecast/2D_arome_train/
#! tar -xf ./data_meteonet/test/X_forecast/2D_arome_test.tar.gz

#! tar -xf ./data_meteonet/train/X_forecast/2016/2D_arpege_2016.tar.gz -C ./data_meteonet/train/X_forecast/
#! tar -xf ./data_meteonet/train/X_forecast/2017/2D_arpege_2017.tar.gz -C ./data_meteonet/train/X_forecast/
#! tar -xf ./data_meteonet/test/X_forecast/2D_arpege_test.tar.gz -C ./data_meteonet/test/

#! tar -xf ./data_meteonet/train/X_forecast/2016/3D_arpege_2016.tar.gz -C ./data_meteonet/train/X_forecast/
#! tar -xf ./data_meteonet/train/X_forecast/2017/3D_arpege_2017.tar.gz -C ./data_meteonet/train/X_forecast/
#! tar -xf ./data_meteonet/test/X_forecast/3D_arpege_test.tar.gz -C ./data_meteonet/test/

In [2]:
# Directories holding the input files (relative to the notebook)
path_baseline ='./Test/Test/Baselines/'
path_test = './Test/Test/'
path_train = './Train/Train/'
path_coords = './Other/Other/'

# pre_traitement.load_datasets returns five objects, unpacked here in order
(
    coords,
    df_X_train,
    df_X_test,
    df_Y_train,
    baseline,
) = pre_traitement.load_datasets(
    path_coords,
    path_train,
    path_test,
    path_baseline,
)

In [11]:
# Display the first rows of df_X_train
df_X_train.head()

Unnamed: 0,number_sta,date,ff,t,td,hu,dd,precip,Id
0,14066001,2016-01-01 00:00:00,3.05,279.28,277.97,91.4,200.0,0.0,14066001_0_0
1,14066001,2016-01-01 01:00:00,2.57,278.76,277.45,91.4,190.0,0.0,14066001_0_1
2,14066001,2016-01-01 02:00:00,2.26,278.27,277.02,91.7,181.0,0.0,14066001_0_2
3,14066001,2016-01-01 03:00:00,2.62,277.98,276.95,93.0,159.0,0.0,14066001_0_3
4,14066001,2016-01-01 04:00:00,2.99,277.32,276.72,95.9,171.0,0.0,14066001_0_4


## Traitement des X_stations et des coordonnées

In [3]:
# Strip the time of day from 'date' so that only the calendar day remains
df_X_train["date"] = df_X_train["date"].apply(
    lambda stamp: dt.date(stamp.year, stamp.month, stamp.day)
)

df_X_train = (
    df_X_train
    # ffill and bfill per station and per day
    .pipe(pre_traitement.fill_na_hour, 'date')
    # mean/sum so that there is one value per day and per station
    .pipe(lambda frame: pre_traitement.hour_to_day(df=frame, var="date"))
    # add the month variable
    .pipe(pre_traitement.add_month)
    # add the station coordinates
    .pipe(lambda frame: pre_traitement.add_coords(coords, frame))
)

# Attach the ground truth, then drop the rows where it is NaN
df_train = pre_traitement.merge_X_Y(df_X_train, df_Y_train)
df_train = df_train.dropna(subset=["Ground_truth"])
df_train.head()

Unnamed: 0,date,number_sta,ff,t,td,hu,dd,precip,Id,month_2,...,month_7,month_8,month_9,month_10,month_11,month_12,lat,lon,height_sta,Ground_truth
0,2016-01-01,14066001,3.91375,280.33375,278.514583,88.591667,146.5,0.2,14066001_0_0,0,...,0,0,0,0,0,0,49.334,-0.431,2.0,3.4
1,2016-01-01,14126001,,280.283333,279.552083,95.083333,,0.5,14126001_0_0,0,...,0,0,0,0,0,0,49.145,0.042,125.0,0.5
2,2016-01-01,14137001,5.444583,280.029583,278.72125,91.733333,144.125,0.0,14137001_0_0,0,...,0,0,0,0,0,0,49.18,-0.456,67.0,3.4
3,2016-01-01,14216001,4.759583,280.089583,278.88875,92.1875,158.666667,0.6,14216001_0_0,0,...,0,0,0,0,0,0,48.928,-0.149,155.0,4.0
4,2016-01-01,14296001,,279.301667,,,,0.2,14296001_0_0,0,...,0,0,0,0,0,0,48.795,-1.037,336.0,13.3


In [4]:
def _station_and_day(raw_id):
    """Keep only the first two '_'-separated fields of an Id, joined by '_'."""
    fields = raw_id.split('_')
    return fields[0] + '_' + fields[1]


# Build the shortened Id
df_X_test["Id"] = df_X_test["Id"].apply(_station_and_day)

# Derive 'date' (second field of the Id) and 'number_sta' (first field).
# NOTE(review): both columns hold strings at this point (results of str.split);
# confirm that the helpers / later sorts handle that as intended.
df_X_test["date"] = df_X_test["Id"].apply(lambda short_id: short_id.split('_')[1])
df_X_test["number_sta"] = df_X_test["Id"].apply(lambda short_id: short_id.split('_')[0])

df_X_test = (
    df_X_test
    # ffill and bfill per station and per day
    .pipe(pre_traitement.fill_na_hour, 'date')
    # mean/sum so that there is one value per day and per station
    .pipe(lambda frame: pre_traitement.hour_to_day(df=frame, var="date"))
    # drop the rows whose Id is not in the baseline
    .pipe(lambda frame: frame[frame.Id.isin(baseline['Id'])])
    # add the month variable
    .pipe(pre_traitement.add_month)
    # add the station coordinates
    .pipe(lambda frame: pre_traitement.add_coords(coords, frame))
)
df_X_test.head()

Unnamed: 0,day,number_sta,ff,t,td,hu,dd,precip,Id,month_2,...,month_6,month_7,month_8,month_9,month_10,month_11,month_12,lat,lon,height_sta
0,0,14066001,3.37,279.474583,277.68375,89.125,206.75,0.2,14066001_0,1,...,0,0,0,0,0,0,0,49.334,-0.431,2.0
1,0,14126001,,279.164583,278.152083,93.520833,,0.5,14126001_0,1,...,0,0,0,0,0,0,0,49.145,0.042,125.0
2,0,14137001,3.388333,279.266667,277.65375,89.908333,197.708333,0.2,14137001_0,1,...,0,0,0,0,0,0,0,49.18,-0.456,67.0
3,0,14216001,2.695,279.235417,277.06375,86.620833,213.583333,0.0,14216001_0,1,...,0,0,0,0,0,0,0,48.928,-0.149,155.0
4,0,14296001,,277.874167,,,,0.0,14296001_0,1,...,0,0,0,0,0,0,0,48.795,-1.037,336.0


## Traitement des données forecast

In [5]:
# Names of the variables present in the forecast data
var = [
    "ws",
    "p3031",
    "u10",
    "v10",
    "t2m",
    "d2m",
    "r",
    "tp",
    "msl",
]

### Modèle arome

In [6]:
# Distance computation for the 2D AROME model
model = '2D_arome'
path = './data_meteonet/train/X_forecast/2D_arome_train/'
K = 5  # number of points chosen
df_distance1 = traitement_forecast.calcul_distance(
    coords,
    path,
    model,
    K,
)

In [7]:
# Add the 2D AROME forecasts to df_train
p = 1  # interpolation parameter

# Sorting by date then station is required before calling traitement_forecast
df_train = df_train.sort_values(by=["date", "number_sta"])

# Rebind df_train to the returned frame, as the other add_prevision cells do,
# instead of storing the result in an unused variable and relying on the
# helper having modified df_train in place.
df_train = traitement_forecast.add_prevision(p, df_distance1, df_train, path, model, var)
df_train.head()

date : 2016-01-01T00:00:00.000000000
date : 2016-01-02T00:00:00.000000000
date : 2016-01-03T00:00:00.000000000
date : 2016-01-04T00:00:00.000000000
date : 2016-01-05T00:00:00.000000000
date : 2016-01-06T00:00:00.000000000
date : 2016-01-07T00:00:00.000000000
date : 2016-01-08T00:00:00.000000000
date : 2016-01-09T00:00:00.000000000
date : 2016-01-10T00:00:00.000000000
date : 2016-01-11T00:00:00.000000000
date : 2016-01-12T00:00:00.000000000
date : 2016-01-13T00:00:00.000000000
date : 2016-01-14T00:00:00.000000000
date : 2016-01-15T00:00:00.000000000
date : 2016-01-16T00:00:00.000000000
date : 2016-01-17T00:00:00.000000000
date : 2016-01-18T00:00:00.000000000
date : 2016-01-19T00:00:00.000000000
date : 2016-01-20T00:00:00.000000000
date : 2016-01-21T00:00:00.000000000
date : 2016-01-22T00:00:00.000000000
date : 2016-01-23T00:00:00.000000000
date : 2016-01-24T00:00:00.000000000
date : 2016-01-25T00:00:00.000000000
date : 2016-01-26T00:00:00.000000000
date : 2016-01-27T00:00:00.000000000
d

date : 2016-08-10T00:00:00.000000000
date : 2016-08-11T00:00:00.000000000
date : 2016-08-12T00:00:00.000000000
date : 2016-08-13T00:00:00.000000000
date : 2016-08-14T00:00:00.000000000
date : 2016-08-15T00:00:00.000000000
date : 2016-08-16T00:00:00.000000000
date : 2016-08-17T00:00:00.000000000
date : 2016-08-18T00:00:00.000000000
date : 2016-08-19T00:00:00.000000000
date : 2016-08-20T00:00:00.000000000
date : 2016-08-21T00:00:00.000000000
date : 2016-08-22T00:00:00.000000000
date : 2016-08-23T00:00:00.000000000
date : 2016-08-24T00:00:00.000000000
date : 2016-08-25T00:00:00.000000000
date : 2016-08-26T00:00:00.000000000
date : 2016-08-27T00:00:00.000000000
date : 2016-08-28T00:00:00.000000000
date : 2016-08-29T00:00:00.000000000
date : 2016-08-30T00:00:00.000000000
date : 2016-08-31T00:00:00.000000000
date : 2016-09-01T00:00:00.000000000
date : 2016-09-02T00:00:00.000000000
date : 2016-09-03T00:00:00.000000000
date : 2016-09-04T00:00:00.000000000
date : 2016-09-05T00:00:00.000000000
d

date : 2017-03-20T00:00:00.000000000
date : 2017-03-21T00:00:00.000000000
date : 2017-03-22T00:00:00.000000000
date : 2017-03-23T00:00:00.000000000
date : 2017-03-24T00:00:00.000000000
date : 2017-03-25T00:00:00.000000000
date : 2017-03-26T00:00:00.000000000
date : 2017-03-27T00:00:00.000000000
date : 2017-03-28T00:00:00.000000000
date : 2017-03-29T00:00:00.000000000
date : 2017-03-30T00:00:00.000000000
date : 2017-03-31T00:00:00.000000000
date : 2017-04-01T00:00:00.000000000
date : 2017-04-02T00:00:00.000000000
date : 2017-04-03T00:00:00.000000000
date : 2017-04-04T00:00:00.000000000
date : 2017-04-05T00:00:00.000000000
date : 2017-04-06T00:00:00.000000000
date : 2017-04-07T00:00:00.000000000
date : 2017-04-08T00:00:00.000000000
date : 2017-04-09T00:00:00.000000000
date : 2017-04-10T00:00:00.000000000
date : 2017-04-11T00:00:00.000000000
date : 2017-04-12T00:00:00.000000000
date : 2017-04-13T00:00:00.000000000
date : 2017-04-14T00:00:00.000000000
date : 2017-04-15T00:00:00.000000000
d

date : 2017-11-01T00:00:00.000000000
date : 2017-11-02T00:00:00.000000000
date : 2017-11-03T00:00:00.000000000
date : 2017-11-04T00:00:00.000000000
date : 2017-11-05T00:00:00.000000000
date : 2017-11-06T00:00:00.000000000
date : 2017-11-07T00:00:00.000000000
date : 2017-11-08T00:00:00.000000000
date : 2017-11-09T00:00:00.000000000
date : 2017-11-10T00:00:00.000000000
date : 2017-11-11T00:00:00.000000000
date : 2017-11-12T00:00:00.000000000
date : 2017-11-13T00:00:00.000000000
date : 2017-11-14T00:00:00.000000000
date : 2017-11-15T00:00:00.000000000
date : 2017-11-16T00:00:00.000000000
date : 2017-11-17T00:00:00.000000000
date : 2017-11-18T00:00:00.000000000
date : 2017-11-19T00:00:00.000000000
date : 2017-11-20T00:00:00.000000000
date : 2017-11-21T00:00:00.000000000
date : 2017-11-22T00:00:00.000000000
date : 2017-11-23T00:00:00.000000000
date : 2017-11-24T00:00:00.000000000
date : 2017-11-25T00:00:00.000000000
date : 2017-11-26T00:00:00.000000000
date : 2017-11-27T00:00:00.000000000
d

Unnamed: 0,date,number_sta,ff,t,td,hu,dd,precip,Id,month_2,...,Ground_truth,forecast_2D_arome_ws,forecast_2D_arome_p3031,forecast_2D_arome_u10,forecast_2D_arome_v10,forecast_2D_arome_t2m,forecast_2D_arome_d2m,forecast_2D_arome_r,forecast_2D_arome_tp,forecast_2D_arome_msl
0,2016-01-01,14066001,3.91375,280.33375,278.514583,88.591667,146.5,0.2,14066001_0_0,0,...,3.4,9.608295,212.92593,4.66075,6.506475,282.986505,279.761738,80.474426,5.795015,100347.302535
1,2016-01-01,14126001,,280.283333,279.552083,95.083333,,0.5,14126001_0_0,0,...,0.5,8.337349,208.335944,3.410915,6.270228,281.776081,279.555447,85.968737,7.135,100490.931728
2,2016-01-01,14137001,5.444583,280.029583,278.72125,91.733333,144.125,0.0,14137001_0_0,0,...,3.4,8.932881,216.908106,4.776775,5.994745,282.374895,279.579423,82.723232,6.175511,100395.5339
3,2016-01-01,14216001,4.759583,280.089583,278.88875,92.1875,158.666667,0.6,14216001_0_0,0,...,4.0,8.957683,213.80795,4.469092,6.585549,281.729176,279.165233,83.938322,8.733481,100519.472299
4,2016-01-01,14296001,,279.301667,,,,0.2,14296001_0_0,0,...,13.3,8.232864,222.485432,5.035392,5.635883,280.183457,278.696856,90.324093,25.706864,100488.348086


In [8]:
# Add the 2D AROME forecasts to X_test
path = './data_meteonet/test/2D_arome/'
p = 1
# Set the model explicitly (as the ARPEGE test cell does) so this cell does not
# depend on whichever value `model` was last given by another cell.
model = '2D_arome'
#df_X_test['date'] = df_X_test['Id'].apply(lambda x: int(x.split('_')[1]))

# Sorting by date then station is required before calling traitement_forecast
df_X_test = df_X_test.sort_values(by=["date", "number_sta"])
df_X_test = traitement_forecast.add_prevision(p,df_distance1,df_X_test, path, model, var = var,bool_train = False)
df_X_test.head()

date : 0
./data_meteonet/test/2D_arome/2D_arome_1.nc
date : 1
./data_meteonet/test/2D_arome/2D_arome_2.nc
date : 2
./data_meteonet/test/2D_arome/2D_arome_3.nc
date : 3
./data_meteonet/test/2D_arome/2D_arome_4.nc
date : 4
./data_meteonet/test/2D_arome/2D_arome_5.nc
date : 5
./data_meteonet/test/2D_arome/2D_arome_6.nc
date : 6
./data_meteonet/test/2D_arome/2D_arome_7.nc
date : 7
./data_meteonet/test/2D_arome/2D_arome_8.nc
date : 8
./data_meteonet/test/2D_arome/2D_arome_9.nc
date : 9
./data_meteonet/test/2D_arome/2D_arome_10.nc
date : 10
./data_meteonet/test/2D_arome/2D_arome_11.nc
date : 11
./data_meteonet/test/2D_arome/2D_arome_12.nc
date : 12
./data_meteonet/test/2D_arome/2D_arome_13.nc
date : 13
./data_meteonet/test/2D_arome/2D_arome_14.nc
date : 14
./data_meteonet/test/2D_arome/2D_arome_15.nc
date : 15
./data_meteonet/test/2D_arome/2D_arome_16.nc
date : 16
./data_meteonet/test/2D_arome/2D_arome_17.nc
date : 17
./data_meteonet/test/2D_arome/2D_arome_18.nc
date : 18
./data_meteonet/tes

date : 148
./data_meteonet/test/2D_arome/2D_arome_149.nc
date : 149
./data_meteonet/test/2D_arome/2D_arome_150.nc
date : 150
./data_meteonet/test/2D_arome/2D_arome_151.nc
date : 151
./data_meteonet/test/2D_arome/2D_arome_152.nc
date : 152
./data_meteonet/test/2D_arome/2D_arome_153.nc
date : 153
./data_meteonet/test/2D_arome/2D_arome_154.nc
date : 154
./data_meteonet/test/2D_arome/2D_arome_155.nc
date : 155
./data_meteonet/test/2D_arome/2D_arome_156.nc
date : 156
./data_meteonet/test/2D_arome/2D_arome_157.nc
date : 157
./data_meteonet/test/2D_arome/2D_arome_158.nc
date : 158
./data_meteonet/test/2D_arome/2D_arome_159.nc
date : 159
./data_meteonet/test/2D_arome/2D_arome_160.nc
date : 160
./data_meteonet/test/2D_arome/2D_arome_161.nc
date : 161
./data_meteonet/test/2D_arome/2D_arome_162.nc
date : 162
./data_meteonet/test/2D_arome/2D_arome_163.nc
date : 163
./data_meteonet/test/2D_arome/2D_arome_164.nc
date : 164
./data_meteonet/test/2D_arome/2D_arome_165.nc
date : 165
./data_meteonet/test

date : 292
./data_meteonet/test/2D_arome/2D_arome_293.nc
date : 293
./data_meteonet/test/2D_arome/2D_arome_294.nc
date : 294
./data_meteonet/test/2D_arome/2D_arome_295.nc
date : 295
./data_meteonet/test/2D_arome/2D_arome_296.nc
date : 296
./data_meteonet/test/2D_arome/2D_arome_297.nc
date : 297
./data_meteonet/test/2D_arome/2D_arome_298.nc
date : 298
./data_meteonet/test/2D_arome/2D_arome_299.nc
date : 299
./data_meteonet/test/2D_arome/2D_arome_300.nc
date : 300
./data_meteonet/test/2D_arome/2D_arome_301.nc
date : 301
./data_meteonet/test/2D_arome/2D_arome_302.nc
date : 302
./data_meteonet/test/2D_arome/2D_arome_303.nc
date : 303
./data_meteonet/test/2D_arome/2D_arome_304.nc
date : 304
./data_meteonet/test/2D_arome/2D_arome_305.nc
date : 305
./data_meteonet/test/2D_arome/2D_arome_306.nc
date : 306
./data_meteonet/test/2D_arome/2D_arome_307.nc
date : 307
./data_meteonet/test/2D_arome/2D_arome_308.nc
date : 308
./data_meteonet/test/2D_arome/2D_arome_309.nc
date : 309
./data_meteonet/test

Unnamed: 0,day,number_sta,ff,t,td,hu,dd,precip,Id,month_2,...,date,forecast_2D_arome_ws,forecast_2D_arome_p3031,forecast_2D_arome_u10,forecast_2D_arome_v10,forecast_2D_arome_t2m,forecast_2D_arome_d2m,forecast_2D_arome_r,forecast_2D_arome_tp,forecast_2D_arome_msl
0,0,14066001,3.37,279.474583,277.68375,89.125,206.75,0.2,14066001_0,1,...,0,4.772604,287.379452,3.583057,-1.795566,291.132735,288.028601,82.545876,0.0,101703.152476
1,0,14126001,,279.164583,278.152083,93.520833,,0.5,14126001_0,1,...,0,2.706289,267.679862,1.61427,-1.398665,290.287671,287.108753,82.625441,0.097439,101708.597627
2,0,14137001,3.388333,279.266667,277.65375,89.908333,197.708333,0.2,14137001_0,1,...,0,3.454726,294.411684,2.41007,-1.587477,291.4557,287.114786,76.896642,0.014582,101709.303434
3,0,14216001,2.695,279.235417,277.06375,86.620833,213.583333,0.0,14216001_0,1,...,0,3.237514,294.943812,2.095295,-1.556012,291.326185,286.437626,74.723927,0.210788,101711.273218
4,0,14296001,,277.874167,,,,0.0,14296001_0,1,...,0,2.799716,294.769656,2.238178,-1.296203,290.326455,286.720757,81.021201,0.010956,101772.700711


### Modèle arpege

In [9]:
# Distance computation for the 2D ARPEGE model
model = '2D_arpege'
path = './data_meteonet/train/X_forecast/2D_arpege_train/'
K = 3
df_distance2 = traitement_forecast.calcul_distance(
    coords,
    path,
    model,
    K,
)

In [10]:
# Add the 2D ARPEGE forecasts to df_train
p = 1

# Sorting by date then station is required before calling traitement_forecast
df_train = df_train.sort_values(by=["date", "number_sta"])
df_train = traitement_forecast.add_prevision(
    p,
    df_distance2,
    df_train,
    path,
    model,
    var,
)
df_train.tail()

date : 2016-01-01T00:00:00.000000000
date : 2016-01-02T00:00:00.000000000
date : 2016-01-03T00:00:00.000000000
date : 2016-01-04T00:00:00.000000000
date : 2016-01-05T00:00:00.000000000
date : 2016-01-06T00:00:00.000000000
date : 2016-01-07T00:00:00.000000000
date : 2016-01-08T00:00:00.000000000
date : 2016-01-09T00:00:00.000000000
date : 2016-01-10T00:00:00.000000000
date : 2016-01-11T00:00:00.000000000
date : 2016-01-12T00:00:00.000000000
date : 2016-01-13T00:00:00.000000000
date : 2016-01-14T00:00:00.000000000
date : 2016-01-15T00:00:00.000000000
date : 2016-01-16T00:00:00.000000000
date : 2016-01-17T00:00:00.000000000
date : 2016-01-18T00:00:00.000000000
date : 2016-01-19T00:00:00.000000000
date : 2016-01-20T00:00:00.000000000
date : 2016-01-21T00:00:00.000000000
date : 2016-01-22T00:00:00.000000000
date : 2016-01-23T00:00:00.000000000
date : 2016-01-24T00:00:00.000000000
date : 2016-01-25T00:00:00.000000000
date : 2016-01-26T00:00:00.000000000
date : 2016-01-27T00:00:00.000000000
d

date : 2016-08-10T00:00:00.000000000
date : 2016-08-11T00:00:00.000000000
date : 2016-08-12T00:00:00.000000000
date : 2016-08-13T00:00:00.000000000
date : 2016-08-14T00:00:00.000000000
date : 2016-08-15T00:00:00.000000000
date : 2016-08-16T00:00:00.000000000
date : 2016-08-17T00:00:00.000000000
date : 2016-08-18T00:00:00.000000000
date : 2016-08-19T00:00:00.000000000
date : 2016-08-20T00:00:00.000000000
date : 2016-08-21T00:00:00.000000000
date : 2016-08-22T00:00:00.000000000
date : 2016-08-23T00:00:00.000000000
date : 2016-08-24T00:00:00.000000000
date : 2016-08-25T00:00:00.000000000
date : 2016-08-26T00:00:00.000000000
date : 2016-08-27T00:00:00.000000000
date : 2016-08-28T00:00:00.000000000
date : 2016-08-29T00:00:00.000000000
date : 2016-08-30T00:00:00.000000000
date : 2016-08-31T00:00:00.000000000
date : 2016-09-01T00:00:00.000000000
date : 2016-09-02T00:00:00.000000000
date : 2016-09-03T00:00:00.000000000
date : 2016-09-04T00:00:00.000000000
date : 2016-09-05T00:00:00.000000000
d

date : 2017-03-20T00:00:00.000000000
date : 2017-03-21T00:00:00.000000000
date : 2017-03-22T00:00:00.000000000
date : 2017-03-23T00:00:00.000000000
date : 2017-03-24T00:00:00.000000000
date : 2017-03-25T00:00:00.000000000
date : 2017-03-26T00:00:00.000000000
date : 2017-03-27T00:00:00.000000000
date : 2017-03-28T00:00:00.000000000
date : 2017-03-29T00:00:00.000000000
date : 2017-03-30T00:00:00.000000000
date : 2017-03-31T00:00:00.000000000
date : 2017-04-01T00:00:00.000000000
date : 2017-04-02T00:00:00.000000000
date : 2017-04-03T00:00:00.000000000
date : 2017-04-04T00:00:00.000000000
date : 2017-04-05T00:00:00.000000000
date : 2017-04-06T00:00:00.000000000
date : 2017-04-07T00:00:00.000000000
date : 2017-04-08T00:00:00.000000000
date : 2017-04-09T00:00:00.000000000
date : 2017-04-10T00:00:00.000000000
date : 2017-04-11T00:00:00.000000000
date : 2017-04-12T00:00:00.000000000
date : 2017-04-13T00:00:00.000000000
date : 2017-04-14T00:00:00.000000000
date : 2017-04-15T00:00:00.000000000
d

date : 2017-11-01T00:00:00.000000000
date : 2017-11-02T00:00:00.000000000
date : 2017-11-03T00:00:00.000000000
date : 2017-11-04T00:00:00.000000000
date : 2017-11-05T00:00:00.000000000
date : 2017-11-06T00:00:00.000000000
date : 2017-11-07T00:00:00.000000000
date : 2017-11-08T00:00:00.000000000
date : 2017-11-09T00:00:00.000000000
date : 2017-11-10T00:00:00.000000000
date : 2017-11-11T00:00:00.000000000
date : 2017-11-12T00:00:00.000000000
date : 2017-11-13T00:00:00.000000000
date : 2017-11-14T00:00:00.000000000
date : 2017-11-15T00:00:00.000000000
date : 2017-11-16T00:00:00.000000000
date : 2017-11-17T00:00:00.000000000
date : 2017-11-18T00:00:00.000000000
date : 2017-11-19T00:00:00.000000000
date : 2017-11-20T00:00:00.000000000
date : 2017-11-21T00:00:00.000000000
date : 2017-11-22T00:00:00.000000000
date : 2017-11-23T00:00:00.000000000
date : 2017-11-24T00:00:00.000000000
date : 2017-11-25T00:00:00.000000000
date : 2017-11-26T00:00:00.000000000
date : 2017-11-27T00:00:00.000000000
d

Unnamed: 0,date,number_sta,ff,t,td,hu,dd,precip,Id,month_2,...,forecast_2D_arome_msl,forecast_2D_arpege_ws,forecast_2D_arpege_p3031,forecast_2D_arpege_u10,forecast_2D_arpege_v10,forecast_2D_arpege_t2m,forecast_2D_arpege_d2m,forecast_2D_arpege_r,forecast_2D_arpege_tp,forecast_2D_arpege_msl
183742,2017-12-30,86137003,4.028333,286.465417,284.495417,88.025,221.166667,3.4,86137003_729_0,0,...,101165.904888,9.252916,219.655643,5.56725,6.655889,284.048556,281.279492,83.074523,2.052794,101153.218439
183743,2017-12-30,86165005,5.950833,285.988333,284.020833,87.8875,239.291667,5.6,86165005_729_0,0,...,101387.611321,8.578518,213.89588,4.652563,6.648792,283.658364,281.074614,84.240508,2.841173,101391.056611
183744,2017-12-30,86272002,,286.11875,,,,5.6,86272002_729_0,0,...,101262.104974,8.281766,218.348579,4.921711,6.114034,283.815923,281.42137,85.18987,1.89849,101246.519678
183745,2017-12-30,91200002,4.2125,285.455,284.124167,91.675,229.708333,5.6,91200002_729_0,0,...,100778.423065,9.707421,218.647633,5.881056,7.448153,283.378643,281.557296,88.539244,2.670354,100772.481539
183746,2017-12-30,95690001,8.529583,285.330833,283.395417,88.175,230.958333,5.4,95690001_729_0,0,...,100558.002879,9.368429,219.243324,5.771374,7.184695,283.656741,281.453239,86.277629,4.561992,100538.999917


In [11]:
# Add the 2D ARPEGE forecasts to X_test
path = './data_meteonet/test/2D_arpege/'
p = 1
model = '2D_arpege'

# The other add_prevision cells sort by date then station first and mark that
# ordering as indispensable; apply it here too so this cell does not depend on
# the row order left behind by an earlier cell.
df_X_test = df_X_test.sort_values(by=["date", "number_sta"])
df_X_test = traitement_forecast.add_prevision(p,df_distance2,df_X_test, path, model, var,bool_train = False)
df_X_test.tail()

date : 0
./data_meteonet/test/2D_arpege/2D_arpege_1.nc
date : 1
./data_meteonet/test/2D_arpege/2D_arpege_2.nc
date : 2
./data_meteonet/test/2D_arpege/2D_arpege_3.nc
date : 3
./data_meteonet/test/2D_arpege/2D_arpege_4.nc
date : 4
./data_meteonet/test/2D_arpege/2D_arpege_5.nc
date : 5
./data_meteonet/test/2D_arpege/2D_arpege_6.nc
date : 6
./data_meteonet/test/2D_arpege/2D_arpege_7.nc
date : 7
./data_meteonet/test/2D_arpege/2D_arpege_8.nc
date : 8
./data_meteonet/test/2D_arpege/2D_arpege_9.nc
date : 9
./data_meteonet/test/2D_arpege/2D_arpege_10.nc
date : 10
./data_meteonet/test/2D_arpege/2D_arpege_11.nc
date : 11
./data_meteonet/test/2D_arpege/2D_arpege_12.nc
date : 12
./data_meteonet/test/2D_arpege/2D_arpege_13.nc
date : 13
./data_meteonet/test/2D_arpege/2D_arpege_14.nc
date : 14
./data_meteonet/test/2D_arpege/2D_arpege_15.nc
date : 15
./data_meteonet/test/2D_arpege/2D_arpege_16.nc
date : 16
./data_meteonet/test/2D_arpege/2D_arpege_17.nc
date : 17
./data_meteonet/test/2D_arpege/2D_arpege

date : 143
./data_meteonet/test/2D_arpege/2D_arpege_144.nc
date : 144
./data_meteonet/test/2D_arpege/2D_arpege_145.nc
date : 145
./data_meteonet/test/2D_arpege/2D_arpege_146.nc
date : 146
./data_meteonet/test/2D_arpege/2D_arpege_147.nc
date : 147
./data_meteonet/test/2D_arpege/2D_arpege_148.nc
date : 148
./data_meteonet/test/2D_arpege/2D_arpege_149.nc
date : 149
./data_meteonet/test/2D_arpege/2D_arpege_150.nc
date : 150
./data_meteonet/test/2D_arpege/2D_arpege_151.nc
date : 151
./data_meteonet/test/2D_arpege/2D_arpege_152.nc
date : 152
./data_meteonet/test/2D_arpege/2D_arpege_153.nc
date : 153
./data_meteonet/test/2D_arpege/2D_arpege_154.nc
date : 154
./data_meteonet/test/2D_arpege/2D_arpege_155.nc
date : 155
./data_meteonet/test/2D_arpege/2D_arpege_156.nc
date : 156
./data_meteonet/test/2D_arpege/2D_arpege_157.nc
date : 157
./data_meteonet/test/2D_arpege/2D_arpege_158.nc
date : 158
./data_meteonet/test/2D_arpege/2D_arpege_159.nc
date : 159
./data_meteonet/test/2D_arpege/2D_arpege_160.

date : 282
./data_meteonet/test/2D_arpege/2D_arpege_283.nc
date : 283
./data_meteonet/test/2D_arpege/2D_arpege_284.nc
date : 284
./data_meteonet/test/2D_arpege/2D_arpege_285.nc
date : 285
./data_meteonet/test/2D_arpege/2D_arpege_286.nc
date : 286
./data_meteonet/test/2D_arpege/2D_arpege_287.nc
date : 287
./data_meteonet/test/2D_arpege/2D_arpege_288.nc
date : 288
./data_meteonet/test/2D_arpege/2D_arpege_289.nc
date : 289
./data_meteonet/test/2D_arpege/2D_arpege_290.nc
date : 290
./data_meteonet/test/2D_arpege/2D_arpege_291.nc
date : 291
./data_meteonet/test/2D_arpege/2D_arpege_292.nc
date : 292
./data_meteonet/test/2D_arpege/2D_arpege_293.nc
date : 293
./data_meteonet/test/2D_arpege/2D_arpege_294.nc
date : 294
./data_meteonet/test/2D_arpege/2D_arpege_295.nc
date : 295
./data_meteonet/test/2D_arpege/2D_arpege_296.nc
date : 296
./data_meteonet/test/2D_arpege/2D_arpege_297.nc
date : 297
./data_meteonet/test/2D_arpege/2D_arpege_298.nc
date : 298
./data_meteonet/test/2D_arpege/2D_arpege_299.

Unnamed: 0,day,number_sta,ff,t,td,hu,dd,precip,Id,month_2,...,forecast_2D_arome_msl,forecast_2D_arpege_ws,forecast_2D_arpege_p3031,forecast_2D_arpege_u10,forecast_2D_arpege_v10,forecast_2D_arpege_t2m,forecast_2D_arpege_d2m,forecast_2D_arpege_r,forecast_2D_arpege_tp,forecast_2D_arpege_msl
69203,362,86137003,4.68,279.822083,275.260833,73.958333,162.166667,3.0,86137003_362,0,...,,,,,,,,,,
69204,362,86165005,4.52875,280.80625,274.163333,64.429167,192.916667,1.6,86165005_362,0,...,,,,,,,,,,
69205,362,86272002,,279.885417,,,,5.0,86272002_362,0,...,,,,,,,,,,
69206,362,91200002,2.612917,276.440417,273.216667,80.4,131.625,1.8,91200002_362,0,...,,,,,,,,,,
69207,362,95690001,4.542083,275.627917,271.831667,76.795833,130.166667,0.4,95690001_362,0,...,,,,,,,,,,


### Modèle arpege 3D

In [12]:
# Settings and distance computation for the 3D ARPEGE model
p = 1
K = 3
path = './data_meteonet/train/X_forecast/3D_arpege_train/'
model = 'arpege_3D_height'
df_distance3 = traitement_forecast.calcul_distance(
    coords,
    path,
    model,
    K,
)

In [13]:
# Add the 3D ARPEGE forecasts to df_train
df_train = traitement_forecast.add_prevision_3D(
    p,
    df_distance3,
    df_train,
    path,
    model,
    bool_train=True,
)

date : 2016-01-01T00:00:00.000000000
date : 2016-01-02T00:00:00.000000000
date : 2016-01-03T00:00:00.000000000
date : 2016-01-04T00:00:00.000000000
date : 2016-01-05T00:00:00.000000000
date : 2016-01-06T00:00:00.000000000
date : 2016-01-07T00:00:00.000000000
date : 2016-01-08T00:00:00.000000000
date : 2016-01-09T00:00:00.000000000
date : 2016-01-10T00:00:00.000000000
date : 2016-01-11T00:00:00.000000000
date : 2016-01-12T00:00:00.000000000
date : 2016-01-13T00:00:00.000000000
date : 2016-01-14T00:00:00.000000000
date : 2016-01-15T00:00:00.000000000
date : 2016-01-16T00:00:00.000000000
date : 2016-01-17T00:00:00.000000000
date : 2016-01-18T00:00:00.000000000
date : 2016-01-19T00:00:00.000000000
date : 2016-01-20T00:00:00.000000000
date : 2016-01-21T00:00:00.000000000
date : 2016-01-22T00:00:00.000000000
date : 2016-01-23T00:00:00.000000000
date : 2016-01-24T00:00:00.000000000
date : 2016-01-25T00:00:00.000000000
date : 2016-01-26T00:00:00.000000000
date : 2016-01-27T00:00:00.000000000
d

date : 2016-08-10T00:00:00.000000000
date : 2016-08-11T00:00:00.000000000
date : 2016-08-12T00:00:00.000000000
date : 2016-08-13T00:00:00.000000000
date : 2016-08-14T00:00:00.000000000
date : 2016-08-15T00:00:00.000000000
date : 2016-08-16T00:00:00.000000000
date : 2016-08-17T00:00:00.000000000
date : 2016-08-18T00:00:00.000000000
date : 2016-08-19T00:00:00.000000000
date : 2016-08-20T00:00:00.000000000
date : 2016-08-21T00:00:00.000000000
date : 2016-08-22T00:00:00.000000000
date : 2016-08-23T00:00:00.000000000
date : 2016-08-24T00:00:00.000000000
date : 2016-08-25T00:00:00.000000000
date : 2016-08-26T00:00:00.000000000
date : 2016-08-27T00:00:00.000000000
date : 2016-08-28T00:00:00.000000000
date : 2016-08-29T00:00:00.000000000
date : 2016-08-30T00:00:00.000000000
date : 2016-08-31T00:00:00.000000000
date : 2016-09-01T00:00:00.000000000
date : 2016-09-02T00:00:00.000000000
date : 2016-09-03T00:00:00.000000000
date : 2016-09-04T00:00:00.000000000
date : 2016-09-05T00:00:00.000000000
d

date : 2017-03-20T00:00:00.000000000
date : 2017-03-21T00:00:00.000000000
date : 2017-03-22T00:00:00.000000000
date : 2017-03-23T00:00:00.000000000
date : 2017-03-24T00:00:00.000000000
date : 2017-03-25T00:00:00.000000000
date : 2017-03-26T00:00:00.000000000
date : 2017-03-27T00:00:00.000000000
date : 2017-03-28T00:00:00.000000000
date : 2017-03-29T00:00:00.000000000
date : 2017-03-30T00:00:00.000000000
date : 2017-03-31T00:00:00.000000000
date : 2017-04-01T00:00:00.000000000
date : 2017-04-02T00:00:00.000000000
date : 2017-04-03T00:00:00.000000000
date : 2017-04-04T00:00:00.000000000
date : 2017-04-05T00:00:00.000000000
date : 2017-04-06T00:00:00.000000000
date : 2017-04-07T00:00:00.000000000
date : 2017-04-08T00:00:00.000000000
date : 2017-04-09T00:00:00.000000000
date : 2017-04-10T00:00:00.000000000
date : 2017-04-11T00:00:00.000000000
date : 2017-04-12T00:00:00.000000000
date : 2017-04-13T00:00:00.000000000
date : 2017-04-14T00:00:00.000000000
date : 2017-04-15T00:00:00.000000000
d

date : 2017-10-28T00:00:00.000000000
date : 2017-10-29T00:00:00.000000000
date : 2017-10-30T00:00:00.000000000
date : 2017-10-31T00:00:00.000000000
date : 2017-11-01T00:00:00.000000000
date : 2017-11-02T00:00:00.000000000
date : 2017-11-03T00:00:00.000000000
date : 2017-11-04T00:00:00.000000000
date : 2017-11-05T00:00:00.000000000
date : 2017-11-06T00:00:00.000000000
date : 2017-11-07T00:00:00.000000000
date : 2017-11-08T00:00:00.000000000
date : 2017-11-09T00:00:00.000000000
date : 2017-11-10T00:00:00.000000000
date : 2017-11-11T00:00:00.000000000
date : 2017-11-12T00:00:00.000000000
date : 2017-11-13T00:00:00.000000000
date : 2017-11-14T00:00:00.000000000
date : 2017-11-15T00:00:00.000000000
date : 2017-11-16T00:00:00.000000000
date : 2017-11-17T00:00:00.000000000
date : 2017-11-18T00:00:00.000000000
date : 2017-11-19T00:00:00.000000000
date : 2017-11-20T00:00:00.000000000
date : 2017-11-21T00:00:00.000000000
date : 2017-11-22T00:00:00.000000000
date : 2017-11-23T00:00:00.000000000
d

In [14]:
# Add the 3D ARPEGE forecasts to X_test (p and model come from the 3D setup cell)
path = './data_meteonet/test/3D_arpege/'
df_X_test = traitement_forecast.add_prevision_3D(
    p,
    df_distance3,
    df_X_test,
    path,
    model,
    bool_train=False,
)

date : 0
./data_meteonet/test/3D_arpege/arpege_3D_height_1.nc
date : 1
./data_meteonet/test/3D_arpege/arpege_3D_height_2.nc
date : 2
./data_meteonet/test/3D_arpege/arpege_3D_height_3.nc
date : 3
./data_meteonet/test/3D_arpege/arpege_3D_height_4.nc
date : 4
./data_meteonet/test/3D_arpege/arpege_3D_height_5.nc
date : 5
./data_meteonet/test/3D_arpege/arpege_3D_height_6.nc
date : 6
./data_meteonet/test/3D_arpege/arpege_3D_height_7.nc
date : 7
./data_meteonet/test/3D_arpege/arpege_3D_height_8.nc
date : 8
./data_meteonet/test/3D_arpege/arpege_3D_height_9.nc
date : 9
./data_meteonet/test/3D_arpege/arpege_3D_height_10.nc
date : 10
./data_meteonet/test/3D_arpege/arpege_3D_height_11.nc
date : 11
./data_meteonet/test/3D_arpege/arpege_3D_height_12.nc
date : 12
./data_meteonet/test/3D_arpege/arpege_3D_height_13.nc
date : 13
./data_meteonet/test/3D_arpege/arpege_3D_height_14.nc
date : 14
./data_meteonet/test/3D_arpege/arpege_3D_height_15.nc
date : 15
./data_meteonet/test/3D_arpege/arpege_3D_height_1

date : 128
./data_meteonet/test/3D_arpege/arpege_3D_height_129.nc
date : 129
./data_meteonet/test/3D_arpege/arpege_3D_height_130.nc
date : 130
./data_meteonet/test/3D_arpege/arpege_3D_height_131.nc
date : 131
./data_meteonet/test/3D_arpege/arpege_3D_height_132.nc
date : 132
./data_meteonet/test/3D_arpege/arpege_3D_height_133.nc
date : 133
./data_meteonet/test/3D_arpege/arpege_3D_height_134.nc
date : 134
./data_meteonet/test/3D_arpege/arpege_3D_height_135.nc
date : 135
./data_meteonet/test/3D_arpege/arpege_3D_height_136.nc
date : 136
./data_meteonet/test/3D_arpege/arpege_3D_height_137.nc
erreur
date : 137
./data_meteonet/test/3D_arpege/arpege_3D_height_138.nc
date : 138
./data_meteonet/test/3D_arpege/arpege_3D_height_139.nc
date : 139
./data_meteonet/test/3D_arpege/arpege_3D_height_140.nc
date : 140
./data_meteonet/test/3D_arpege/arpege_3D_height_141.nc
date : 141
./data_meteonet/test/3D_arpege/arpege_3D_height_142.nc
date : 142
./data_meteonet/test/3D_arpege/arpege_3D_height_143.nc
dat

date : 253
./data_meteonet/test/3D_arpege/arpege_3D_height_254.nc
date : 254
./data_meteonet/test/3D_arpege/arpege_3D_height_255.nc
date : 255
./data_meteonet/test/3D_arpege/arpege_3D_height_256.nc
date : 256
./data_meteonet/test/3D_arpege/arpege_3D_height_257.nc
date : 257
./data_meteonet/test/3D_arpege/arpege_3D_height_258.nc
date : 258
./data_meteonet/test/3D_arpege/arpege_3D_height_259.nc
date : 259
./data_meteonet/test/3D_arpege/arpege_3D_height_260.nc
date : 260
./data_meteonet/test/3D_arpege/arpege_3D_height_261.nc
date : 261
./data_meteonet/test/3D_arpege/arpege_3D_height_262.nc
date : 262
./data_meteonet/test/3D_arpege/arpege_3D_height_263.nc
date : 263
./data_meteonet/test/3D_arpege/arpege_3D_height_264.nc
date : 264
./data_meteonet/test/3D_arpege/arpege_3D_height_265.nc
date : 265
./data_meteonet/test/3D_arpege/arpege_3D_height_266.nc
date : 266
./data_meteonet/test/3D_arpege/arpege_3D_height_267.nc
date : 267
./data_meteonet/test/3D_arpege/arpege_3D_height_268.nc
date : 268

In [15]:
# Write the train and test frames to CSV (file names mark them as still
# containing NaN); the index is not written.
for frame, csv_name in (
    (df_train, "df_train_complet_avec_nan.csv"),
    (df_X_test, "df_X_test_complet_avec_nan.csv"),
):
    frame.to_csv(csv_name, index=False)

### Traitement des NaN dans les données forecast

In [12]:
# Reload the train and test frames from their CSV files so that the NaN
# handling below can start from disk.
df_train = pd.read_csv("df_train_complet_avec_nan.csv")
df_X_test = pd.read_csv("df_X_test_complet_avec_nan.csv")

In [14]:
# Summary statistics of the numeric columns of the test frame; the `count`
# row gives the number of non-missing values per column.
df_X_test.describe()

Unnamed: 0,day,number_sta,ff,t,td,hu,dd,precip,month_2,month_3,...,forecast_2D_arpege_r,forecast_2D_arpege_tp,forecast_2D_arpege_msl,forecast_arpege_3D_height_20,forecast_arpege_3D_height_100,forecast_arpege_3D_height_500,forecast_arpege_3D_height_875,forecast_arpege_3D_height_1375,forecast_arpege_3D_height_2000,forecast_arpege_3D_height_3000
count,85140.0,85140.0,47067.0,85078.0,55263.0,55272.0,47061.0,85140.0,85140.0,85140.0,...,84216.0,84434.0,84434.0,84697.0,84697.0,84697.0,84697.0,84697.0,84697.0,84697.0
mean,181.376368,49407770.0,3.727269,285.412636,281.482136,79.560229,179.075554,2.211344,0.073949,0.083051,...,78.602467,2.418337,101598.909738,100108.528477,99155.253732,94519.565851,90344.49927,85017.858974,78733.681867,69499.345232
std,104.684171,21916180.0,1.970027,6.068282,4.956047,11.931719,73.384732,4.538714,0.261689,0.275962,...,14.848274,4.333288,940.991605,1216.898129,1207.095166,1165.981092,1137.363891,1114.908952,1107.754739,1119.682311
min,0.0,14047000.0,0.0,265.922083,226.035833,2.005208,0.0,0.0,0.0,0.0,...,27.699097,0.0,97895.091108,93826.321702,92917.524766,88527.31923,84599.207316,79596.280314,73576.8903,64707.245064
25%,91.0,29263000.0,2.355,281.018438,278.191458,72.307292,122.791667,0.0,0.0,0.0,...,69.047057,0.005952,101091.483231,99381.289326,98435.568079,93827.606847,89668.46801,84347.601164,78054.624324,68780.505991
50%,182.0,49020000.0,3.288333,284.974167,281.725833,81.45,189.166667,0.2,0.0,0.0,...,82.215505,0.390117,101693.030136,100175.6607,99225.715243,94603.799703,90438.99939,85123.693274,78848.424106,69624.532862
75%,272.0,72086000.0,4.643333,290.026667,285.262083,88.567708,235.666667,2.2,0.0,0.0,...,90.397741,3.133457,102169.786457,100937.44503,99979.726057,95322.749835,91137.63195,85806.324498,79530.56275,70316.859762
max,362.0,95690000.0,19.580417,305.84375,295.593333,100.0,350.75,73.6,1.0,1.0,...,99.915835,64.374146,104152.288853,103800.91419,102789.630665,97849.083563,93417.817007,87920.888349,81597.469941,72332.366235


In [14]:
# Missing-value report for the training frame: one line per column with the
# number of NaN and the percentage of rows it represents.
n_rows = df_train.shape[0]
for col in df_train.columns:
    # Series.sum() counts the True flags in a single vectorised call; the
    # builtin sum() would iterate the Series element by element in Python.
    n_nan = df_train[col].isnull().sum()
    print(col, " : ", n_nan, " NaN = ", n_nan/n_rows*100, "%")

date  :  0  NaN =  0.0 %
number_sta  :  0  NaN =  0.0 %
ff  :  69049  NaN =  42.5947059658127 %
t  :  156  NaN =  0.09623273516874657 %
td  :  48618  NaN =  29.99130204124436 %
hu  :  48544  NaN =  29.945653179689955 %
dd  :  69099  NaN =  42.625549791187304 %
precip  :  0  NaN =  0.0 %
Id  :  0  NaN =  0.0 %
month_2  :  0  NaN =  0.0 %
month_3  :  0  NaN =  0.0 %
month_4  :  0  NaN =  0.0 %
month_5  :  0  NaN =  0.0 %
month_6  :  0  NaN =  0.0 %
month_7  :  0  NaN =  0.0 %
month_8  :  0  NaN =  0.0 %
month_9  :  0  NaN =  0.0 %
month_10  :  0  NaN =  0.0 %
month_11  :  0  NaN =  0.0 %
month_12  :  0  NaN =  0.0 %
lat  :  0  NaN =  0.0 %
lon  :  0  NaN =  0.0 %
height_sta  :  0  NaN =  0.0 %
Ground_truth  :  0  NaN =  0.0 %
forecast_2D_arome_ws  :  8102  NaN =  4.997933463699901 %
forecast_2D_arome_p3031  :  8102  NaN =  4.997933463699901 %
forecast_2D_arome_u10  :  8102  NaN =  4.997933463699901 %
forecast_2D_arome_v10  :  8102  NaN =  4.997933463699901 %
forecast_2D_arome_t2m  :  675

In [15]:
# Missing-value report for the test frame: one line per column with the
# number of NaN and the percentage of rows it represents.
n_rows = df_X_test.shape[0]
for col in df_X_test.columns:
    # Series.sum() counts the True flags in a single vectorised call; the
    # builtin sum() would iterate the Series element by element in Python.
    n_nan = df_X_test[col].isnull().sum()
    print(col, " : ", n_nan, " NaN = ", n_nan/n_rows*100, "%")

day  :  0  NaN =  0.0 %
number_sta  :  0  NaN =  0.0 %
ff  :  38073  NaN =  44.718111346018325 %
t  :  62  NaN =  0.0728212356119333 %
td  :  29877  NaN =  35.09161381254404 %
hu  :  29868  NaN =  35.08104298801973 %
dd  :  38079  NaN =  44.72515856236787 %
precip  :  0  NaN =  0.0 %
Id  :  0  NaN =  0.0 %
month_2  :  0  NaN =  0.0 %
month_3  :  0  NaN =  0.0 %
month_4  :  0  NaN =  0.0 %
month_5  :  0  NaN =  0.0 %
month_6  :  0  NaN =  0.0 %
month_7  :  0  NaN =  0.0 %
month_8  :  0  NaN =  0.0 %
month_9  :  0  NaN =  0.0 %
month_10  :  0  NaN =  0.0 %
month_11  :  0  NaN =  0.0 %
month_12  :  0  NaN =  0.0 %
lat  :  0  NaN =  0.0 %
lon  :  0  NaN =  0.0 %
height_sta  :  0  NaN =  0.0 %
date  :  0  NaN =  0.0 %
forecast_2D_arome_ws  :  7140  NaN =  8.386187455954898 %
forecast_2D_arome_p3031  :  7140  NaN =  8.386187455954898 %
forecast_2D_arome_u10  :  7140  NaN =  8.386187455954898 %
forecast_2D_arome_v10  :  6669  NaN =  7.8329809725158555 %
forecast_2D_arome_t2m  :  5455  NaN =  

**Méthode 1 : MissForest**

In [20]:
#%pip install scikit-learn



In [17]:
# Register the private module `sklearn.neighbors._base` in `sys.modules` under
# the additional name `sklearn.neighbors.base`, so that an import of that name
# resolves to it.
# NOTE(review): presumably required because missingpy still imports the old
# `sklearn.neighbors.base` path — confirm against the installed versions. If
# so, the alias must stay before the `missingpy` import.
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

In [19]:
# Training set: fill the NaN with MissForest.
# The identifier columns are set aside here and re-attached in the next cell.
# NOTE(review): Ground_truth is not dropped, so the target is among the columns
# fed to the imputer — confirm this is intended.
df_train2 = df_train.drop(columns=["date", "Id", "number_sta"])
# Fixed seed so that re-running the notebook reproduces the same imputed values.
imputer = MissForest(random_state=42)
X_imputed = imputer.fit_transform(df_train2)

In [24]:
# Wrap the imputed array in a DataFrame with the original column names.
# Reusing the index of df_train2 keeps the column assignments below row-aligned
# with df_train even if its index is not the default 0..n-1 range.
df1 = pd.DataFrame(X_imputed, columns=df_train2.columns, index=df_train2.index)
# Re-attach the identifier columns that were excluded from the imputation.
for id_col in ("date", "Id", "number_sta"):
    df1[id_col] = df_train[id_col]
df1.to_csv("df_train_nan_by_missforest.csv", index = False)

In [28]:
# Test set: fill the NaN with MissForest.
# The identifier columns are set aside here and re-attached in the next cell.
# NOTE(review): a second imputer is fitted on the test frame itself rather than
# reusing the one fitted on the training frame — confirm this is intended.
df_test2 = df_X_test.drop(columns=["date", "Id", "number_sta"])
# Fixed seed so that re-running the notebook reproduces the same imputed values.
imputer2 = MissForest(random_state=42)
X_imputed2 = imputer2.fit_transform(df_test2)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8


In [30]:
# Wrap the imputed array in a DataFrame with the original column names.
# Reusing the index of df_test2 keeps the column assignments below row-aligned
# with df_X_test even if its index is not the default 0..n-1 range.
df2 = pd.DataFrame(X_imputed2, columns=df_test2.columns, index=df_test2.index)
# Re-attach the identifier columns that were excluded from the imputation.
for id_col in ("date", "Id", "number_sta"):
    df2[id_col] = df_X_test[id_col]
df2.to_csv("df_X_test_nan_by_missforest.csv", index = False)

In [50]:
# Writes df2 to the same CSV file as the cell above.
# NOTE(review): the execution counts suggest this cell (In [50]) was run after
# the month_* cast cell further down (In [47]), i.e. to re-save df2 with
# integer month columns — confirm, and consider moving it after that cell.
df2.to_csv("df_X_test_nan_by_missforest.csv", index = False)

In [48]:
# Per-column non-null counts and dtypes of the imputed test frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85140 entries, 0 to 85139
Data columns (total 49 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   day                             85140 non-null  int32  
 1   ff                              85140 non-null  float64
 2   t                               85140 non-null  float64
 3   td                              85140 non-null  float64
 4   hu                              85140 non-null  float64
 5   dd                              85140 non-null  float64
 6   precip                          85140 non-null  float64
 7   month_2                         85140 non-null  int32  
 8   month_3                         85140 non-null  int32  
 9   month_4                         85140 non-null  int32  
 10  month_5                         85140 non-null  int32  
 11  month_6                         85140 non-null  int32  
 12  month_7                         

In [47]:
# Cast every month indicator column (month_2 ... month_12) of df2 to an integer
# dtype in a single run, instead of editing `i` by hand and re-running the cell
# once per month.
# NOTE(review): these columns are presumably float after the imputation step —
# confirm.
for i in map(str, range(2, 13)):
    df2["month_"+i] = df2["month_"+i].astype("int")