<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Importations" data-toc-modified-id="Importations-1">Importations</a></span></li><li><span><a href="#Ajout-de-colonnes-depuis-'date'" data-toc-modified-id="Ajout-de-colonnes-depuis-'date'-2">Ajout de colonnes depuis 'date'</a></span></li><li><span><a href="#Ajout-des-coordonnées" data-toc-modified-id="Ajout-des-coordonnées-3">Ajout des coordonnées</a></span></li><li><span><a href="#Regroupement-des-données" data-toc-modified-id="Regroupement-des-données-4">Regroupement des données</a></span></li><li><span><a href="#DropNA" data-toc-modified-id="DropNA-5">DropNA</a></span></li><li><span><a href="#Modèle" data-toc-modified-id="Modèle-6">Modèle</a></span></li></ul></div>

In [1]:
import numpy.linalg as npl
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as mape

from xgboost import XGBRegressor

# Importations

In [148]:
DATA_X = pd.read_csv("Train/Train/X_station_train.csv")
DATA_Y = pd.read_csv("Train/Train/Y_train.csv")
COORDS = pd.read_csv("Other/Other/stations_coordinates.csv")

In [167]:
train = DATA_X.copy()
y = DATA_Y.copy()
coords = COORDS.copy()

# Ajout de colonnes depuis 'date'

In [168]:
train["month"] = train["date"].replace(to_replace="^.{5}", value="", regex=True)\
                              .replace(to_replace=".{12}$", value="", regex=True)

train["day"] = train["date"].replace(to_replace="^.{8}", value="", regex=True)\
                            .replace(to_replace=".{9}$", value="", regex=True)

train["hour"] = train["date"].replace(to_replace="^.{11}", value="", regex=True)\
                             .replace(to_replace=".{6}$", value="", regex=True)

In [169]:
del train["date"]

In [170]:
train = train.astype({"month": "int8", "day": "int8", "hour": "int8"})

# Ajout des coordonnées

In [171]:
train = train.merge(coords, how="left", on="number_sta")

In [172]:
train = train.astype({"Id": str, 
                      "number_sta": "category", 
                      "month": "category", 
                      "day": "category", 
                      "hour": "category"})

# Regroupement des données

In [173]:
train["Id"].replace(to_replace=".\d+$", value="", regex=True, inplace=True)

In [174]:
train

Unnamed: 0,number_sta,ff,t,td,hu,dd,precip,Id,month,day,hour,lat,lon,height_sta
0,14066001,3.05,279.28,277.97,91.4,200.0,0.0,14066001_0,1,1,0,49.334,-0.431,2.0
1,14066001,2.57,278.76,277.45,91.4,190.0,0.0,14066001_0,1,1,1,49.334,-0.431,2.0
2,14066001,2.26,278.27,277.02,91.7,181.0,0.0,14066001_0,1,1,2,49.334,-0.431,2.0
3,14066001,2.62,277.98,276.95,93.0,159.0,0.0,14066001_0,1,1,3,49.334,-0.431,2.0
4,14066001,2.99,277.32,276.72,95.9,171.0,0.0,14066001_0,1,1,4,49.334,-0.431,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4409469,95690001,9.10,286.68,283.44,80.8,239.0,0.0,95690001_729,12,30,19,49.108,1.831,126.0
4409470,95690001,8.58,286.39,283.21,81.1,231.0,0.0,95690001_729,12,30,20,49.108,1.831,126.0
4409471,95690001,8.74,286.28,283.40,82.6,226.0,0.0,95690001_729,12,30,21,49.108,1.831,126.0
4409472,95690001,9.04,286.21,283.29,82.4,224.0,0.0,95690001_729,12,30,22,49.108,1.831,126.0


In [175]:
train = train.groupby("Id", as_index=False).agg({
    "number_sta": "first",
    "ff": "median",
    "t": "median",
    "td": "median",
    "hu": "median",
    "dd": "median",
    "precip": "sum",
    "month": "first",
    "day": "first",
    "hour": "first",
    "lat": "first",
    "lon": "first",
    "height_sta": "first"
})

In [176]:
train

Unnamed: 0,Id,number_sta,ff,t,td,hu,dd,precip,month,day,hour,lat,lon,height_sta
0,14066001_0,14066001,3.695,281.325,278.625,90.15,142.0,0.2,1,1,0,49.334,-0.431,2.0
1,14066001_1,14066001,7.690,282.870,280.045,83.60,207.0,3.4,1,2,0,49.334,-0.431,2.0
2,14066001_10,14066001,5.340,279.520,277.385,86.90,226.5,6.0,1,11,0,49.334,-0.431,2.0
3,14066001_100,14066001,4.435,283.950,278.005,80.95,133.0,11.6,4,10,0,49.334,-0.431,2.0
4,14066001_101,14066001,1.790,282.720,281.360,89.55,171.5,5.6,4,11,0,49.334,-0.431,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183742,95690001_725,95690001,7.310,279.005,276.850,85.00,202.0,2.4,12,26,0,49.108,1.831,126.0
183743,95690001_726,95690001,8.285,277.110,274.885,88.75,276.5,3.2,12,27,0,49.108,1.831,126.0
183744,95690001_727,95690001,2.875,274.495,273.055,88.70,250.5,0.0,12,28,0,49.108,1.831,126.0
183745,95690001_728,95690001,6.545,278.760,275.480,91.95,210.0,4.4,12,29,0,49.108,1.831,126.0


# DropNA

In [183]:
y_train = y.dropna()

In [184]:
train = train[train["Id"].isin(y_train["Id"])]

In [187]:
y_train = y_train["Ground_truth"]

# Modèle

In [189]:
X = train.drop("Id", axis=1)
Y = y_train

In [190]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.25, random_state=42)

In [191]:
my_xgbr = XGBRegressor()

In [192]:
my_xgbr.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [193]:
pred = my_xgbr.predict(X_test)

In [197]:
mape(pred, Y_test)

1.6908471184219391