In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found beneath the current directory.
all_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('./')
    for name in names
)
for path in all_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./casia-train-model.ipynb
./README.md
./prise_en_main/open-y-and-baselines.ipynb
./prise_en_main/compute-baseline-obs.ipynb
./prise_en_main/open-x-forecast-3d-train.ipynb
./prise_en_main/open-x-station.ipynb
./prise_en_main/open-x-forecast-2d-train.ipynb
./prise_en_main/open-mask-data.ipynb
./prise_en_main/open-x-forecast-3d-test.ipynb
./prise_en_main/open-x-forecast-2d-test.ipynb
./prise_en_main/.ipynb_checkpoints/open-x-station-checkpoint.ipynb
./prise_en_main/.ipynb_checkpoints/open-y-and-baselines-checkpoint.ipynb
./prise_en_main/.ipynb_checkpoints/compute-baseline-obs-checkpoint.ipynb
./prise_en_main/.ipynb_checkpoints/open-x-forecast-3d-train-checkpoint.ipynb
./data/.gitignore
./data/Other/Other/stations_coordinates.csv
./data/Other/Other/NW_masks.nc
./data/Test/Test/Id_month_test.csv
./data/Test/Test/X_station_test.csv
./data/Test/Test/X_forecast/2D_arpege_119.nc
./data/Test/Test/X_forecast/arpege_3D_isobar_118.nc
./data/Test/Test/X_forecast/2D_arome_119.nc
./data/Test/Test/X_fo

### load X_train

In [130]:
# Inclusive first and last calendar days of the study period; the day-index
# calendar is derived from these two values.
first_date = datetime.datetime(year=2016, month=1, day=1)
last_date = datetime.datetime(year=2017, month=12, day=31)

# Input files, relative to the notebook's working directory.
fname = './data/Train/Train/X_station_train.csv'
coords_fname = './data/Other/Other/stations_coordinates.csv'

In [128]:
def load_X_train(fname,coords_fname) :
    """Build the daily, per-station training feature table from raw observations.

    The observation CSV is aggregated to one row per station and per day
    (mean/min/max/std of ff, t, td, hu and dd, plus the daily sum of precip).
    The previous day's rainfall total of the same station is added as
    ``baseline_obs``, and each row receives an ``Id`` of the form
    ``<number_sta>_<day_index>`` together with ``month`` and ``day`` features.

    The day-index calendar is built from the module-level ``first_date`` and
    ``last_date``; day index 0 is the day after ``first_date``.

    Parameters
    ----------
    fname : str
        Path of the station observation CSV. It must provide the columns
        ``number_sta``, ``date``, ``ff``, ``t``, ``td``, ``hu``, ``dd`` and
        ``precip``.
    coords_fname : str
        Path of the station coordinates CSV. Kept so existing calls keep
        working; the coordinates are not used by this function.

    Returns
    -------
    pandas.DataFrame
        The aggregated features, ``baseline_obs``, ``Id``, ``month`` and
        ``day``. Rows for ``last_date`` are excluded.
    """
    train = pd.read_csv(fname,parse_dates=['date'],infer_datetime_format=True)
    train['number_sta']=train['number_sta'].astype('category')
    # Sort by station, then by date.
    train = train.sort_values(['number_sta','date'])
    method_traitements = ["mean","min","max","std"]
    dict_to_split = {"ff":method_traitements,"t":method_traitements,"td":method_traitements
                     ,"hu":method_traitements,"dd":method_traitements,"precip":"sum"}
    # Select with a list: a set has no defined order and pandas >= 2.0 rejects
    # set indexers with a TypeError.
    Base_obs = train[["number_sta","date","ff","t","td","hu","dd","precip"]].copy()
    Base_obs.set_index('date',inplace = True)

    # Daily aggregation per station; the result has two-level column labels
    # (variable, statistic).
    Base_obs_type = Base_obs.groupby('number_sta').resample('D').agg(dict_to_split)
    Base_obs_type = Base_obs_type.reset_index(['date','number_sta'])
    Base_obs_type['number_sta'] = Base_obs_type['number_sta'].astype('category')
    # Rainfall observed the day before, taken within each station. A plain
    # frame-wide shift would hand the last total of one station to the first
    # day of the next one.
    station_key = Base_obs_type[("number_sta", "")]
    Base_obs_type['baseline_obs'] = (
        Base_obs_type[("precip", "sum")].groupby(station_key, observed=True).shift(1)
    )
    Base_obs_type = Base_obs_type.sort_values(by=["number_sta","date"])
    del Base_obs_type["precip"]

    # Calendar of consecutive days: first_date + 1 day up to last_date included.
    date = first_date
    dates = []
    while date <= (last_date - datetime.timedelta(days=1)):
        date += datetime.timedelta(days=1)
        dates.append(date)

    d_dates =  pd.DataFrame(dates, columns = ['date'])
    d_dates['day_index'] = d_dates.index
    # NOTE(review): this joins a frame with two-level column labels to one with
    # single-level labels. The tuple / string labels used below depend on how
    # the installed pandas flattens that result - confirm when upgrading pandas.
    y_f = Base_obs_type.merge(d_dates,how="right",on = ["date"])
    y_f = y_f[y_f['date']!=last_date]
    # The second column is looked up by position; presumably it is the station
    # number - verify against the merged column order.
    y_f['Id'] = y_f[list(y_f.columns)[1]].astype(str) + '_' + \
                      y_f['day_index'].astype(str)
    X_train = y_f.drop(["day_index",("number_sta","")],axis=1)
    X_train["month"] = X_train[("date","")].dt.month
    X_train["day"] = X_train[("date","")].dt.day
    X_train.drop(["date",("date","")],axis=1,inplace=True)
    return X_train

In [131]:
# Build the training feature table from the station observation file.
train = load_X_train(fname=fname, coords_fname=coords_fname)

### Load X_test

In [187]:
# Raw test-set station observations and the station coordinates table.
coords_fname_test = './data/Other/Other/stations_coordinates.csv'
fname_test = "./data/Test/Test/X_station_test.csv"

test = pd.read_csv(fname_test)
coords = pd.read_csv(coords_fname_test)

In [188]:
# Keep only the part of each Id that precedes its last underscore.
test["Id"] = test["Id"].str.rsplit("_", n=1).str[0]

In [189]:
# Discard the columns that are not aggregated below.
test = test.drop(columns=["precip", "month"])

In [190]:
# Statistics computed for each measured variable when aggregating per Id.
method_traitements = ["mean", "min", "max", "std"]
dict_to_split = {
    variable: method_traitements
    for variable in ("ff", "t", "td", "hu", "dd")
}

In [191]:
# Store Id as a categorical column before it is used as the grouping key.
test["Id"] = test["Id"].astype("category")

In [192]:
# Keep the grouping key and the five variables to aggregate. The columns are
# selected with a list: a set has no defined order and pandas >= 2.0 rejects
# set indexers with a TypeError.
Base_obs = test[["Id","ff","t","td","hu","dd"]].copy()

In [193]:
# Aggregate every variable per Id with the statistics listed in dict_to_split,
# then order the rows by Id.
Base_obs_type = (
    Base_obs
    .groupby('Id')
    .agg(dict_to_split)
    .sort_values(by=["Id"])
)

In [194]:
# From here on `test` is the aggregated table (one row per Id), held as an
# independent copy of Base_obs_type.
test = Base_obs_type.copy()

In [195]:
# Collapse the (variable, statistic) column labels into single strings such as
# "ff_mean".
test.columns = [
    "_".join(label_parts).strip()
    for label_parts in test.columns.to_flat_index()
]

In [196]:
# Turn the Id index level back into an ordinary column.
test = test.reset_index(level='Id')

In [197]:
# Baseline file for the test set; it is joined to the features on "Id" below.
base_test = pd.read_csv("./data/Test/Test/Baselines/Baseline_observation_test.csv")

In [200]:
# Right join: keep every Id listed in the baseline file, with or without
# aggregated station features.
test = pd.merge(test, base_test, how="right", on=["Id"])

In [202]:
# Split each Id on "_" into its two parts, stored as the new columns
# number_sta and day_index (both still strings here).
test[["number_sta","day_index"]] = test["Id"].str.split("_",expand=True)

In [203]:
# Convert the station number from string to integer before it is used as the
# join key with the coordinates table.
test["number_sta"] = test["number_sta"].astype(int)

In [204]:
# Right join on the station number: every station of the coordinates table is
# kept, including those without any test row.
test = pd.merge(test, coords, how='right', on="number_sta")

In [206]:
# Calendar of consecutive days running from the day after first_date up to
# last_date included; the row position of a day is its day_index.
n_days = (last_date - first_date).days
dates = [
    first_date + datetime.timedelta(days=offset)
    for offset in range(1, n_days + 1)
]
# `date` holds the last generated day (or first_date when the range is empty).
date = dates[-1] if dates else first_date

d_dates = pd.DataFrame({'date': dates})
d_dates['day_index'] = d_dates.index

In [207]:
# Replace the full date by its month and day-of-month components.
d_dates = (
    d_dates
    .assign(month=d_dates["date"].dt.month, day=d_dates["date"].dt.day)
    .drop(columns="date")
)

In [208]:
# Remove the rows that carry no day index.
test = test.dropna(subset=["day_index"])

In [210]:
# Convert day_index to integer so it matches the integer day_index of d_dates.
test["day_index"] = test["day_index"].astype(int)

In [211]:
# Attach the calendar features (month, day) to every test row via its day index.
# The result is assigned back: DataFrame.merge returns a new frame, so without
# the assignment `test` would never receive the month / day columns.
# how="right" also produces all-NaN rows for day indices with no test row.
test = test.merge(d_dates,on=["day_index"],how="right")
test

Unnamed: 0,Id,ff_mean,ff_min,ff_max,ff_std,t_mean,t_min,t_max,t_std,td_mean,...,dd_max,dd_std,Prediction,number_sta,day_index,lat,lon,height_sta,month,day
0,78140001_0,,,,,,,,,,...,,,0.2,78140001.0,0,48.964,1.925,42.0,1,2
1,86272002_0,,,,,281.354167,280.25,283.65,1.077428,,...,,,0.0,86272002.0,0,46.839,0.457,120.0,1,2
2,14577003_0,,,,,278.689583,274.40,284.35,3.365844,277.572917,...,,,0.5,14577003.0,0,49.280,-0.564,15.0,1,2
3,14126001_0,,,,,279.164583,276.20,283.35,2.375394,278.152083,...,,,0.5,14126001.0,0,49.145,0.042,125.0,1,2
4,14066001_0,3.37,2.15,4.87,0.695764,279.474583,276.28,284.57,2.634342,277.683750,...,244.0,18.66699,0.2,14066001.0,0,49.334,-0.431,2.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85502,,,,,,,,,,,...,,,,,725,,,,12,27
85503,,,,,,,,,,,...,,,,,726,,,,12,28
85504,,,,,,,,,,,...,,,,,727,,,,12,29
85505,,,,,,,,,,,...,,,,,728,,,,12,30


In [213]:
# Keep only the rows that have a latitude.
test = test.loc[test["lat"].notna()]

### Load Y

In [132]:
# Load the training targets.
fname = './data/Train/Train/Y_train.csv'
param = 'Ground_truth'  #weather parameter name in the file ('Ground_truth' about Y and 'Prediction' about baseline)
ytrain= pd.read_csv(fname, parse_dates=['date'], infer_datetime_format=True)
# Cast the column of `ytrain` itself; this notebook defines no `Y_train`
# variable, so reading from it raises NameError on a fresh kernel.
ytrain['number_sta'] = ytrain['number_sta'].astype('category')

In [133]:
# Exclude the targets dated last_date.
ytrain = ytrain.loc[ytrain["date"].ne(last_date)].copy()

In [134]:
# Keep only rows that are complete on both sides and share an index label.
# NOTE(review): features and targets are paired through their row index labels
# (`isin` on `.index`), not through a shared key. This presumably assumes that
# `train` and `ytrain` list the same station/day rows in the same order -
# TODO confirm, or pair them on an explicit key instead.
X_full = train.dropna()
Y_full = ytrain[ytrain.index.isin(X_full.index)].dropna()
X_full =X_full[X_full.index.isin(Y_full.index)].dropna()
# Plain NumPy inputs for scikit-learn: every column except the string Id.
Ximp = X_full.drop("Id",axis=1).values
Yimp = np.array(Y_full["Ground_truth"])

In [135]:
from sklearn.model_selection import train_test_split

In [136]:
# Hold out 10 % of the complete rows for evaluation; the fixed seed keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    Ximp, Yimp, test_size=0.10, random_state=42)

### Application du modèle.

In [21]:
from sklearn.impute import KNNImputer

In [22]:
### Fit a KNN imputer for filling in missing feature values.

# `train` is the feature frame that still carries the "Id" column. `X_train`
# is a NumPy array once the train/test split has run, and arrays have no
# `.drop` method.
X = train.drop("Id",axis=1).values
imputer = KNNImputer(n_neighbors=2)
imputer.fit(X)

KNNImputer(n_neighbors=2)

### Fonction MAPE

In [12]:
# Small hand-made prediction / truth vectors for trying the metric out.
yp = np.array([1,2,3,5,6])
yt = np.array([1.2,2.6,3.7,5.8,6.2])

def MAPE(yp,yt) :
    """Return the mean absolute percentage error of ``yp`` against ``yt``, in percent.

    Each term is ``|yt[i] - yp[i]| / (yt[i] + 1)``; with the ``+ 1`` the
    denominator equals 1 for a true value of 0.

    Parameters
    ----------
    yp : sequence of numbers
        Predicted values; its length sets the number of terms.
    yt : sequence of numbers
        True values, indexed at the same positions as ``yp``.

    Returns
    -------
    float
        The mean of the terms multiplied by 100.

    Raises
    ------
    ZeroDivisionError
        If ``yp`` is empty.
    """
    n_items = len(yp)
    total = sum(
        abs((yt[k] - yp[k]) / (yt[k] + 1))
        for k in range(n_items)
    )
    return total * (100 / n_items)

### Linear regression

In [68]:
from sklearn.linear_model import LinearRegression

In [141]:
# Ordinary least-squares model fitted on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)

In [142]:
# Coefficient of determination (R^2) of the linear model on the training split.
reg.score(X_train, y_train)

0.23240890767167122

In [143]:
# MAPE of the linear model on the training split (predictions first, truth second).
MAPE(reg.predict(X_train),y_train)

104.6640436654025

In [144]:
# MAPE of the linear model on the held-out split.
MAPE(reg.predict(X_test),y_test)

104.24008913023096

In [133]:
# (number of training rows, number of feature columns)
X_train.shape

(58097, 23)

### RandomForestRegression

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [137]:
# Random forest fitted on the training split. The seed is fixed (same value as
# the train/test split) so the scores below are reproducible across runs.
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train)

RandomForestRegressor()

In [138]:
# Coefficient of determination (R^2) of the random forest on the training split.
regr.score(X_train,y_train)

0.9329040473108308

In [139]:
# MAPE of the random forest on the training split (predictions first, truth second).
MAPE(regr.predict(X_train),y_train)

24.012247772028854

In [140]:
# MAPE of the random forest on the held-out split.
MAPE(regr.predict(X_test),y_test)

63.326865301446084