In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import datetime
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./casia-train-model.ipynb
./README.md
./data/.gitignore
./data/Train/Train/Y_train.csv
./data/Train/Train/X_station_train.csv
./data/Train/Train/X_forecast/2D_arpege_20170214.nc
./data/Train/Train/X_forecast/arpege_3D_height_20170214.nc
./data/Train/Train/X_forecast/2D_arome_20170214.nc
./data/Train/Train/X_forecast/arpege_3D_isobar_20170214.nc
./data/Train/Train/Baselines/Baseline_forecast_train.csv
./data/Train/Train/Baselines/Baseline_observation_train.csv
./data/Test/Test/Id_month_test.csv
./data/Test/Test/X_station_test.csv
./data/Test/Test/X_forecast/arpege_3D_isobar_118.nc
./data/Test/Test/X_forecast/2D_arpege_119.nc
./data/Test/Test/X_forecast/arpege_3D_height_119.nc
./data/Test/Test/X_forecast/2D_arome_119.nc
./data/Test/Test/Baselines/Baseline_forecast_test.csv
./data/Test/Test/Baselines/Baseline_observation_test.csv
./data/Other/Other/NW_masks.nc
./data/Other/Other/stations_coordinates.csv
./.git/COMMIT_EDITMSG
./.git/config
./.git/description
./.git/packed-refs
./.git/index

In [64]:
### Fonctions utiles :
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

### load X_train

In [65]:
first_date = datetime.datetime(2016,1,1)    
last_date = datetime.datetime(2017,12,31)
coords_fname  = './data/Other/Other/stations_coordinates.csv'
fname = './data/Train/Train/X_station_train.csv'

In [68]:
# Cette fonction ne prend en compte que les données présentes dans Xstation
def load_X_train(fname,coords_fname) :
    coords = pd.read_csv(coords_fname)
    train = pd.read_csv(fname,parse_dates=['date'],infer_datetime_format=True)
    # reorganisation des données.
    train['number_sta']=train['number_sta'].astype('category')
    #sort by station, then by date 
    train = train.sort_values(['number_sta','date'])
    method_traitements = ["mean","min","max","std",percentile(15),percentile(25),percentile(50),percentile(75),percentile(85)]
    dict_to_split = {"ff":method_traitements,"t":method_traitements,"td":method_traitements
                     ,"hu":method_traitements,"dd":method_traitements,"precip":"sum"}
    Base_obs = train[{"number_sta","date","ff","t","td","hu","dd","precip"}].copy()
    Base_obs.set_index('date',inplace = True)  

    # type de traitement.
    Base_obs_type = Base_obs.groupby('number_sta').resample('D').agg(dict_to_split)
    Base_obs_type.columns = ['_'.join(col).strip() for col in Base_obs_type.columns.values]
    Base_obs_type = Base_obs_type.reset_index(['date','number_sta'])
    Base_obs_type['number_sta'] = Base_obs_type['number_sta'].astype('category')   
    #Select the observations the day before

    Base_obs_type['Prediction'] = Base_obs_type['precip_sum'].shift(1)
    Base_obs_type = Base_obs_type.sort_values(by=["number_sta","date"])
    #del Base_obs['precip']
    #Base_obs = Base_obs.rename(columns={'baseline_obs':'precip'})
    del Base_obs_type["precip_sum"]
    date = first_date
    dates = []
    while date <= (last_date - datetime.timedelta(days=1)):
        date += datetime.timedelta(days=1)
        dates.append(date)

    d_dates =  pd.DataFrame(dates, columns = ['date'])
    d_dates['day_index'] = d_dates.index
    y_f = Base_obs_type.merge(d_dates,how="right",on = ["date"])
    y_f = y_f[y_f['date']!=last_date]
    y_f['Id'] = y_f[list(y_f.columns)[0]].astype(str) + '_' + \
                      y_f['day_index'].astype(str) 
    #y_f["number_sta"] = y_f[("number_sta","")]
    y_f = y_f.merge(coords,on=["number_sta"],how="right")
    X_train = y_f.copy()#drop(["day_index"],axis=1)
    X_train["month"] = X_train["date"].dt.month
    X_train["day"] = X_train["date"].dt.day
    X_train.drop(["date"],axis=1,inplace=True)
    return X_train

In [69]:
train = load_X_train(fname,coords_fname)

In [106]:
train_forecast = pd.read_csv("./data/Train/Train/Baselines/Baseline_forecast_train.csv")

In [None]:
train_forecast.rename(columns={"Prediction":"Forecasted"},inplace = True)
train_forecast.drop(["date","number_sta"],axis = 1 ,inplace = True)

In [114]:
value_list = list(train["Id"])
boolean_series = train_forecast.Id.isin(value_list)
train_forecast = train_forecast[boolean_series]

In [117]:
train = train.merge(train_forecast,on=["Id"],how="right")

### Load Y_test

In [70]:
fname_test = "./data/Test/Test/X_station_test.csv"
coords_fname_test = './data/Other/Other/stations_coordinates.csv'
coords = pd.read_csv(coords_fname_test)
def load_X_test(fname_test,coords_fname_test) :
    test = pd.read_csv(fname_test)
    test["Id"] =test["Id"].str.rsplit("_",n=1,expand=True)[0]
    test.drop(["precip","month"],axis = 1,inplace = True)
    method_traitements = ["mean","min","max","std",percentile(15),percentile(25),percentile(50),percentile(75),percentile(85)]
    dict_to_split = {"ff":method_traitements,"t":method_traitements,"td":method_traitements
                     ,"hu":method_traitements,"dd":method_traitements}
    test["Id"] = test["Id"].astype("category")
    Base_obs = test[{"Id","ff","t","td","hu","dd"}].copy()
    # type de traitement.
    Base_obs_type = Base_obs.groupby('Id').agg(dict_to_split)
    Base_obs_type = Base_obs_type.sort_values(by=["Id"])
    test = Base_obs_type.copy()
    test.columns = ['_'.join(col).strip() for col in test.columns.values]
    test = test.reset_index(['Id'])
    base_test = pd.read_csv("./data/Test/Test/Baselines/Baseline_observation_test.csv")
    test = test.merge(base_test,on=["Id"],how="right")
    test[["number_sta","day_index"]] = test["Id"].str.split("_",expand=True)
    test["number_sta"] = test["number_sta"].astype(int)
    test = test.merge(coords,on="number_sta",how='right')
    date = first_date
    dates = []
    while date <= (last_date - datetime.timedelta(days=1)):
        date += datetime.timedelta(days=1)
        dates.append(date)

    d_dates =  pd.DataFrame(dates, columns = ['date'])
    d_dates['day_index'] = d_dates.index
    d_dates["month"]=d_dates["date"].dt.month
    d_dates["day"] = d_dates["date"].dt.day
    del d_dates["date"]
    test.dropna(subset=["day_index"],inplace=True)
    test["day_index"] = test["day_index"].astype(int)
    test = test.merge(d_dates,on=["day_index"],how="right")
    test = test.dropna(subset=["lat"])
    return test
test = load_X_test(fname_test,coords_fname_test)

### Load Y

In [72]:
fname = './data/Train/Train/Y_train.csv'
param = 'Ground_truth'  #weather parameter name in the file ('Ground_truth' about Y and 'Prediction' about baseline)
ytrain= pd.read_csv(fname, parse_dates=['date'], infer_datetime_format=True)
ytrain['number_sta'] = ytrain['number_sta'].astype('category')

In [73]:
ytrain = ytrain[ytrain["date"]!=last_date].copy()

In [118]:
### Methodologie drop_all_na
X_full = train.dropna()
Y_full = ytrain.dropna(subset=["Ground_truth"])
#X_full =X_full[X_full.index.isin(Y_full.index)].dropna()
X_full = X_full.sort_values("Id")
Y_full = Y_full.sort_values("Id")
value_list = list(X_full["Id"])
boolean_series = Y_full.Id.isin(value_list)
Y_full = Y_full[boolean_series]
value_list = list(Y_full["Id"])
boolean_series = X_full.Id.isin(value_list)
X_full = X_full[boolean_series]

In [None]:
X_full = train.copy()

In [None]:
### Methodologie drop_na from Ytrain
Y_full = ytrain.dropna(subset=["Ground_truth"])

In [None]:
value_list = list(Y_full["Id"])
boolean_series = X_full.Id.isin(value_list)
X_full = X_full[boolean_series]

In [119]:
Ximp = X_full.drop(["Id","number_sta","day_index"],axis=1).values#],"lon","lat","height_sta"
Yimp = np.array(Y_full["Ground_truth"])

In [29]:
### pour l'ensemble test
x_full = test.copy()

In [30]:
ximp = x_full.drop(["Id","number_sta","day_index"],axis=1).values

In [None]:
Xfull = Ximp.copy()

### Application du modèle.

In [31]:
from sklearn.impute import KNNImputer

In [32]:
### réaliser une imputation des données manquantes.
imputer = KNNImputer(n_neighbors=1)
imputer.fit(ximp)

KNNImputer(n_neighbors=1)

In [33]:
ximp = imputer.transform(ximp)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
imp.fit(Ximp)

In [None]:
ximp=imp.transform(ximp)

### Fonction MAPE

In [19]:
def MAPE(yp,yt) :
    taille = len(yp)
    mape = 0
    for i in range(taille) :
        mape += abs((yt[i]-yp[i])/(yt[i]+1))
    mape *= 100/taille
    return mape

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
scaler = StandardScaler()

In [120]:
scaler.fit(Ximp)
Ximp = scaler.transform(Ximp)

In [77]:
from sklearn.model_selection import train_test_split

In [121]:
X_train, X_test, y_train, y_test = train_test_split(
...     Ximp, Yimp, test_size=0.33, random_state=42)

### Linear regression

In [15]:
from sklearn.linear_model import LinearRegression

In [122]:
reg = LinearRegression().fit(X_train, y_train)

In [123]:
reg.score(X_train, y_train)

0.599825757351211

In [124]:
MAPE(reg.predict(X_train),y_train)

53.02318730448354

In [125]:
MAPE(reg.predict(X_test),y_test)

53.289136247602414

### RandomForestRegression

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [126]:
regr = RandomForestRegressor()
regr.fit(X_train, y_train)

RandomForestRegressor()

In [127]:
regr.score(X_train,y_train)

0.952445706605137

In [145]:
MAPE(regr.predict(X_train),y_train)

14.515045100497591

In [128]:
MAPE(regr.predict(X_test),y_test)

38.793523027967865

### SVM

In [41]:
from sklearn import svm

In [None]:
regr_svm = svm.SVR(kernel="linear")
regr_svm.fit(X_train,y_train)

In [None]:
regr_svm.score(X_train,y_train)

In [None]:
MAPE(regr_svm.predict(X_train),y_train)

In [None]:
MAPE(regr_svm.predict(X_test),y_test)

In [None]:
26.714607117563933,27.857640502040667

### Lasso

In [46]:
from sklearn import linear_model

In [133]:
reg_lasso = linear_model.Lasso(alpha=0.1)
reg_lasso.fit(X_train,y_train)

Lasso(alpha=0.1)

In [134]:
reg_lasso.score(X_train,y_train)

0.5929825913168887

In [135]:
MAPE(reg_lasso.predict(X_train),y_train)

49.01571487165535

In [136]:
MAPE(reg_lasso.predict(X_test),y_test)

49.239045847560725

### MLP-NN 

In [95]:
from sklearn.neural_network import MLPRegressor

In [137]:
regr_mlpr = MLPRegressor(random_state=1, max_iter=1000).fit(X_train, y_train)

In [138]:
regr_mlpr.score(X_train,y_train)

0.6979424330891131

In [139]:
MAPE(regr_mlpr.predict(X_train),y_train)

57.027055171108245

In [140]:
MAPE(regr_mlpr.predict(X_test),y_test)

59.27871081848988

### XG-Boost

In [100]:
from sklearn.ensemble import GradientBoostingRegressor

In [141]:
reg_xgBoost = GradientBoostingRegressor()
reg_xgBoost.fit(X_train, y_train)

GradientBoostingRegressor()

In [142]:
reg_xgBoost.score(X_train,y_train)

0.6751925422323366

In [143]:
MAPE(reg_xgBoost.predict(X_train),y_train)

39.46120119176254

In [144]:
MAPE(reg_xgBoost.predict(X_test),y_test)

40.34704834120971

### Partie submission

In [None]:
train[train.index.isin(X_full.index)]["Id"]

In [54]:
y_predicted = list(regr_svm.predict(ximp))
id_predicted = list(test["Id"])

In [55]:
df_submit = pd.DataFrame()
df_submit["Id"] = id_predicted
df_submit["Prediction"] = y_predicted

In [None]:
df_submit["Prediction"] = df.Prediction.apply(lambda s : s if s>=0 else 0 )

In [61]:
df_submit.to_csv('./working/' + "submission.csv",index=False)