In [1]:
# LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
# from pandas_profiling import ProfileReport
# import statsmodels
# from skforecast.ForecasterAutoreg import ForecasterAutoreg
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


import warnings
warnings.filterwarnings('ignore')
import matplotlib
%matplotlib inline


random_state = 123
np.random.seed(random_state)

In [2]:
raw_train_data = pd.read_csv('data/train_data.csv', parse_dates=['ofd_date'])
raw_test_data = pd.read_csv('data/test.csv', parse_dates=['ofd_date'])
raw_train_data.head()

Unnamed: 0,ofd_date,country_code,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline
0,2021-06-30,C,"F6, F8, F14, F17",D33,14594,14568,782,896,615,767,35,2,4
1,2021-06-30,C,"F6, F8, F9, F14, F17, F18",D37,12736,13111,655,823,211,29,17,2,1
2,2021-06-30,C,"F1, F4, F6, F7, F13, F15, F16",D34,14562,15651,1028,1910,225,35,47,3,1
3,2021-06-30,C,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,11165,11467,514,769,56,39,29,0,1
4,2021-06-30,C,"F6, F8, F13, F14, F17",D50,10006,10423,399,842,52,60,65,1,1


In [3]:
raw_train_data.shape

(11309, 13)

In [4]:
raw_test_data.shape

(2389, 12)

In [5]:
raw_train_data.groupby(["country_code"])["fc_codes"].nunique()

country_code
A     9
B    11
C    20
D    20
Name: fc_codes, dtype: int64

Create a target variable 
Transform country_code and station_code
Find number of unique fc codes per country 



In [6]:
def drop_useless(data):
    '''Drops unneed variables.'''
    
#     data = data.drop(['OFD'],axis=1)
    return data


def create_target(data):
    '''Creates target variable (Earlies_Exp - MNR_SNR_Exp) and drops those variables.'''
    
    data['target'] = data['Earlies_Exp'] - data['MNR_SNR_Exp']
    data = data.drop(['Earlies_Exp','MNR_SNR_Exp'], axis=1)
    return data


def apply_preprocessing(data, train=True):
    '''Takes all basic preprocessing functions and applies them in one function. Set train to false for test set as no target can be created.'''

    data = drop_useless(data)
    if train:
        data = create_target(data)
    return data

In [7]:
train_data = apply_preprocessing(raw_train_data)
test_data = apply_preprocessing(raw_test_data, train=False)

test_data = test_data[::-1].reset_index().drop(['Unnamed: 0', 'index'],axis=1)
train_data = train_data[::-1].reset_index().drop('index',axis=1)

In [8]:
# Dummy Variables 
def modify_data(df):
#     data = pd.get_dummies(df, prefix = ['Country', 'Station_code'], columns=['country_code', 'station_code'])
    df.drop(['fc_codes', 'ofd_date', 'country_code', 'station_code'], axis=1, inplace=True)
    return df

# def scale_data(df): 
    

train_data = modify_data(train_data)
test_data = modify_data(test_data)

In [9]:
cols_scale = train_data.columns.drop('target').to_list()

In [10]:
dummy = train_data.copy()

In [16]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
scaler = StandardScaler()
dummy[cols_scale] = scaler.fit_transform(train_data[cols_scale])
dummy.head()

Unnamed: 0,OFD,Slam,Earlies_Rec,Rollover,Returns,R_Sideline,Sideline,target
0,0.343307,0.404794,0.85969,-0.143894,-0.119326,0.35088,-0.357465,249
1,0.09745,0.06264,-0.52598,-0.160482,-0.165453,0.149811,-0.359525,165
2,-0.491012,-0.511619,-0.750905,-0.278254,-0.330193,-0.091472,-0.361585,99
3,0.389448,0.388086,-0.340458,-0.218539,-0.181927,0.133725,-0.359525,235
4,-0.011204,-0.038984,-0.44389,-0.26996,0.108014,-0.334766,-0.359525,425


In [17]:
from pycaret.regression import *

In [18]:
setup_kwargs = dict(
    
    n_jobs=-1, # for parallel processing 
    use_gpu=False, 
    session_id=123, # similar to random state
    
    preprocess = False,
    data_split_shuffle = False,
    normalize=False,
#     imputation_type='iterative',
    normalize_method = 'minmax', 
    fold_strategy = 'timeseries', 
    fold = 5, 
)

In [19]:
train_data.shape[0]*0.7

7916.299999999999

In [20]:
train_data['target']

0        249
1        165
2         99
3        235
4        425
        ... 
11304    347
11305    458
11306    803
11307    444
11308    167
Name: target, Length: 11309, dtype: int64

In [21]:
_ = setup(data = dummy,  target = 'target', **setup_kwargs)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,target
2,Original Data,"(11309, 8)"
3,Missing Values,False
4,Numeric Features,7
5,Categorical Features,0
6,Transformed Train Set,"(7916, 7)"
7,Transformed Test Set,"(3393, 7)"
8,Shuffle Train-Test,False
9,Stratify Train-Test,False


In [66]:
import pycaret 

In [141]:
get_config('y_train')

0       249.0
1       165.0
2        99.0
3       235.0
4       425.0
        ...  
7911     43.0
7912    434.0
7913    273.0
7914    218.0
7915    597.0
Name: target, Length: 7916, dtype: float32

In [22]:
best_models = compare_models(sort='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,270.5389,228501.4676,470.6134,0.3658,1.0656,2.8074,0.218
llar,Lasso Least Angle Regression,297.4558,237287.0384,476.8913,0.3444,1.0647,3.0465,0.012
et,Extra Trees Regressor,289.7448,254360.2059,495.8445,0.2955,1.071,2.6889,0.376
en,Elastic Net,321.1543,254573.4808,502.7069,0.2876,1.1182,3.1217,0.012
knn,K Neighbors Regressor,319.1017,272095.1824,518.0157,0.2423,1.1126,2.7606,0.046
rf,Random Forest Regressor,304.6609,285844.0561,526.9439,0.21,1.0801,2.7401,0.608
omp,Orthogonal Matching Pursuit,355.9247,321059.3507,565.1498,0.0925,1.2104,3.2138,0.02
gbr,Gradient Boosting Regressor,327.2591,531722.8633,675.8937,-0.4526,1.0935,2.9294,0.304
ada,AdaBoost Regressor,499.5121,534464.0849,727.8991,-0.5177,1.4421,4.5718,0.106
ridge,Ridge Regression,245.4935,1029087.2611,733.9757,-1.8711,0.9396,1.966,0.36


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If y

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


  "X does not have valid feature names, but"
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the 

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), OrthogonalMatchingPursuit())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [143]:
tune_model(best_models, optimize="RMSE")

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,269.3358,170332.7041,412.7138,0.4713,0.8229,1.6813
1,270.3851,205135.6869,452.9191,0.4257,0.9184,1.8218
2,367.8023,362272.692,601.8909,0.1099,1.2388,3.8257
3,274.4616,211466.664,459.855,0.3614,1.0575,2.7613
4,347.4359,338184.9584,581.5367,0.0636,1.2915,3.5416
Mean,305.8841,257478.5411,501.7831,0.2864,1.0658,2.7264
SD,42.7639,77390.7023,75.4469,0.1673,0.1798,0.87


HuberRegressor(alpha=0.5, epsilon=1.4, fit_intercept=True, max_iter=100,
               tol=1e-05, warm_start=False)