1. sort by time, split by time, but we have multiple rows for one single time
2. integrate stationary data with time series model
3. interpolate / backcast nan

to see logging results, go to the directory of this folder and run `mlflow ui` in terminal. the server will be at http://127.0.0.1:5000.

In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np

# Mlflow
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import xgboost as xgb
from xgboost import plot_importance, plot_tree

# Model Evaluiation
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
df1 = pd.read_csv('../../final_dfs/for_modeling/df_final_date_wide.csv')
df2 = pd.read_csv('../../final_dfs/for_modeling/df_final_stationery.csv')

In [3]:
df1.shape

(11228, 40)

In [4]:
df2.shape

(401, 179)

In [5]:
df1.head(2)

Unnamed: 0,ags2,ags5,date,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,number_of_companies_construction,number_of_companies_domestic_staff,number_of_companies_economic_services,...,employees_social_security_at_residence,employees_social_security_at_residenceemployees_social_security_at_work,realized_short_time_work_companies,realized_short_time_work_people,registerd_jobs,underemployment_without_short_time _work,unemployed,unemployment_benefit_entitled,unemployment_benefit_recipients,unemployment_rate
0,1,1001,2019-01-01,34.0,14.0,108.0,131.0,264.0,0.0,130.0,...,,,,,819.0,6166.0,4275.0,11452.0,1155.0,8.2
1,1,1001,2019-02-01,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,...,,,,,864.0,6054.0,4065.0,11506.0,1176.0,7.8


### prelim data cleaning

In [6]:
df1.isna().sum()

ags2                                                                          0
ags5                                                                          0
date                                                                          0
number_of_companies_administration                                            0
number_of_companies_agriculture                                               0
number_of_companies_arts_entertainment                                        0
number_of_companies_communication                                             0
number_of_companies_construction                                              0
number_of_companies_domestic_staff                                            0
number_of_companies_economic_services                                         0
number_of_companies_education                                                 0
number_of_companies_energy                                                    0
number_of_companies_extraterritorial    

In [7]:
#df1 = df1.replace({'0':np.nan, 0:np.nan})

In [8]:
#df1.isna().sum()

In [9]:
#df1.columns

In [10]:
na_cols = ['displayed_short_time_work_companies',
       'displayed_short_time_work_people',
       'employees_social_security_at_residence',
       'employees_social_security_at_residenceemployees_social_security_at_work',
       'realized_short_time_work_companies', 'realized_short_time_work_people',
       'underemployment_without_short_time _work',
       'unemployment_benefit_entitled',
       'unemployment_benefit_recipients']

In [11]:
df1[na_cols] = df1[na_cols].replace({'0':np.nan, 0:np.nan})

In [12]:
df1.isna().sum()

ags2                                                                          0
ags5                                                                          0
date                                                                          0
number_of_companies_administration                                            0
number_of_companies_agriculture                                               0
number_of_companies_arts_entertainment                                        0
number_of_companies_communication                                             0
number_of_companies_construction                                              0
number_of_companies_domestic_staff                                            0
number_of_companies_economic_services                                         0
number_of_companies_education                                                 0
number_of_companies_energy                                                    0
number_of_companies_extraterritorial    

In [13]:
#df1 = df1.dropna()

converting data to datetime and extracting time to fit in model

In [14]:
df1.dtypes['date'] #object

dtype('O')

In [15]:
df1['date'] = pd.to_datetime(df1['date'], format='%Y-%m-%d', errors='ignore')

In [16]:
df1.dtypes['date'] #datetime64[ns]

dtype('<M8[ns]')

In [17]:
# sort date to split by time
df1 = df1.sort_values(by=['date', 'ags5'])

In [18]:
df1 = df1.reset_index(drop=True)

In [19]:
df1['year'] = df1['date'].dt.year
df1['month'] = df1['date'].dt.month

In [20]:
#df1.dtypes

### training function

In [21]:
def train_xgb(X_train, X_test, y_train, y_test, params, run_name='xgb_model_run'):
    with mlflow.start_run(run_name=run_name):

        reg = xgb.XGBRegressor(**params)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        
        # evaluation metrics
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        print("Model Run Statistics")
        print(f"RMSE: {rmse}")
        print(f"R2 Score: {r2}")
        
        # parameters
        mlflow.log_params(params)
        mlflow.log_param('X_vars', str(list(X.columns)))
        
        # metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        
        # important features
        imp_features = pd.DataFrame({
            'features':X.columns, 
            'importance':reg.feature_importances_
        })
        imp_features.sort_values(by='importance', ascending=False, inplace=True)
        print(imp_features.head(20))
        
        mlflow.log_param('imp_features', str(list(imp_features.head(20)['features'].values)))

        mlflow.xgboost.log_model(reg, "model")
        
        # Return the model
        return reg

In [22]:
params_1 = {'n_estimators': 500,
            'max_depth':4, 
            'min_samples_split':5,
            'learning_rate': 0.01, 
            'loss':'ls', 
            'verbosity':1}

In [23]:
params_2 = {'n_estimators': 1000,
            'max_depth': 6, 
            'min_samples_split': 5,
            'learning_rate': 0.01, 
            'loss':'ls', 
            'verbosity':1}

### model v1

In [24]:
# Create X and y 
X = df1.drop(na_cols, axis=1)
X = df1.drop(['ags5', 'ags2', 'date', 'unemployment_rate'], axis=1) # do i need to drop "unemployed" as well
y = df1['unemployment_rate']

In [25]:
# split time-series data
train_size = int(len(X)/401*0.8)*401 #so that data is split in time
X_train, X_test = X[0:train_size], X[train_size:len(X)]
y_train, y_test = y[0:train_size], y[train_size:len(X)]

In [26]:
model1_1 = train_xgb(X_train, X_test, y_train, y_test, params_1, run_name='v1.1')
model1_2 = train_xgb(X_train, X_test, y_train, y_test, params_2, run_name='v1.2')

Parameters: { "loss", "min_samples_split" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Model Run Statistics
RMSE: 0.9153806740289593
R2 Score: 0.8317141572548336
                                          features  importance
1                  number_of_companies_agriculture    0.132821
34                   unemployment_benefit_entitled    0.094878
10     number_of_companies_financial_and_insurance    0.068224
2           number_of_companies_arts_entertainment    0.060509
22                     number_of_company_deletions    0.055349
13               number_of_companies_manufacturing    0.051524
17       number_of_companies_repair_motor_vehicles    0.048271
33                                      unemployed    0.047727
24                             number_of_start_ups    0

### model v2
how to backcast?

interpolate data backwards: 
[pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html), 
[scipy](https://docs.scipy.org/doc/scipy/reference/interpolate.html), 
[other methods](https://www.machinelearningplus.com/time-series/time-series-analysis-python/#15howtotreatmissingvaluesinatimeseries)

In [27]:
# ags5 = df1['ags5'].unique()

In [28]:
# def knn_mean(df, col):
#     n = (np.ceil(df.isna().sum()[col]/402)+1)*2
#     print(n)
#     ts = df[col].values
    
#     out = np.copy(ts)
    
#     # need to group by / filter with kreis
#     # need to backcast for each kreis each column
    
#     for i, val in enumerate(ts):
#         if np.isnan(val):
#             n_by_2 = np.ceil(n/2)
#             lower = np.max([0, int(i-n_by_2)])
#             upper = np.min([len(ts)+1, int(i+n_by_2)])
#             ts_near = np.concatenate([ts[lower:i], ts[i:upper]])
#             out[i] = np.nanmean(ts_near)
#     return out

In [29]:
# # Create X and y 
# X = df1.drop(['ags5', 'ags2', 'date', 'unemployment_rate'], axis=1) # do i need to drop "unemployed" as well
# y = df1['unemployment_rate']

In [30]:
# df1.isna().sum()

In [31]:
# #X['dstwc_knn'] = 
# knn_mean(df1, 'displayed_short_time_work_companies')

In [44]:
df1[df1['ags5']==1001][['date', 'displayed_short_time_work_companies']]

Unnamed: 0,date,displayed_short_time_work_companies
0,2019-01-01,
401,2019-02-01,
802,2019-03-01,
1203,2019-04-01,
1604,2019-05-01,
2005,2019-06-01,
2406,2019-07-01,
2807,2019-08-01,
3208,2019-09-01,
3609,2019-10-01,
