1. sort by time, split by time, but we have multiple rows for one single time
2. integrate stationary data with time series model
3. interpolate / backcast nan

to see logging results, go to the directory of this folder and run `mlflow ui` in terminal. the server will be at http://127.0.0.1:5000.

In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import datetime

# Mlflow
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import xgboost as xgb
from xgboost import plot_importance, plot_tree

# Model Evaluiation
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
df1 = pd.read_csv('../../final_dfs/for_modeling/df_final_date_wide_2007.csv', index_col=0)
df2 = pd.read_csv('../../final_dfs/for_modeling/df_final_stationery.csv')

In [3]:
df1.shape

(65363, 40)

In [4]:
df1.head(2)

Unnamed: 0,ags2,ags5,date,number_of_company_deletions,number_of_company_liquidations,number_of_start_ups,number_of_companies_administration,number_of_companies_agriculture,number_of_companies_arts_entertainment,number_of_companies_communication,...,realized_short_time_work_companies,realized_short_time_work_people,underemployment_without_short_time _work,unemployment_benefit_entitled,unemployment_benefit_recipients,unemployment_rate,registerd_jobs,unemployed,employees_social_security_at_work,employees_social_security_at_residence
0,1,1001,2007-05-01,4.0,2.0,21.0,,,,,...,,,,,,12.7,784,5559,38319,24519
1,1,1001,2007-06-01,8.0,6.0,18.0,,,,,...,,,,,,12.2,932,5323,38266,24560


### prelim data cleaning

In [5]:
#df1.isna().sum()

In [6]:
#df1 = df1.replace({'0':np.nan, 0:np.nan})

In [7]:
#df1.isna().sum()

In [8]:
#df1.columns

In [9]:
na_cols = ['number_of_companies_administration', 'number_of_companies_agriculture',
       'number_of_companies_arts_entertainment',
       'number_of_companies_communication', 'number_of_companies_construction',
       'number_of_companies_domestic_staff',
       'number_of_companies_economic_services',
       'number_of_companies_education', 'number_of_companies_energy',
       'number_of_companies_extraterritorial',
       'number_of_companies_financial_and_insurance',
       'number_of_companies_health_and_social_services',
       'number_of_companies_hospitality', 'number_of_companies_manufacturing',
       'number_of_companies_mining', 'number_of_companies_real_estat',
       'number_of_companies_rendering_other_services',
       'number_of_companies_repair_motor_vehicles',
       'number_of_companies_technical_services',
       'number_of_companies_transport', 'number_of_companies_unknown_sector',
       'number_of_companies_water_and_sewage',
       'displayed_short_time_work_companies',
       'displayed_short_time_work_people',
       'realized_short_time_work_companies', 'realized_short_time_work_people',
       'underemployment_without_short_time _work',
       'unemployment_benefit_entitled', 'unemployment_benefit_recipients']

In [10]:
#df1[na_cols] = df1[na_cols].replace({'0':np.nan, 0:np.nan})

In [11]:
#df1.isna().sum()

In [12]:
df1 = df1.dropna(axis='columns')

In [13]:
df1.shape

(65363, 11)

converting data to datetime and extracting time to fit in model

In [14]:
df1.dtypes['date'] #object

dtype('O')

In [15]:
df1['date'] = pd.to_datetime(df1['date'], format='%Y-%m-%d', errors='ignore')

In [16]:
df1.dtypes['date'] #datetime64[ns]

dtype('<M8[ns]')

In [17]:
# sort date to split by time
df1 = df1.sort_values(by=['date', 'ags5'])

In [18]:
df1 = df1.reset_index(drop=True)

In [19]:
df1['year'] = df1['date'].dt.year
df1['month'] = df1['date'].dt.month

In [20]:
#df1.dtypes

### training function

In [21]:
def train_xgb(X_train, X_test, y_train, y_test, params, run_name='xgb_model_run'):
    with mlflow.start_run(run_name=run_name):

        reg = xgb.XGBRegressor(**params)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        
        # evaluation metrics
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        print("Model Run Statistics")
        print(f"RMSE: {rmse}")
        print(f"R2 Score: {r2}")
        
        # parameters
        mlflow.log_params(params)
        mlflow.log_param('X_vars', str(list(X.columns)))
        
        # metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        
        # important features
        imp_features = pd.DataFrame({
            'features':X.columns, 
            'importance':reg.feature_importances_
        })
        imp_features.sort_values(by='importance', ascending=False, inplace=True)
        print(imp_features.head(20))
        
        mlflow.log_param('imp_features', str(list(imp_features.head(20)['features'].values)))

        mlflow.xgboost.log_model(reg, "model")
        
        # Return the model
        return reg

In [22]:
params_1 = {'n_estimators': 500,
            'max_depth':4, 
            'min_samples_split':5,
            'learning_rate': 0.01, 
            'loss':'ls', 
            'verbosity':1}

In [23]:
params_2 = {'n_estimators': 1000,
            'max_depth': 6, 
            'min_samples_split': 5,
            'learning_rate': 0.01, 
            'loss':'ls', 
            'verbosity':1}

### lag variables

In [24]:
alq = df1[df1['ags5']==1001]['unemployment_rate'].shift(-1)

In [25]:
k1001 = df1[df1['ags5']==1001]
k1001['ur_lag'] = alq
k1001.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k1001['ur_lag'] = alq
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k1001.dropna(inplace=True)


In [26]:
# filter to normal time
k1001 = k1001[(k1001['date']>datetime.datetime(2010,1,1)) 
          & (k1001['date']<datetime.datetime(2019,12,31))] 

### model v1

In [27]:
# Create X and y 
#X = df1.drop(na_cols, axis=1)
X = k1001.drop(['ags5', 'ags2', 'date', 'unemployment_rate', 'ur_lag'], axis=1) 
y = k1001['ur_lag']

In [28]:
# split time-series data
train_size = int(len(X)*0.8)
#train_size = int(len(X)/401*0.8)*401 #so that data is split in time
X_train, X_test = X[0:train_size], X[train_size:len(X)]
y_train, y_test = y[0:train_size], y[train_size:len(X)]

In [29]:
model1_1 = train_xgb(X_train, X_test, y_train, y_test, params_1, run_name='v1.1')
model1_2 = train_xgb(X_train, X_test, y_train, y_test, params_2, run_name='v1.2')

Parameters: { "loss", "min_samples_split" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Model Run Statistics
RMSE: 0.7066496047003094
R2 Score: -1.3684758758408067
                                 features  importance
6  employees_social_security_at_residence    0.635686
5       employees_social_security_at_work    0.141658
4                              unemployed    0.096770
3                          registerd_jobs    0.061028
7                                    year    0.031959
8                                   month    0.016881
1          number_of_company_liquidations    0.006235
2                     number_of_start_ups    0.006080
0             number_of_company_deletions    0.003703
Parameters: { "loss", "min_samples_split" } might not be used.

  This may not be