In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## Action required: point `path` at your local checkout of the MA-prediction
## project. Either edit the default below or set the MA_PREDICTION_PATH
## environment variable before starting the kernel.
path = os.environ.get("MA_PREDICTION_PATH", "~/Documents/G3/MA-prediction")
path = expanduser(path)
# Re-running this cell must not keep appending the same entry to sys.path.
if path not in sys.path:
    sys.path.append(path)

import pandas as pd
import numpy as np

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None
import warnings
# Suppress pandas PerformanceWarning and all FutureWarning messages to keep cell output readable.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

import datetime
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# Global plotting defaults: seaborn look, square 9x9 figures, enlarged fonts.
mpl.style.use("seaborn-v0_8")
plt.rcParams.update({
    'figure.figsize': [9, 9],
    'axes.titlesize': 20,
    'axes.labelsize': 20,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'legend.fontsize': 18,
    'font.size': 30,
})

from numpy.random import RandomState
# Fixed seed so stochastic steps are reproducible; RANDOM_STATE is handed to the
# LogisticRegression estimator further down.
RANDOM_STATE = 0
# Seeded legacy NumPy generator; not referenced in the cells visible here.
random_state = RandomState(RANDOM_STATE)


import sklearn
# preprocessing:
# missing values
from sklearn.impute import SimpleImputer
# categorical
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
# scaler
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer

# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# model
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, ElasticNetCV



from sklearn import set_config
# Show estimators/pipelines as interactive diagrams when they are displayed in a cell.
set_config(display='diagram')   


from sklearn import clone
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import ParameterGrid

# Metric name -> scoring function; `scoring` is the list of metric names, in the same order.
scorer_dict = {
    "average_precision": average_precision_score,
    "roc_auc": roc_auc_score,
}
scoring = list(scorer_dict)

In [2]:
from MA_prediction.utils import *
from MA_prediction.preprocessing import *
from MA_prediction.prediction_model import *

# model-2: Prediction Model

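In this notebook we build a prediction model that estimates the probability of each M&A deal outcome. The model is an elastic-net logistic regression whose hyperparameters are selected with a year-by-year time-series cross-validation.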


## load data

In [3]:
# Read the feature-engineered dataset from the project's data folder (`path` is set in the first cell).
filepath = f"{path}/data/prediction-model/df_feature_engineer.h5"
df = pd.read_hdf(filepath)

In [4]:
# predictors we use
# categorical predictors (one-hot encoded later)
col_cat = ['att_new', 'apub_new', 'tend', 'lbo', 'cross', 'term']
# numeric predictors that must be present for a row to be kept
col_num = ['phda', 'psought', 'val_adj_by_cpi', 'premium1day', 'premium2wk', 'premium4wk',
            'arb_spread_4daya']
# numeric predictors that are allowed to be missing (imputed inside the model pipeline)
col_num_na = ['mv_ratio', 'amv']


# Un-retain every row that is missing at least one required numeric predictor.
# Assign through a single .loc call on the frame itself: the previous
# `df.retain[mask] = False` was chained assignment, which pandas does not
# guarantee to write back to `df` (and which is a silent no-op under copy-on-write).
missing_required = df[col_num].isna().any(axis=1)
df.loc[missing_required, 'retain'] = False

# extract y
# status code -> numeric label
# NOTE(review): presumably W = withdrawn, C = completed, P = pending — confirm;
# rows coded 'P' would be labelled 99 if any of them are still retained.
dict_statc = {'W':1, 'C':0, 'P':99}
y = df.statc[df.retain].replace(dict_statc)

In [5]:
# Design matrix: retained rows only, restricted to the chosen predictor columns.
X_raw = df.loc[df.retain, [*col_cat, *col_num, *col_num_na]].copy()
# Expand each categorical predictor into indicator (dummy) columns.
X_lr = pd.get_dummies(data=X_raw, columns=col_cat)

In [6]:
# Elastic-net penalised logistic regression; the l1_ratio is supplied by the grid below.
lr = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    max_iter=10000,
    random_state=RANDOM_STATE,
)

# Median-impute missing values, standardise, then fit the classifier.
model_lr = Pipeline(steps=[
    ('impute', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('lr', lr),
])

# Hyperparameter grid; keys address the 'lr' step of the pipeline.
param_grid_lr = dict(
    lr__C=[.01, .02, .05],
    lr__l1_ratio=[.3, .75, 1],
    lr__class_weight=[None],
)

In [7]:
# Time-series grid search over the pipeline; the best setting is chosen by ROC AUC.
# Missing `dr` values are replaced by the fixed date 2022-12-31 before fitting.
# NOTE(review): `df.da` / `df.dr` are passed unfiltered while `X_lr` / `y` only
# contain rows with `df.retain` — confirm GridSearch_TS_CV aligns them by index.
grid_search = GridSearch_TS_CV(
    model_lr,
    param_grid_lr,
    scoring=["average_precision", "roc_auc"],
    criterion="roc_auc",
).fit(
    X_lr,
    y,
    df["da"],
    df["dr"].fillna(datetime.date(2022, 12, 31)),
)

100%|███████████████████████████████████████████| 21/21 [00:06<00:00,  3.36it/s]


In [8]:
# cv split
print("Time series split, by year:")
for n_fold, ((train_start, train_end), (test_start, test_end)) in enumerate(grid_search.yr_split):
    print(f"fold {n_fold}: train on {train_start}-{train_end}, test on {test_start}-{test_end}")

Time series split, by year:
fold 0: train on 1990-2001, test on 2002-2002
fold 1: train on 1990-2002, test on 2003-2003
fold 2: train on 1990-2003, test on 2004-2004
fold 3: train on 1990-2004, test on 2005-2005
fold 4: train on 1990-2005, test on 2006-2006
fold 5: train on 1990-2006, test on 2007-2007
fold 6: train on 1990-2007, test on 2008-2008
fold 7: train on 1990-2008, test on 2009-2009
fold 8: train on 1990-2009, test on 2010-2010
fold 9: train on 1990-2010, test on 2011-2011
fold 10: train on 1990-2011, test on 2012-2012
fold 11: train on 1990-2012, test on 2013-2013
fold 12: train on 1990-2013, test on 2014-2014
fold 13: train on 1990-2014, test on 2015-2015
fold 14: train on 1990-2015, test on 2016-2016
fold 15: train on 1990-2016, test on 2017-2017
fold 16: train on 1990-2017, test on 2018-2018
fold 17: train on 1990-2018, test on 2019-2019
fold 18: train on 1990-2019, test on 2020-2020
fold 19: train on 1990-2020, test on 2021-2021
fold 20: train on 1990-2021, test on 2022-

In [9]:
# Header and scores on separate lines, as a single print call.
print("scores on the whole test set:", grid_search.test_scores, sep="\n")

scores on the whole test set:
{'average_precision': 0.6656432698486535, 'roc_auc': 0.8402830510718042}


In [10]:
# Mean per-fold scores for the selected hyperparameters: training folds first, then validation folds.
print("Average train scores on the selected best hyperparam:",
      compute_dict_mean(grid_search.best_train_scores_),
      sep="\n")
print("Average validation scores on the selected best hyperparam:",
      compute_dict_mean(grid_search.best_val_scores_),
      sep="\n")

Average train scores on the selected best hyperparam:
{'average_precision': 0.6038178137425588, 'roc_auc': 0.8420427915931987}
Average validation scores on the selected best hyperparam:
{'average_precision': 0.7012004559275227, 'roc_auc': 0.8773394901029424}
