In [455]:
# Optional one-off dependency installs; left commented out so they do not run
# with the rest of the notebook. Uncomment a line to install that package.
# !pip3 install imblearn
# !pip3 install deployed
# !pip3 install xgboost
# !pip3 install plotly
# !pip3 install lightgbm

In [456]:
####################################
# Import Python Modules
####################################
# General Purpose Modules
import pickle
from datetime import datetime

# Data Processing Modules
import numpy as np
import pandas as pd

# ML Modules
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.metrics import roc_auc_score,precision_score, recall_score 


# Custom modules
from dataset_schema_dict import dataset_schema

seed = 99
_seed = 99

In [457]:

# Read the research extract and keep only rows with no missing values in any column.
data = pd.read_csv(
    'research_data_7days_above_20_2023.csv',
    index_col=False,
).dropna()

def first_touch_target(prices, horizon, up_mult, down_mult=0.97):
    """Label each row by which price barrier is touched first within `horizon` days.

    Days ``t1 .. t<horizon>`` are examined in order for every row:

    * the label is 1 as soon as ``t<k> > up_mult * t0``;
    * the label is 0 as soon as ``t<k> < down_mult * t0``;
    * if neither barrier is touched by the last day, the label is 0.

    Parameters
    ----------
    prices : pandas.DataFrame
        Frame holding the columns ``t0`` and ``t1 .. t<horizon>``.
    horizon : int
        Number of forward days to examine (``t1`` up to ``t<horizon>``).
    up_mult : float
        Multiplier applied to ``t0`` to obtain the upper barrier.
    down_mult : float, default 0.97
        Multiplier applied to ``t0`` to obtain the lower barrier.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one per row of `prices`.
    """
    base = prices['t0']
    upper = up_mult * base
    lower = down_mult * base

    # Walk the days backwards so that an earlier day always overrides a later
    # one; this reproduces a left-to-right "first barrier hit wins" rule.
    label = np.zeros(len(prices), dtype=int)
    for day in range(horizon, 0, -1):
        price = prices[f't{day}']
        label = np.where(price > upper, 1, np.where(price < lower, 0, label))
    return label


# Upper-barrier multipliers keyed by the tag used in the target column name.
# Literal values are used (rather than 1 + pct / 100) so the thresholds are
# exactly the constants 1.03 / 1.05 / 1.07.
TARGET_UP_MULTIPLIERS = {'p3': 1.03, 'p5': 1.05, 'p7': 1.07}
TARGET_HORIZONS = (3, 5, 7)

# Creates target_d3_p3, target_d5_p3, target_d7_p3, target_d3_p5, ... in that order.
for pct_tag, up_mult in TARGET_UP_MULTIPLIERS.items():
    for horizon in TARGET_HORIZONS:
        data[f'target_d{horizon}_{pct_tag}'] = first_touch_target(data, horizon, up_mult)


In [458]:
# Parse the `date` column (day/month/year text) once and derive both
# calendar features from the same parsed series.
parsed_dates = pd.to_datetime(data['date'], format='%d/%m/%Y')
data['day_name'] = parsed_dates.dt.day_name()
data['month'] = parsed_dates.dt.strftime('%m')

# Row count per month (zero-padded month string).
data['month'].value_counts()

04    15121
03    14979
12    14916
11    14422
01    12808
02    11977
10     3696
05     2215
Name: month, dtype: int64

In [459]:
# Record which columns belong to each dtype family before converting anything.
integer_columns = data.select_dtypes(include=['int64', 'int32']).columns
float_columns = data.select_dtypes(include=['float']).columns
object_columns = data.select_dtypes(include=['object']).columns

# Every integer-typed column is recast as a pandas categorical.
# NOTE(review): this also covers any integer `target_*` columns - confirm that is intended.
for int_col in integer_columns:
    data[int_col] = data[int_col].astype('category')

# Summary of how many columns now have each dtype.
data.dtypes.value_counts()

category    87
float64      8
object       4
dtype: int64

In [460]:
# Collect every column whose name starts with 'target' and print the
# share of each class so the balance of each label can be compared.
target_list = [name for name in data.columns if name.startswith('target')]

for target_col in target_list:
    class_share = data[target_col].value_counts(normalize=True)
    print (target_col)
    print(class_share)

target_d3_p3
0    0.727905
1    0.272095
Name: target_d3_p3, dtype: float64
target_d5_p3
0    0.652606
1    0.347394
Name: target_d5_p3, dtype: float64
target_d7_p3
0    0.609215
1    0.390785
Name: target_d7_p3, dtype: float64
target_d3_p5
0    0.821677
1    0.178323
Name: target_d3_p5, dtype: float64
target_d5_p5
0    0.75289
1    0.24711
Name: target_d5_p5, dtype: float64
target_d7_p5
0    0.708545
1    0.291455
Name: target_d7_p5, dtype: float64
target_d3_p7
0    0.877172
1    0.122828
Name: target_d3_p7, dtype: float64
target_d5_p7
0    0.818836
1    0.181164
Name: target_d5_p7, dtype: float64
target_d7_p7
0    0.777232
1    0.222768
Name: target_d7_p7, dtype: float64


In [461]:
# Show the t0..t7 values of the rows labelled 1 for one scenario.
scenario = 'd7_p5'
price_cols = ['t0', 't1', 't2', 't3', 't4', 't5', 't6', 't7']
data.loc[data[f'target_{scenario}'] == 1, price_cols]
#data[['date','t0', 't1','t2','t3','t4','t5','t6','t7']][data['Name'] == 'GRANFLO']


Unnamed: 0,t0,t1,t2,t3,t4,t5,t6,t7
5,0.81,0.825,0.810,0.820,0.840,0.845,0.845,0.860
7,0.81,0.820,0.840,0.845,0.845,0.860,0.860,0.875
8,0.82,0.840,0.845,0.845,0.860,0.860,0.875,0.870
25,0.85,0.860,0.840,0.840,0.840,0.855,0.870,0.930
26,0.86,0.840,0.840,0.840,0.855,0.870,0.930,0.950
...,...,...,...,...,...,...,...,...
90105,1.79,1.790,1.800,1.800,1.860,1.900,1.920,1.920
90106,1.79,1.800,1.800,1.860,1.900,1.920,1.920,1.980
90107,1.80,1.800,1.860,1.900,1.920,1.920,1.980,1.880
90108,1.80,1.860,1.900,1.920,1.920,1.980,1.880,1.920


In [462]:
##Define Train OOT (random split)

# Drop the forward columns t1..t7 and index the modelling frame by (Name, date).
forward_cols = [f't{day}' for day in range(1, 8)]
df1 = data.drop(columns=forward_cols).set_index(['Name', 'date'])

def train_validate_test_split(df, train_percent=.8, seed=seed):
    """Randomly partition the rows of `df` into a training part and a hold-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; its index is not used for selecting rows.
    train_percent : float, default 0.8
        Fraction of rows placed in the first returned frame
        (``int(train_percent * len(df))`` rows).
    seed : int
        Seed passed to ``np.random.seed`` before drawing the permutation.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, oot)``: the first ``train_end`` rows of the shuffled order and
        the remaining rows. Every input row lands in exactly one of the two.
    """
    np.random.seed(seed)
    m = len(df)

    # Shuffle row *positions* rather than index labels: selecting by label
    # returns every row that shares a label, so a duplicated index entry would
    # produce duplicated rows and the two parts would no longer sum to len(df).
    # NOTE(review): for a unique index this is expected to give the same rows
    # as a label-based shuffle with the same seed - confirm against a saved run
    # if exact row membership matters.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    oot = df.iloc[perm[train_end:]]
    return train, oot

train, oot = train_validate_test_split(df1)

# Sanity check: the two parts together must contain as many rows as the source frame.
split_row_total = len(train) + len(oot)
print (data.shape)
print (f'Split OK? : {split_row_total == len(data)}')

(90134, 99)
Split OK? : True


In [463]:
# Release the full frame and the intermediate frame; only `train` / `oot` are used below.
del data, df1

In [464]:
#######################################################################
# Run configuration (shared by every scenario)
#######################################################################

# One model is fitted per labelling scenario; each entry needs a matching
# `target_<scenario>` column in `train`. Every horizon/gain combination
# (d3/d5/d7 x p3/p5/p7) appears exactly once.
scenario_list = ['d3_p3', 'd3_p5', 'd3_p7', 'd5_p3', 'd5_p5', 'd5_p7', 'd7_p3', 'd7_p5', 'd7_p7']

# Silence pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

_seed = 999                         # random state seed (rebinds the notebook-level `_seed`)
save_transformers = True            # Save the lgb transformers (NOTE: not consulted below; encoders are always saved)
verbose_script = True               # Verbosity of script

mdl_nm = 'uplift'

# Metric handed to LightGBM via the `metric` parameter.
# NOTE(review): confirm LightGBM recognises 'precision' as a metric name; the
# fit call below separately requests eval_metric='auc'.
metric_to_use = 'precision'

# Scorer used by RandomizedSearchCV to rank parameter draws.
scoring_to_use = 'precision'

# Prefix of every output file written per scenario.
exp_set = 'tuneAUC_'

no_cat_features = False             # if True, drop categorical_feature before the final fit
cv = True                           # True: randomized search; False: fixed `learning_params`

# Save dataset dictionary (independent of the scenario, so written once)
dict_path = f'./{mdl_nm}/'
f_name = dict_path + f'dataset_schema_dict.pkl'

with open(f_name, 'wb') as handle:
    pickle.dump(
        dataset_schema,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL
    )

# Default list of values to search in naive cases (only referenced by the
# disabled search dimensions below).
values_to_check = sorted(set([k ** n for n in range(-8,3) for k in range(1,10+1)]))

# Params to search
params_to_optimize = {
    'max_depth': [5,10,15,20,30,50],
    'num_leaves': [5, 10, 25, 50, 75, 100, 150],
    # 'learning_rate': [value for value in values_to_check if value < 1.0],
    'min_data_in_leaf': [20, 50,100,250],
    # 'min_child_weight': [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1],
    # 'min_gain_to_split': [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1],
    # 'bagging_fraction': [0.05, 0.25, 0.5, 0.75, 0.9, 1.0],
    'bagging_freq': [2, 5, 10, 25, 50],
    'feature_fraction': [0.05, 0.25, 0.5, 0.75, 0.9, 1],
    # 'lambda_l1': values_to_check,
    # 'lambda_l2': values_to_check,
    'scale_pos_weight':[0.1, 0.5, 1, 2, 5, 10, 25]
}

# Parameters used instead of the search result when `cv` is False.
learning_params = {
    'max_depth': 5,
    'num_leaves': 200,
    'learning_rate': 0.01,
    'min_data_in_leaf': 25,
    'min_child_weight': 0.0001,
    'min_gain_to_split': 0.0001,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'feature_fraction': 1.0,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'early_stopping_rounds':100,
}

# Output locations for pickled encoders / models.
encoder_path = f'./python_model_objects/{mdl_nm}/'
model_path = f'./python_model_objects/{mdl_nm}/'


for scenario in scenario_list:
    script_start = datetime.now()       # Scenario start

    target = f'target_{scenario}'

    #######################################################################
    # Set Model Parameters
    #######################################################################

    init_param_dict = {                      # !!! TUNE THESE
        'seed': _seed,
        'num_threads': 0,
        'verbosity': 0,
        #
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_iterations': 2000,
        'early_stopping_rounds': 50,
        'tree_learner': 'data',
        'subsample_for_bin':300000, # lower=performance, high=accuracy
        #
        'metric': metric_to_use,
    }

    #######################################################################
    # Load Data
    #######################################################################

    if verbose_script:
        print('loaded data')
        print(datetime.now() - script_start)
        print('')

    #######################################################################
    # Remove bad columns
    #######################################################################

    # Drop every 'lead' column and every target other than this scenario's.
    omit_cols = [
        c for c in train.columns
        if c != target and ('lead' in c.lower() or c.startswith('target_'))
    ]

    # Work on a fresh frame: `train` itself must keep all target columns so
    # that the next scenario can still find its own target.
    df = train.drop(columns=omit_cols)

    # Remove 0-variance columns:
    df = df.loc[:, df.nunique() > 1].copy(deep=True)

    #######################################################################
    # Ordinal (integer) Encode categorical variables
    #######################################################################

    categorical_cols = []

    for col in df.drop(columns=[target]).columns:

        if not (df[col].dtype == object or df[col].dtype.name == 'category'):
            continue

        if verbose_script:
            print('encoding categorical column:')
            print(col)
            print('')

        cnt_unique_values = df[col].nunique()

        # Smallest unsigned integer type that holds every code plus the
        # unknown-value code (cnt_unique_values + 2). Unsigned types are used
        # throughout because the bounds below exceed the signed maxima
        # (int16 stops at 32767, int32 at 2147483647).
        if cnt_unique_values <= (255 - 3):
            dtype_to_use = np.uint8
        elif cnt_unique_values <= (65535 - 3):
            dtype_to_use = np.uint16
        elif cnt_unique_values <= (4294967295 - 3):
            dtype_to_use = np.uint32
        else:
            dtype_to_use = np.uint64

        encoder = OrdinalEncoder(
            categories = 'auto',
            dtype = dtype_to_use,
            handle_unknown = 'use_encoded_value',
            unknown_value = cnt_unique_values + 2,
        )

        # Fit on the column and replace it with its integer codes,
        # stored as an unordered categorical.
        col_values = df[col].to_numpy().reshape(-1,1)
        encoder.fit(col_values)
        new_values = encoder.transform(col_values)

        df[col] = pd.Categorical(new_values.ravel(), ordered = False)

        # Save encoder object
        f_name = encoder_path + f'ordEnc_{col}.pkl'

        with open(f_name, 'wb') as handle:
            pickle.dump(
                encoder,
                handle,
                protocol=pickle.HIGHEST_PROTOCOL
            )

        categorical_cols.append(col)

        if verbose_script:
            print(f'saved ordinal encoder for {col} at')
            print(f_name)
            print('')

    # LightGBM 'name:' syntax: a comma-separated list of column names.
    init_param_dict['categorical_feature'] = 'name:' + ','.join(categorical_cols)

    #######################################################################
    # Train / Test Splits
    #######################################################################

    df = df[df[target].notnull()].copy(deep=True)

    X_ = df.drop(columns=target).copy(deep=True)
    y_ = df[target].copy(deep=True)

    # Stratified 70/30 shuffle split
    X_train, X_eval, y_train, y_eval = train_test_split(X_ , y_, test_size = 0.3, random_state = _seed, stratify = y_)

    # Add column names...
    init_param_dict['feature_name'] = list(X_.columns)

    #######################################################################
    # LGB Model Object
    #######################################################################

    lgb_classifier = lgb.LGBMClassifier(**init_param_dict)

    #######################################################################
    # SKLearn Cross Val Object
    #######################################################################

    # Splitting strategy for Cross-Validation
    sss_cv = StratifiedShuffleSplit(
        n_splits = 5,
        test_size = 0.3,
        train_size = 0.7,
        random_state = _seed
    )

    #######################################################################
    # SKLearn Search Object
    #######################################################################

    # Cross validation with randomized search
    clf = RandomizedSearchCV(
        estimator = lgb_classifier,
        param_distributions = params_to_optimize,   ##TUNE
        n_iter = 2500,                              ##TUNE
        random_state = _seed,
        n_jobs = -1,
        refit = True,
        cv = sss_cv,
        scoring = scoring_to_use,
        verbose = 1,
        return_train_score = False,
    )

    #######################################################################
    # Fit model
    #######################################################################

    # Params for fit function. The held-out eval split is supplied as
    # `eval_set` here and is also the data the metrics below are computed on.
    params_for_fitting = {
        'X': X_train,
        'y': y_train,
        'eval_set': [(X_eval, y_eval)],
        'eval_metric': 'auc',
        'feature_name': init_param_dict['feature_name'],
        'categorical_feature': categorical_cols,
    }

    if cv:
        # Search for best parameters
        search = clf.fit(**params_for_fitting)
        param_overrides = search.best_params_
    else:
        param_overrides = learning_params

    # Rebuild the classifier from the base parameters plus the overrides and
    # fit it once more on the training split.
    new_params = lgb_classifier.get_params()
    new_params.update(param_overrides)

    if no_cat_features:
        del new_params['categorical_feature']
        del params_for_fitting['categorical_feature']

    lgb_classifier = lgb.LGBMClassifier(**new_params)

    lgbmCV = lgb_classifier.fit(**params_for_fitting)

    #######################################################################
    # Save Model
    #######################################################################

    f_name = model_path + f'{scenario}_lgbmcv_model.pkl'

    with open(f_name, 'wb') as handle:
        pickle.dump(
            lgbmCV,
            handle,
            protocol=pickle.HIGHEST_PROTOCOL
        )

    print ("Model saved!!!!")

    #######################################################################
    #
    #       Amirul : 20230531
    #
    #
    #######################################################################

    #######################################################################
    # Generate predictions
    #######################################################################
    feature_names = lgbmCV.feature_name

    df_eval = X_eval.join(y_eval)
    df_eval['y_pred'] = lgbmCV.predict_proba(df_eval[feature_names])[:,1]
    df_eval['y_pred_class'] = lgbmCV.predict(df_eval[feature_names])

    #######################################################################
    # Calculate Metric
    #######################################################################
    y_true_int = df_eval[target].astype(int)
    auc_score = roc_auc_score(y_true = y_true_int, y_score = df_eval['y_pred'])
    precision = precision_score(y_true_int, df_eval['y_pred_class'])
    recall = recall_score(y_true_int, df_eval['y_pred_class'])

    print ('AUC Score:' , auc_score)
    print ('Precision:' , precision)
    print ('Recall:' , recall )

    # The workbook is created here; later sheets are appended to it.
    confusion_matrix = pd.crosstab(df_eval[target], df_eval['y_pred_class'], rownames=['Actual'], colnames=['Predicted'])
    with pd.ExcelWriter(f"{exp_set}{scenario}.xlsx", engine="openpyxl") as writer:
        confusion_matrix.to_excel(writer, sheet_name="cf", index=True)

    # One metrics file per scenario, so runs do not overwrite each other.
    metric_result = pd.DataFrame({'AUC' : [auc_score], 'Precision' : [precision], 'Recall': [recall]})
    metric_result.to_csv(f'{exp_set}{scenario}_train_metric.csv')

    #######################################################################
    # Create table
    #######################################################################
    lenx = len(df_eval)
    df_eval[target] = df_eval[target].astype('int')

    # NOTE(review): the 'percentile' table uses one bin per evaluation row
    # (n = len(df_eval)), not 100 bins - confirm this is intended.
    for qtile, n_bins in [('decile',10), ('percentile',lenx)]:
        # Rank 0 = highest predicted probability; ties broken by row order.
        df_eval[qtile] = pd.qcut(df_eval['y_pred'].rank(method='first', ascending=False), n_bins, labels=False)

        # Aggregate rows to qtile bin
        agg_df = df_eval.groupby(qtile).agg(
            {qtile: len, target: 'sum', 'y_pred': ['min', 'max']})

        agg_df.columns = ['total_cnt','positives','pred_min', 'pred_max']

        agg_df['positive_rate'] = agg_df['positives'] / agg_df['total_cnt']

        # Overall positive rate across all bins
        avg_capture = agg_df['positives'].sum() / agg_df['total_cnt'].sum()

        agg_df['decile_lift'] = agg_df['positive_rate'] / avg_capture

        agg_df.sort_index(ascending = True, inplace = True)

        agg_df['cumsum_total_cnt'] = agg_df.total_cnt.cumsum()

        agg_df['cumsum_positives'] = agg_df.positives.cumsum()

        # Share of rows covered after each bin if positives were spread evenly.
        agg_df['naive_rt'] = np.arange(1, n_bins + 1) / n_bins

        agg_df['naive_cumsum_positives'] = agg_df.naive_rt * agg_df['positives'].sum()

        agg_df['total_gain'] = agg_df.cumsum_positives / agg_df.naive_cumsum_positives

        agg_df['precision_test'] = agg_df.cumsum_positives / agg_df.cumsum_total_cnt

        with pd.ExcelWriter(f"{exp_set}{scenario}.xlsx", mode="a", engine="openpyxl", if_sheet_exists='replace') as writer:
            agg_df.to_excel(writer, sheet_name=f'train_{qtile}', index=True)

    #######################################################################
    # Print Feature Importance
    #######################################################################

    feature_impt = pd.DataFrame({'Value':lgbmCV.feature_importances_,'Feature':lgbmCV.feature_name}).sort_values(by = 'Value', ascending = False)
    # Sheet name spelled out explicitly instead of relying on the loop
    # variable left over from the table loop above.
    with pd.ExcelWriter(f"{exp_set}{scenario}.xlsx", mode="a", engine="openpyxl", if_sheet_exists='replace') as writer:
        feature_impt.to_excel(writer, sheet_name='fi_train_percentile', index=True)

    print ("Completed")

loaded data
0:00:00.002678

encoding categorical column:
20-day-high

saved ordinal encoder for 20-day-high at
./python_model_objects/uplift/ordEnc_20-day-high.pkl

encoding categorical column:
3-ducks

saved ordinal encoder for 3-ducks at
./python_model_objects/uplift/ordEnc_3-ducks.pkl

encoding categorical column:
52-week-high

saved ordinal encoder for 52-week-high at
./python_model_objects/uplift/ordEnc_52-week-high.pkl

encoding categorical column:
52-week-low

saved ordinal encoder for 52-week-low at
./python_model_objects/uplift/ordEnc_52-week-low.pkl

encoding categorical column:
above-ma50

saved ordinal encoder for above-ma50 at
./python_model_objects/uplift/ordEnc_above-ma50.pkl

encoding categorical column:
all-time-high

saved ordinal encoder for all-time-high at
./python_model_objects/uplift/ordEnc_all-time-high.pkl

encoding categorical column:
atr

saved ordinal encoder for atr at
./python_model_objects/uplift/ordEnc_atr.pkl

encoding categorical column:
blue-chip-uptr

KeyboardInterrupt: 