In [None]:
import logging

# Named logger used for this notebook's own progress messages.
logger = logging.getLogger('tpot_log')

# Minimum severity that is recorded (INFO, DEBUG, etc.).
logger.setLevel(logging.INFO)

# File that receives a copy of every record sent through `logger`.
log_file = 'tpot_log.txt'

# logging.getLogger() hands back the same object on every call, so re-running
# this cell used to stack a fresh pair of handlers on top of the old ones and
# every message was then written two, three, ... times. Detach and close any
# handlers left over from a previous run before attaching new ones.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# One shared record layout for both destinations.
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
formatter = logging.Formatter(log_format)

# File destination.
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Console destination (optional; remove these three lines for file-only logging).
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Example: log a message
logger.info('Logging has been set up for TPOT.')

# NOTE(review): only messages emitted through `logger` reach the handlers above.
# Nothing in this notebook routes TPOT's own progress output into this logger;
# that output is presumably governed by TPOT's `verbosity` / `log_file`
# arguments -- confirm against the TPOT documentation.

In [None]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from array import array
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Plotly layout defaults (font family/size and figure width) bundled in a dict.
temp = {
    "layout": go.Layout(
        font={"family": "Franklin Gothic", "size": 12},
        width=1500,
    )
}

In [None]:
import xgboost

## TPOT Dictionary for Feature Processing and ML Model Building

In [None]:
from tpot import TPOTRegressor
# Search space handed to TPOTRegressor(config_dict=...).
# Each key is the import path of an operator; each value maps a hyperparameter
# name to the collection of candidate values TPOT may pick from.
regressor_config_dict = {
    # ---- Regressors ---------------------------------------------------------
    'sklearn.linear_model.ElasticNetCV': {
        'l1_ratio': [0.05, 0.1, 0.5, 1.0],
        'tol': [1e-05, 0.001, 0.1],
    },
    'sklearn.ensemble.GradientBoostingRegressor': {
        'n_estimators': [100, 500, 1000],
        # NOTE(review): scikit-learn renamed 'ls' to 'squared_error' in 1.0 and
        # removed the old spelling in 1.2 -- update this entry if the notebook
        # runs on a newer scikit-learn.
        'loss': ['ls', 'huber'],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.1, 0.5, 1.0],
        'max_features': [0.1, 0.5, 1.0],
    },
    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [100, 500, 1000],
        'max_features': [0.1, 0.5, 1.0],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'xgboost.XGBRegressor': {
        'n_estimators': [100, 500, 1000],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.5],
        'subsample': [0.1, 0.5, 1.0],
        'min_child_weight': [1, 5, 10],
    },
    'sklearn.svm.LinearSVR': {
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'dual': [True, False],
        'tol': [1e-05, 0.001, 0.1],
        'C': [0.001, 0.1, 1.0, 10.0, 25.0],
        'epsilon': [0.001, 0.1, 1.0],
    },
    'sklearn.ensemble.AdaBoostRegressor': {
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.01, 0.1, 0.5],
    },

    # ---- Imputers -----------------------------------------------------------
    # The imputer used to be listed twice under the same key (once with 'mean',
    # once with 'median'). A dict keeps only the last duplicate, so 'mean' was
    # never searched, and the surviving value was a bare string instead of a
    # list of choices. Both strategies now live in one list.
    'sklearn.impute.SimpleImputer': {'strategy': ['mean', 'median']},
    'sklearn.impute.KNNImputer': {},

    # ---- Scalers / transformers ----------------------------------------------
    'sklearn.preprocessing.MaxAbsScaler': {},
    'sklearn.preprocessing.MinMaxScaler': {},
    'sklearn.preprocessing.Normalizer': {'norm': ['l1', 'l2', 'max']},
    'sklearn.kernel_approximation.Nystroem': {
        'kernel': ['rbf', 'polynomial', 'linear'],
        'gamma': [0.1, 0.5, 1.0],
        'n_components': range(1, 6),  # Reduced the range for n_components
    },
    'sklearn.decomposition.PCA': {
        'svd_solver': ['randomized'],
        'iterated_power': range(1, 11),
    },
    'sklearn.kernel_approximation.RBFSampler': {
        'gamma': [0.0, 0.2, 0.35, 0.45, 0.5, 0.55, 0.6, 0.65, 0.8, 1.0],
    },
    'sklearn.preprocessing.RobustScaler': {},
    'sklearn.preprocessing.StandardScaler': {},
    'sklearn.preprocessing.QuantileTransformer': {},
    'sklearn.preprocessing.PowerTransformer': {},

    # ---- Feature selectors ----------------------------------------------------
    'sklearn.feature_selection.SelectFwe': {
        'alpha': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5],
        'score_func': {'sklearn.feature_selection.f_regression': None},
    },
    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100, 20),
        'score_func': {'sklearn.feature_selection.f_regression': None},
    },
    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': [0.001, 0.01, 0.1, 0.2],
    },
    'sklearn.feature_selection.SelectFromModel': {
        'threshold': [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                      0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0],
        'estimator': {
            'sklearn.ensemble.ExtraTreesRegressor': {
                'n_estimators': [100],
                'max_features': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                                 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0],
            },
        },
    },
}

In [None]:
def get_rsi(df, period):
    """Rolling Relative Strength Index of the ``wap`` column.

    Args:
        df: DataFrame with a ``wap`` column.
        period: Number of rows in each rolling window.

    Returns:
        A Series indexed like ``df``. Entries are NaN until a full window of
        ``period`` rows is available.
    """
    pct_move = df['wap'].pct_change()

    # Positive moves and sign-flipped negative moves; everything else
    # (including the leading NaN) is replaced by 0.
    up_moves = pct_move.where(pct_move > 0, 0)
    down_moves = -pct_move.where(pct_move < 0, 0)

    # Relative strength = mean gain / mean loss over the window.
    relative_strength = (
        up_moves.rolling(period).mean() / down_moves.rolling(period).mean()
    )

    return 100 - (100 / (1 + relative_strength))

def RSI_Signal(rsi):
    """Turn an RSI value into a trading signal.

    Returns -1 when ``rsi`` is above 70 (overbought), 1 when it is below 30
    (oversold) and 0 otherwise. NaN fails both comparisons and so maps to 0.
    """
    if rsi > 70:  # Overbought
        return -1
    if rsi < 30:  # Oversold
        return 1
    return 0

def obv_volume(delta, volume):
    """Signed volume contribution of one row to On-Balance Volume.

    Args:
        delta: Price change for the row (only its sign is used).
        volume: Volume traded on the row.

    Returns:
        ``volume`` when the price rose, ``-volume`` when it fell, ``0`` when it
        did not move, and NaN when ``delta`` is NaN (change undefined).

    The unchanged-price case used to fall off the end of the function and
    return ``None``; once stored in a numeric column that became NaN and wiped
    out every rolling OBV window containing such a row.
    """
    if delta > 0:
        return volume
    if delta < 0:
        return -volume
    if delta == 0:
        return 0
    # Only NaN fails all three comparisons: keep "undefined" explicit.
    return float('nan')

In [None]:
#import pandas as pd
def generate_features(df, preprocess):
    """Derive the model feature matrix from raw rows.

    On top of the raw columns listed in ``features`` this adds:

    * ``imb_s1`` / ``imb_s2``: size imbalances,
    * ``{a}_{b}_imb``: normalised difference for every pair of price columns,
    * ``{a}_{b}_{c}_imb2``: (max - mid) / (mid - min) for every price triple,
    * per ``stock_id``: ``rsi`` (14 rows), ``rsi signal``, ``delta(wap)``,
      ``Volume Adjust`` and ``OBV`` (14-row rolling sum of ``Volume Adjust``),
    * ``Imbalance`` and ``Bid/Ask Spread``.

    Args:
        df: DataFrame containing ``stock_id`` plus the raw columns named in
            ``features``. It is not modified.
        preprocess: When true, NaN is replaced by -99999999 and +/-inf by
            -99999998. When false, +/-inf becomes NaN and NaN is kept.

    Returns:
        A new DataFrame holding only the feature columns, with the same index
        and the same row order as ``df``.

    Changes from the previous implementation:
        * Rows used to come back regrouped by ``stock_id``; pairing that result
          positionally with a target taken from the original frame (as
          ``train_test_split(X, y)`` does) mismatched features and targets.
          Per-stock features are now computed with ``groupby(...).transform``,
          which keeps the original order.
        * The caller's frame is no longer mutated.
        * The quadratic ``pd.concat`` inside the per-stock loops and the
          row-wise ``DataFrame.apply`` calls are gone.
    """
    nan_sentinel = -99999999   # stands in for values that were NaN
    inf_sentinel = -99999998   # stands in for values that were +/-inf
    rsi_period = 14
    obv_window = 14

    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2']

    # Work on a copy so the caller's frame keeps its original columns.
    data = df.copy()

    data['imb_s1'] = data.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    data['imb_s2'] = data.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    prices = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    # Pairwise imbalances. The index order (i ascending, j < i ascending) is
    # kept on purpose: it fixes the column order of the returned frame.
    for i in range(len(prices)):
        for j in range(i):
            a, b = prices[i], prices[j]
            pair_col = f'{a}_{b}_imb'
            data[pair_col] = data.eval(f'({a}-{b})/({a}+{b})')
            features.append(pair_col)

    # Triple imbalances, same ordering rule (i > j > k).
    for i in range(len(prices)):
        for j in range(i):
            for k in range(j):
                a, b, c = prices[i], prices[j], prices[k]
                trio = data[[a, b, c]]
                max_ = trio.max(axis=1)
                min_ = trio.min(axis=1)
                mid_ = trio.sum(axis=1) - min_ - max_

                triple_col = f'{a}_{b}_{c}_imb2'
                data[triple_col] = (max_ - mid_) / (mid_ - min_)
                features.append(triple_col)

    # ---- Per-stock indicators (row order preserved by transform) -----------
    wap_by_stock = data.groupby('stock_id')['wap']

    # transform() renames each group Series to its group key, so rebuild the
    # single-column frame get_rsi expects with an explicit 'wap' column name.
    data['rsi'] = wap_by_stock.transform(
        lambda wap: get_rsi(wap.to_frame('wap'), period=rsi_period)
    )
    data['rsi signal'] = data['rsi'].map(RSI_Signal)
    features.append('rsi')
    features.append('rsi signal')

    data['delta(wap)'] = wap_by_stock.transform(lambda wap: wap.pct_change())
    # dtype=float64 turns any None returned by obv_volume into NaN so the
    # rolling sum below always sees a numeric column.
    data['Volume Adjust'] = pd.Series(
        [obv_volume(delta, volume)
         for delta, volume in zip(data['delta(wap)'], data['matched_size'])],
        index=data.index,
        dtype='float64',
    )
    data['OBV'] = data.groupby('stock_id')['Volume Adjust'].transform(
        lambda vol: vol.rolling(window=obv_window).sum()
    )
    features.append('delta(wap)')
    features.append('Volume Adjust')
    features.append('OBV')

    # ---- Cross-column features ----------------------------------------------
    data['Imbalance'] = data['imbalance_buy_sell_flag'] * data['imbalance_size']
    # Bid minus ask, i.e. the sign convention of the original feature.
    data['Bid/Ask Spread'] = data['bid_price'] - data['ask_price']
    features.append('Imbalance')
    features.append('Bid/Ask Spread')

    if preprocess:
        # Two passes so NaN and +/-inf end up with different sentinels:
        # first the existing NaNs, then the infinities (via NaN).
        data = data.fillna(nan_sentinel)
        data = data.replace([np.inf, -np.inf], np.nan)
        data = data.fillna(inf_sentinel)

    # Without preprocessing, infinities are still turned into NaN.
    data = data.replace([np.inf, -np.inf], np.nan)

    return data[features]

# Set to False to skip loading the training data and building features.
TRAINING = True
# Number of leading rows of train.csv used for this experiment.
N_TRAIN_ROWS = 500000
if TRAINING:
    # TODO: replace this head-of-file slice with a proper sample of roughly 1MM rows.
    # nrows stops parsing after the rows that are actually kept instead of
    # reading the whole file and discarding the rest with .head().
    df_train = pd.read_csv('train.csv', nrows=N_TRAIN_ROWS)
    # Rows without a target cannot be used for supervised training.
    df_train = df_train.dropna(subset=["target"])
    df_ = generate_features(df_train, False)

In [None]:
# Feature matrix produced by generate_features.
X = df_
# Select the target by X's index labels rather than by row position, so each
# feature row is paired with its own target even when the feature builder
# returns rows in a different order than df_train (train_test_split pairs
# X and y purely by position).
y = df_train.loc[X.index, 'target']

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### The following cell took about 1 day to run; use the next cell instead

In [None]:
# Long search: TPOTRegressor restricted to `regressor_config_dict`, with a
# 600-minute overall budget and at most 30 minutes per evaluated pipeline.
tpot2 = TPOTRegressor(
    generations=20,
    population_size=80,
    cv=5,
    verbosity=3,
    random_state=42,
    config_dict=regressor_config_dict,
    max_time_mins=600,
    periodic_checkpoint_folder="intermediate_results",
    max_eval_time_mins=30,
    n_jobs=-1,
    log_file='tpot_log.txt',
)
tpot2.fit(X_train, y_train)

In [None]:
# Short search: same configuration space, capped at 30 minutes overall.
tpot = TPOTRegressor(
    generations=20,
    population_size=80,
    cv=5,
    verbosity=3,
    random_state=42,
    config_dict=regressor_config_dict,
    max_time_mins=30,
    periodic_checkpoint_folder="intermediate_results",
    n_jobs=-1,
    # early_stop=8,  (currently disabled)
)
tpot.fit(X_train, y_train)

In [None]:
import joblib

# Take the fitted pipeline that the `tpot` search exposes as `fitted_pipeline_`.
best_pipeline = tpot.fitted_pipeline_

# Persist it to 'best_pipeline.pkl' so it can be reloaded without re-running the search.
joblib.dump(best_pipeline, 'best_pipeline.pkl')

In [None]:
# Median imputer fitted on the training split only, then applied to both
# splits; `testing_features` is what the reloaded pipeline is scored on.
imputer = SimpleImputer(strategy="median").fit(X_train)
training_features = imputer.transform(X_train)
testing_features = imputer.transform(X_test)

In [None]:
# Reload the pipeline stored in 'best_pipeline.pkl'.
# NOTE(review): joblib.load unpickles the file, so only load files you created or trust.
loaded_pipeline = joblib.load('best_pipeline.pkl')

In [None]:
# Hold-out RMSE of the in-memory TPOT model.
# Arguments follow scikit-learn's (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, tpot.predict(X_test)))

In [None]:
# Hold-out RMSE of the pipeline reloaded from disk, scored on the
# median-imputed test features (`testing_features`).
# Arguments follow scikit-learn's (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, loaded_pipeline.predict(testing_features)))