In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, roc_auc_score

class CreateMarketFeatures:
    def __init__(self, df):
        self.df = df

    def ma_spread(self, short_window=5, long_window=10):
        short_ma = self.df['TradedPrice'].rolling(window=short_window, min_periods=1).mean()
        long_ma = self.df['TradedPrice'].rolling(window=long_window, min_periods=1).mean()
        self.df['ma_spread'] = long_ma - short_ma
        return self.df

    def obv_calc(self):
        volume_direction = self.df['Volume'] * np.sign(self.df['TradedPrice'].diff())
        self.df['OBV'] = volume_direction.cumsum()
        return self.df

    def chaikin_mf(self, period=5):
        high = self.df['TradedPrice'].rolling(window=period, min_periods=1).max()
        low = self.df['TradedPrice'].rolling(window=period, min_periods=1).min()
        close = self.df['TradedPrice']
        cmf_multiplier = ((close - low) - (high - close)) / (high - low).replace(0, np.nan)
        cmf_volume = cmf_multiplier * self.df['Volume']
        self.df['CMF'] = cmf_volume.rolling(window=period, min_periods=1).sum() / self.df['Volume'].rolling(window=period, min_periods=1).sum()
        self.df['CMF'].fillna(0, inplace=True)
        return self.df

def get_paths_for_symbol(symbol, FeaturesDir, LabelOne):
    features_path = os.path.join(FeaturesDir, symbol, 'MODEL_BASED')
    labels_path = os.path.join(LabelOne, symbol)
    return {'Features Path': features_path, 'Labels Path': labels_path}

def load_data_for_date(symbol_path, date):
    labels_file = os.path.join(symbol_path['Labels Path'], date + '.csv')
    try:
        df = pd.read_csv(labels_file)
        market_features = CreateMarketFeatures(df)
        df = market_features.ma_spread()
        df = market_features.obv_calc()
        df = market_features.chaikin_mf()
        return df
    except FileNotFoundError:
        print(f"File not found for {date} at {labels_file}")
        return pd.DataFrame()

FeaturesDir = '/media/ak/DataOnly1/SymbolFeatureDirectories/'
LabelOne = '/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne/'
symbols = sorted(os.listdir(FeaturesDir))


In [9]:
symbols

['AAL.L',
 'APF.L',
 'AV.L',
 'AZN.L',
 'BARC.L',
 'BATS.L',
 'BLT.L',
 'CCL.L',
 'CEY.L',
 'CNA.L',
 'CPG.L',
 'DGE.L',
 'HSBA.L',
 'IOG.L',
 'ITV.L',
 'KGF.L',
 'LAND.L',
 'LGEN.L',
 'LLOY.L',
 'MAB.L',
 'MKS.L',
 'NG.L',
 'PRU.L',
 'PSON.L',
 'RB.L',
 'RBS.L',
 'RDSa.L',
 'RDSb.L',
 'REL.L',
 'RR.L',
 'RSA.L',
 'RTO.L',
 'SDR.L',
 'SGE.L',
 'SHP.L',
 'SMIN.L',
 'SPT.L',
 'STAN.L',
 'TSCO.L',
 'ULVR.L',
 'UU.L',
 'VOD.L',
 'WPP.L']

In [10]:
results = pd.DataFrame()
for symbol in ['AAL.L', 'APF.L','AV.L','AZN.L','BARC.L','BATS.L']:
    symbol_paths = get_paths_for_symbol(symbol, FeaturesDir, LabelOne)
    dates = sorted([f[:-4] for f in os.listdir(symbol_paths['Labels Path']) if f.endswith('.csv')])
    for date in dates:
        df = load_data_for_date(symbol_paths, date)
        if not df.empty:
            df['Symbol'] = symbol
            df['Date'] = date
            results = pd.concat([results, df], ignore_index=True)

# # Now train and evaluate the model
# if not results.empty:
#     X = results[['ma_spread', 'OBV', 'CMF']]
#     y = results['label']  # Assuming 'label' is defined in your DataFrame
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)
    
#     logreg = LogisticRegression(random_state=16)
#     logreg.fit(X_train, y_train)
#     y_pred = logreg.predict(X_test)
    
#     # Collecting metrics
#     print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
#     print(f"Precision: {precision_score(y_test, y_pred, average='binary')}")
#     print(f"Recall: {recall_score(y_test, y_pred, average='binary')}")
#     print(f"F1 Score: {f1_score(y_test, y_pred, average='binary')}")
#     print(classification_report(y_test, y_pred))


In [11]:
results

Unnamed: 0.1,Unnamed: 0,TradedTime,TradedPrice,ReturnTradedPrice,Volume,Duration,label_PrMov__window_5__thres_arbitrary__0.1,ma_spread,OBV,CMF,Symbol,Date
0,0,08:00:14.137937,0.135500,0.000000,62439.0,0.000000,0.0,0.000000,,0.000000,AAL.L,20170116
1,1,08:00:14.140071,0.135500,0.000000,1300.0,0.002134,1.0,0.000000,0.0,0.000000,AAL.L,20170116
2,2,08:00:14.187955,0.135550,0.000369,938.0,0.047884,1.0,0.000000,938.0,0.014503,AAL.L,20170116
3,3,08:00:14.258144,0.135600,0.000369,62.0,0.070189,1.0,0.000000,1000.0,0.015447,AAL.L,20170116
4,4,08:00:14.298304,0.135600,0.000000,1138.0,0.040160,0.0,0.000000,1000.0,0.032454,AAL.L,20170116
...,...,...,...,...,...,...,...,...,...,...,...,...
1908895,14851,15:29:55.276664,0.369417,0.000047,513.0,0.010529,1.0,-0.000002,-486225.0,0.553996,BATS.L,20180420
1908896,14852,15:29:55.286442,0.369400,-0.000047,434.0,0.009778,,-0.000002,-486659.0,0.066220,BATS.L,20180420
1908897,14853,15:29:55.291255,0.369400,0.000000,1493.0,0.004813,,-0.000002,-486659.0,-0.539077,BATS.L,20180420
1908898,14854,15:29:59.344780,0.369500,0.000271,100.0,4.053525,,-0.000012,-486559.0,-0.514085,BATS.L,20180420


In [12]:
# Now train and evaluate the model
if not results.empty:
    X = results[['ma_spread', 'OBV', 'CMF']]
    y = results['label']  # Assuming 'label' is defined in your DataFrame
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)
    
    logreg = LogisticRegression(random_state=16)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    # Collecting metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred, average='binary')}")
    print(f"Recall: {recall_score(y_test, y_pred, average='binary')}")
    print(f"F1 Score: {f1_score(y_test, y_pred, average='binary')}")
    print(classification_report(y_test, y_pred))


KeyError: 'label'