In [190]:
import pandas as pd
import numpy as np
import scipy as scipy
import matplotlib.pyplot as plt
import matplotlib as mpl
import os

from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from collections import defaultdict
##################################
from sklearn.model_selection import train_test_split
# import the class
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score, classification_report

from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, matthews_corrcoef, log_loss
from sklearn.preprocessing import LabelBinarizer


In [189]:
### class with features 

class CreateMarketFeatures:
    """A class to compute market-based features for trading strategies.

    Requires:
    df: pandas DataFrame with columns TradedPrice, Volume, and optionally Duration.
    """

    def __init__(self, df):
        self.df = df

    def ma_spread(self, short_window=5, long_window=10):
        """Calculate moving average spread between short and long windows for TradedPrice."""
        short_ma = self.df['TradedPrice'].rolling(window=short_window, min_periods=1).mean()
        long_ma = self.df['TradedPrice'].rolling(window=long_window, min_periods=1).mean()
        self.df['ma_spread'] = long_ma - short_ma
        return self.df
    def include_label_columns(self):
        """Include columns that contain 'label' in their name into the DataFrame."""
        # Find all columns that have 'label' in their name
        label_cols = [col for col in self.df.columns if 'label' in col]
        # Check if label columns are found and handle them
        if label_cols:
            # The columns are already part of the DataFrame, so this function might just ensure they are treated correctly
            # Here you can add additional handling or processing of label columns if necessary
            print(f"Label columns included: {label_cols}")
        else:
            # Assuming default action if no label columns are present
            print("No label columns found.")

    def obv_calc(self):
        """Calculate On-Balance Volume (OBV) to measure buying and selling pressure."""
        price_diff = self.df['TradedPrice'].diff()
        volume_direction = self.df['Volume'] * np.sign(price_diff)
        self.df['OBV'] = volume_direction.cumsum()
        return self.df

    def chaikin_mf(self, period=5):
        """Calculate Chaikin Money Flow (CMF) to measure money flow volume over a set period."""
        high = self.df['TradedPrice'].rolling(window=period, min_periods=1).max()
        low = self.df['TradedPrice'].rolling(window=period, min_periods=1).min()
        close = self.df['TradedPrice']
        
        cmf_multiplier = ((close - low) - (high - close)) / (high - low).replace(0, np.nan)
        cmf_volume = cmf_multiplier * self.df['Volume']
        self.df['CMF'] = cmf_volume.rolling(window=period, min_periods=1).sum() / \
                         self.df['Volume'].rolling(window=period, min_periods=1).sum()
        self.df['CMF'].fillna(0, inplace=True)  # Handle NaN divisions
        return self.df

    def add_labels(self):
        """Placeholder to demonstrate how to add labels if needed."""
        # Assuming label logic or addition is done outside or here based on certain conditions
        pass




In [167]:
# Create an instance of Logistic Regression Classifier and fit the data.
logreg = LogisticRegression(C=1e5)


In [168]:
FeaturesDir = '/media/ak/DataOnly1/SymbolFeatureDirectories/'
LabelOne='/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne'

In [169]:

# Paths to the directories
FeaturesDir = '/media/ak/DataOnly1/SymbolFeatureDirectories/'
LabelOne = '/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne'

# Get list of files in each directory
features_files = set(os.listdir(FeaturesDir))
labels_files = set(os.listdir(LabelOne))

# Find common elements
common_elements = features_files.intersection(labels_files)
# Convert set to list
common_elements_list = sorted(list(common_elements))
# Print the common elements
print("Common elements:", common_elements_list)
symbol = common_elements_list[1]

Common elements: ['AAL.L', 'APF.L', 'AV.L', 'AZN.L', 'BARC.L', 'BATS.L', 'BLT.L', 'CCL.L', 'CEY.L', 'CPG.L', 'ITV.L', 'KGF.L', 'LAND.L', 'LGEN.L', 'LLOY.L', 'MAB.L', 'MKS.L', 'NG.L', 'PRU.L', 'PSON.L', 'RB.L', 'RBS.L', 'RDSa.L', 'RDSb.L', 'REL.L', 'RR.L', 'RSA.L', 'RTO.L', 'SDR.L', 'SGE.L', 'SHP.L', 'SMIN.L', 'SPT.L', 'STAN.L', 'TSCO.L', 'ULVR.L', 'UU.L', 'VOD.L', 'WPP.L']


In [170]:
def find_common_elements(features_dates, labelDates):
    # Convert lists to sets to find common elements
    common_elements = set(features_dates).intersection(labelDates)
    
    # Convert the set back to a list to return
    return list(common_elements)

In [171]:
import os

def get_directory_contents_by_date(date, FeaturesDir, LabelOne):
    # List the contents of both directories
    features_files = os.listdir(FeaturesDir)
    labels_files = os.listdir(LabelOne)
    
    # Prepare the paths
    features_date_path = os.path.join(FeaturesDir, date)
    labels_date_file = f"{date}.csv"
    labels_date_path = os.path.join(LabelOne, labels_date_file)
    
    # Verify the existence of the date directory in features and date file in labels
    if os.path.isdir(features_date_path) and labels_date_file in labels_files:
        return {
            'Features Directory Content': os.listdir(features_date_path),
            'Labels File Path': labels_date_path
        }
    else:
        return "The specified date directory or file does not exist in one or both locations."

# Example usage
FeaturesDir = '/media/ak/DataOnly1/SymbolFeatureDirectories/'
LabelOne = '/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne/'
date = '20170829'  # Change this date to your specific requirement

# # Call the function
# result = get_directory_contents_by_date(date, FeaturesDir, LabelOne)
# print(result)


# # Example usage:
# FeaturesDir = '/media/ak/DataOnly1/SymbolFeatureDirectories/'
# LabelOne = '/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne'


# # # Call the function
# # result = find_common_subdirectories(index, FeaturesDir, LabelOne)
# # FeaturesDirectoryContent = result['Features Directory Content']
# # IndexDate =1 
# # DateId = FeaturesDirectoryContent[IndexDate]
# # FeaturesDateDir = os.path.join(FeaturesDir, DateId)

def check_date_in_list(common_dates, DatesIdx, check_list):
    try:
        # Build the string from the common_dates list at index DatesIdx
        date_with_extension = common_dates[DatesIdx] + '.csv'
        
        # Check if this string is in the provided list
        is_present = date_with_extension in check_list
        
        return is_present
    except IndexError:
        return "Index is out of range"  # Handle the case where DatesIdx is out of the list bounds

# # Example usage
# common_dates = ['2023-04-04', '2023-04-05']
# DatesIdx = 1
# check_list = ['2023-04-05.csv', '2023-04-06.csv']

# # Call the function and print the result
# result = check_date_in_list(common_dates, DatesIdx, check_list)
# print("Is the date file


In [172]:
symbols = list(set(os.listdir(FeaturesDir)).intersection(set(os.listdir(LabelOne))))

In [174]:

def get_paths_for_symbol(index, FeaturesDir, LabelOne):
    # Find the common elements between the two directories
    symbols = sorted(list(set(os.listdir(FeaturesDir)).intersection(set(os.listdir(LabelOne)))))
    
    # Check if the index is valid
    if index < 0 or index >= len(symbols):
        return "Index is out of range."

    # Get the symbol at the provided index
    selected_symbol = symbols[index]

    # Construct the paths for the selected symbol in both directories
    features_path = os.path.join(FeaturesDir, selected_symbol, 'MODEL_BASED')
    labels_path = os.path.join(LabelOne, selected_symbol)

    return {
        'Features Path': features_path,
        'Labels Path': labels_path
    }

# Example usage
FeaturesDir = '/media/ak/DataOnly1/SymbolFeatureDirectories/'
LabelOne = '/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne/'
index = 1  # Adjust this index based on your choice from the list of common symbols



{'Features Path': '/media/ak/DataOnly1/SymbolFeatureDirectories/APF.L/MODEL_BASED', 'Labels Path': '/media/ak/DataOnly1/ExperimentCommonLocs/LabelsAlternateOne/APF.L'}
['20170711', '20170712', '20170713', '20170714', '20170717', '20170718', '20170719', '20170720', '20170721', '20170724', '20170725', '20170726', '20170727', '20170728', '20170731', '20170801', '20170802', '20170803', '20170804', '20170807', '20170808', '20170809', '20170810', '20170811', '20170814', '20170815', '20170816', '20170817', '20170818', '20170821', '20170822', '20170823', '20170824', '20170825', '20170829', '20170830', '20170831', '20170901', '20170904', '20170905', '20170906', '20170907', '20170908', '20170911', '20170912', '20170913', '20170914', '20170915', '20170918', '20170919', '20170920', '20170921', '20170922', '20170925', '20170926', '20170927', '20170928', '20170929']


In [186]:
# # Call the function
# result = get_paths_for_symbol(index, FeaturesDir, LabelOne)
# print(result)
# datesDirs = os.listdir(result['Features Path'])
# dateFiles = os.listdir(result['Labels Path'])
# label_dates = [f.split(".csv")[0] for f in dateFiles]
# datesDirsIdx = 1
# datesDirPath = os.path.join(result['Features Path'], datesDirs[datesDirsIdx])
# features_files = os.listdir(datesDirPath)
# features_dict = {f.split("_")[5]: f for f in features_files if "_" in f and len(f.split("_")) > 5}

# features_dates= list(features_dict.keys())

# common_dates = sorted(list(set(features_dates).intersection(set(label_dates))))
# print(common_dates)
# DatesIdx = 1
# featuresCommonDatePath = os.path.join( datesDirPath, features_dict[common_dates[DatesIdx]])
# labelsCommonDatePath = os.path.join(result['Labels Path'], common_dates[DatesIdx]+str('.csv'))
# data =pd.read_csv(labelsCommonDatePath) # data
# # # Example Usage

# df = pd.DataFrame(data)

# # Initialize the feature creation class
# market_features = CreateMarketFeatures(df)

# # Calculate features
# df_with_features = market_features.ma_spread()
# df_with_features = market_features.obv_calc()
# df_with_features = market_features.chaikin_mf()

# print(df_with_features.columns.values)
# # Automatically detect label column
# label_columns = [col for col in df.columns if 'label' in col]
# if not label_columns:
#     raise ValueError("No label column found.")
# label_column = label_columns[0]  # Use the first label column found

# # Features and target variable
# X = df[['ma_spread', 'OBV', 'CMF']]
# y = df[label_column]

# # Checking for NaNs in features and target
# print("NaNs in X:", X.isna().sum())
# print("NaNs in y:", y.isna().sum())

# # Filling NaNs with the median or mean or dropping rows with NaNs
# X.fillna(X.median(), inplace=True)
# y.dropna(inplace=True)  # Dropping NaNs from y might require re-splitting the data

# # If you drop NaNs from y, you need to make sure that X only includes rows that have corresponding y values
# X = X.loc[y.index]

# # Re-splitting the data if you have dropped any rows
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=16)

# # Instantiate the model
# logreg = LogisticRegression(random_state=16)

# # Fit the model
# logreg.fit(X_train, y_train)

# # Predicting the test set results
# y_pred = logreg.predict(X_test)

# # Evaluating the model
# accuracy = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred, average='binary')
# recall = recall_score(y_test, y_pred, average='binary')
# f1 = f1_score(y_test, y_pred, average='binary')

# print(f"Accuracy: {accuracy}")
# print(f"Precision: {precision}")
# print(f"Recall: {recall}")
# print(f"F1 Score: {f1}")
# print(classification_report(y_test, y_pred))

In [184]:
### Multi-Day Fitting 

In [185]:
# Assuming `common_dates` and other setup from your code are correctly defined
train_dates = common_dates[:6]  # First 6 dates for training
test_dates = common_dates[6:]  # Remaining dates for testing

def load_data_for_dates(dates, features_path, labels_path, features_dict):
    data_frames = []
    for date in dates:
        features_date_path = os.path.join(features_path, features_dict[date])
        labels_date_path = os.path.join(labels_path, date + '.csv')
        df = pd.read_csv(labels_date_path)
        # Assuming feature calculation happens here or is pre-calculated
        market_features = CreateMarketFeatures(df)
        df = market_features.ma_spread()
        df = market_features.obv_calc()
        df = market_features.chaikin_mf()
        data_frames.append(df)
    return pd.concat(data_frames)

# Load training and testing data
train_data = load_data_for_dates(train_dates, datesDirPath, result['Labels Path'], features_dict)
test_data = load_data_for_dates(test_dates, datesDirPath, result['Labels Path'], features_dict)

# Ensure label columns are correctly identified
label_columns = [col for col in train_data.columns if 'label' in col]
if not label_columns:
    raise ValueError("No label column found.")
label_column = label_columns[0]  # Use the first label column found

# Features and target variable setup
X_train = train_data[['ma_spread', 'OBV', 'CMF']]
y_train = train_data[label_column]
X_test = test_data[['ma_spread', 'OBV', 'CMF']]
y_test = test_data[label_column]

# Handling missing data
X_train.fillna(X_train.median(), inplace=True)
y_train.fillna(method='ffill', inplace=True)  # Forward fill for labels, or choose a better imputation method
X_test.fillna(X_test.median(), inplace=True)
y_test.fillna(method='ffill', inplace=True)  # Forward fill for labels, or choose a better imputation method

### Step 2: Train and Evaluate the Model
logreg = LogisticRegression(random_state=16)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))

Accuracy: 0.7406647517439475
Precision: 0.7406647517439475
Recall: 1.0
F1 Score: 0.851013672795851
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       632
         1.0       0.74      1.00      0.85      1805

    accuracy                           0.74      2437
   macro avg       0.37      0.50      0.43      2437
weighted avg       0.55      0.74      0.63      2437



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [188]:

number_of_training_days = 6
# Assuming `common_dates` and other setup from your code are correctly defined
train_dates = common_dates[:number_of_training_days]  # First 6 dates for training
test_dates = common_dates[number_of_training_days:]  # Remaining dates for testing

def load_data_for_dates(dates, features_path, labels_path, features_dict):
    data_frames = []
    for date in dates:
        features_date_path = os.path.join(features_path, features_dict[date])
        labels_date_path = os.path.join(labels_path, date + '.csv')
        df = pd.read_csv(labels_date_path)
        # Assuming feature calculation happens here or is pre-calculated
        market_features = CreateMarketFeatures(df)
        df = market_features.ma_spread()
        df = market_features.obv_calc()
        df = market_features.chaikin_mf()
        data_frames.append(df)
    return pd.concat(data_frames)

# Load training and testing data
train_data = load_data_for_dates(train_dates, datesDirPath, result['Labels Path'], features_dict)
test_data = load_data_for_dates(test_dates, datesDirPath, result['Labels Path'], features_dict)

# Ensure label columns are correctly identified
label_columns = [col for col in train_data.columns if 'label' in col]
if not label_columns:
    raise ValueError("No label column found.")
label_column = label_columns[0]  # Use the first label column found

# Features and target variable setup
X_train = train_data[['ma_spread', 'OBV', 'CMF']]
y_train = train_data[label_column]
X_test = test_data[['ma_spread', 'OBV', 'CMF']]
y_test = test_data[label_column]

# Handling missing data
X_train.fillna(X_train.median(), inplace=True)
y_train.fillna(method='ffill', inplace=True)  # Forward fill for labels, or choose a better imputation method
X_test.fillna(X_test.median(), inplace=True)
y_test.fillna(method='ffill', inplace=True)  # Forward fill for labels, or choose a better imputation method

### Step 2: Train and Evaluate the Model
logreg = LogisticRegression(random_state=16)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(classification_report(y_test, y_pred))


Accuracy: 0.7406647517439475
Precision: 0.7406647517439475
Recall: 1.0
F1 Score: 0.851013672795851
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       632
         1.0       0.74      1.00      0.85      1805

    accuracy                           0.74      2437
   macro avg       0.37      0.50      0.43      2437
weighted avg       0.55      0.74      0.63      2437



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [191]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, matthews_corrcoef, log_loss
from sklearn.preprocessing import LabelBinarizer

# Assuming y_test and y_pred are available from your logistic regression model predictions
# Calculate F1 scores
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print("F1 Micro: {:.2f}".format(f1_micro))
print("F1 Macro: {:.2f}".format(f1_macro))
print("F1 Weighted: {:.2f}".format(f1_weighted))

# Classification report
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred)
print("Matthews Correlation Coefficient: {:.2f}".format(mcc))

# If the logistic regression model outputs probabilities (you need to adjust model prediction if not)
# Assuming you have predicted probabilities y_prob for ROC and Log-Loss; if not use logreg.predict_proba(X_test)
# y_prob = logreg.predict_proba(X_test)[:, 1]  # Adjust according to your specific model

# ROC-AUC score
# Note: ROC-AUC can be computed for binary classification, for multi-class you would adjust the method
if len(np.unique(y_test)) == 2:  # Check if it's a binary classification
    roc_auc = roc_auc_score(y_test, y_prob[:, 1])  # adjust indexing based on your predict_proba output
    print("ROC-AUC Score: {:.2f}".format(roc_auc))
else:
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test_binarized = lb.transform(y_test)
    y_prob = logreg.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test_binarized, y_prob, multi_class='ovr')  # ovr: One-vs-Rest
    print("ROC-AUC Score for Multi-class: {:.2f}".format(roc_auc))

# Log-Loss
logloss = log_loss(y_test, y_prob)
print("Log Loss: {:.2f}".format(logloss))


F1 Micro: 0.74
F1 Macro: 0.43
F1 Weighted: 0.63
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       632
         1.0       0.74      1.00      0.85      1805

    accuracy                           0.74      2437
   macro avg       0.37      0.50      0.43      2437
weighted avg       0.55      0.74      0.63      2437

Confusion Matrix:
 [[   0  632]
 [   0 1805]]
Matthews Correlation Coefficient: 0.00


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


NameError: name 'y_prob' is not defined

In [192]:
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, matthews_corrcoef, log_loss
from sklearn.preprocessing import LabelBinarizer

# Assuming y_test and y_pred are available from your logistic regression model predictions

# Instantiate the model (if not already instantiated)
logreg = LogisticRegression(random_state=16)

# Train the model
logreg.fit(X_train, y_train)

# Predicting the test set results
y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)  # This will be used for ROC-AUC and log loss

# Calculate F1 scores
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print("F1 Micro: {:.2f}".format(f1_micro))
print("F1 Macro: {:.2f}".format(f1_macro))
print("F1 Weighted: {:.2f}".format(f1_weighted))

# Classification report
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred)
print("Matthews Correlation Coefficient: {:.2f}".format(mcc))

# ROC-AUC score
# Note: ROC-AUC can be computed for binary classification, for multi-class you would adjust the method
if len(np.unique(y_test)) == 2:  # Check if it's a binary classification
    roc_auc = roc_auc_score(y_test, y_prob[:, 1])  # Adjust indexing based on your predict_proba output
    print("ROC-AUC Score: {:.2f}".format(roc_auc))
else:
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test_binarized = lb.transform(y_test)
    roc_auc = roc_auc_score(y_test_binarized, y_prob, multi_class='ovr')  # ovr: One-vs-Rest
    print("ROC-AUC Score for Multi-class: {:.2f}".format(roc_auc))

# Log-Loss
logloss = log_loss(y_test, y_prob)
print("Log Loss: {:.2f}".format(logloss))


F1 Micro: 0.74
F1 Macro: 0.43
F1 Weighted: 0.63
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       632
         1.0       0.74      1.00      0.85      1805

    accuracy                           0.74      2437
   macro avg       0.37      0.50      0.43      2437
weighted avg       0.55      0.74      0.63      2437

Confusion Matrix:
 [[   0  632]
 [   0 1805]]
Matthews Correlation Coefficient: 0.00
ROC-AUC Score: 0.51
Log Loss: 0.69


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
