In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

# Load and preprocess data for each stock index (replace with actual file paths)
file_paths = [
    'Processed_S&P.csv',
    'Processed_RUSSELL.csv',
    'Processed_NYSE.csv',
    'Processed_NASDAQ.csv',
    'Processed_DJI.csv'
]

# Define the target variable ('Price_Up') as binary (1 if price goes up, 0 if it goes down)
selected_features = [
    'Volume', 'mom', 'mom1', 'mom2', 'mom3', 'DTB4WK', 'DTB3', 'DTB6', 'DGS5', 'DGS10'
]

test_size = 0.2
random_state = 42

for file_path in file_paths:
    # Load data
    data = pd.read_csv(file_path)

    # Create a binary target variable indicating if the price goes up (1) or down (0)
    data['Price_Up'] = (data['Close'].shift(-1) > data['Close']).astype(int)

    # Select features and target variable
    X = data[selected_features]
    y = data['Price_Up']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Create an imputer to fill missing values with the mean
    imputer = SimpleImputer(strategy='mean')

    # Fit and transform the imputer on your feature data
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Create and train a Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {file_path}: {accuracy:.2f}')
    # print(classification_report(y_test, y_pred, zero_division=0))  # Set zero_division to 0 to suppress the warning


Accuracy for Processed_S&P.csv: 0.58
Accuracy for Processed_RUSSELL.csv: 0.55
Accuracy for Processed_NYSE.csv: 0.54
Accuracy for Processed_NASDAQ.csv: 0.59
Accuracy for Processed_DJI.csv: 0.58
