In [1]:
"""Module imports"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import scipy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

seed = 42

In [2]:
"""Train/test/validation data preparation for label_3"""

# For the sake of my current sanity the model does NOT have any strings in it

df = pd.read_csv('training_data.csv')
df = df.drop(columns=['title', 'label_7']) # no label_7 here
df['stock_id'] = LabelEncoder().fit_transform(df['stock'])
df = df.drop(columns=['stock'])

df = df.sort_values(['stock_id', 'date'])
df['prev_close'] = df.groupby('stock_id')['close'].shift(1)
df = df.drop(columns=['date'])
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

def sort_by_stock_and_time(df, train_size = 0.15, val_size = 0.15, test_size = 0.7):
    train_stocks = []
    val_stocks = []
    test_stocks = []

    for stock in df['stock_id'].unique():
        stock_df = df[df['stock_id'] == stock]
        n = len(stock_df)
        train_end = int(n * train_size)
        val_end = int(n * val_size)
        test_end = int(n * test_size)
        
        train_stocks.append(stock_df.iloc[:train_end])
        val_stocks.append(stock_df.iloc[train_end:train_end + val_end])
        test_stocks.append(stock_df.iloc[train_end + val_end:])
        
    train_df = pd.concat(train_stocks).reset_index(drop=True)
    val_df = pd.concat(val_stocks).reset_index(drop=True)
    test_df = pd.concat(test_stocks).reset_index(drop=True)
    
    return train_df, val_df, test_df

train_df, val_df, test_df = sort_by_stock_and_time(df)

# Separate features and target for each set
X_train = train_df.drop(columns=['label_3'])
y_train = train_df['label_3']

X_val = val_df.drop(columns=['label_3'])
y_val = val_df['label_3']

X_test = test_df.drop(columns=['label_3'])
y_test = test_df['label_3']

In [3]:
"""Logistic Regression, solver = 'sag', penalty = 'l2', max_iter=2000, random_state=seed, class_weight is unbalanced for now"""

model = LogisticRegression(solver = 'sag', penalty='l2', max_iter=2000, random_state = seed)

model.fit(X_train, y_train)

# Calculate training accuracy and error 
tr_err = 1 - model.score(X_train, y_train)
tr_acc = model.score(X_train, y_train)

# Calculate validation accuracy and error
val_acc = model.score(X_val, y_val)  # Accuracy on validation set
val_err = 1 - val_acc  

# Calculate testing accuracy and error
test_acc = model.score(X_test, y_test)  
test_err = 1 - test_acc  



In [4]:
"""Preparing data for confusion matrix"""
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)

labels = [0, 1, 2]

In [5]:
print(f'Label 3 Training Accuracy:{tr_acc}')
print(f'Label 3 Training Error:{tr_err}')
#cm_train = metrics.confusion_matrix(y_train, y_train_pred, labels=labels)
#cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm_train, display_labels = labels).plot()

Label 3 Training Accuracy:0.7671077052026583
Label 3 Training Error:0.23289229479734175


In [6]:
print(f'Label 3 Validation Accuracy:{val_acc}')
print(f'Label 3 Validation Error:{val_err}')
#cm_val = metrics.confusion_matrix(y_val, y_val_pred, labels=labels)
#cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm_val, display_labels = labels).plot()

Label 3 Validation Accuracy:0.7789618631383584
Label 3 Validation Error:0.22103813686164164


In [7]:
print(f'Label 3 Test Accuracy:{test_acc}')
print(f'Label 3 Test Error:{test_err}')
#cm_val = metrics.confusion_matrix(y_val, y_val_pred, labels=labels)
#cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm_val, display_labels = labels).plot()

Label 3 Test Accuracy:0.7453223240663586
Label 3 Test Error:0.25467767593364143


In [17]:
"""Train/test/validation data preparation for label_7"""

# For the sake of my current sanity the model does NOT have any strings in it

df = pd.read_csv('training_data.csv')
df = df.drop(columns=['title', 'label_3']) # no label_3 here
df['stock_id'] = LabelEncoder().fit_transform(df['stock'])
df = df.drop(columns=['stock'])

df = df.sort_values(['stock_id', 'date']).
df['prev_close'] = df.groupby('stock_id')['close'].shift(1)
df = df.drop(columns=['date'])
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

def sort_by_stock_and_time(df, train_size = 0.15, val_size = 0.15, test_size = 0.7):
    train_stocks = []
    val_stocks = []
    test_stocks = []

    for stock in df['stock_id'].unique():
        stock_df = df[df['stock_id'] == stock]
        n = len(stock_df)
        train_end = int(n * train_size)
        val_end = int(n * val_size)
        test_end = int(n * test_size)
        
        train_stocks.append(stock_df.iloc[:train_end])
        val_stocks.append(stock_df.iloc[train_end:train_end + val_end])
        test_stocks.append(stock_df.iloc[train_end + val_end:])
        
    train_df = pd.concat(train_stocks).reset_index(drop=True)
    val_df = pd.concat(val_stocks).reset_index(drop=True)
    test_df = pd.concat(test_stocks).reset_index(drop=True)
    
    return train_df, val_df, test_df

train_df, val_df, test_df = sort_by_stock_and_time(df)

# Separate features and target for each set
X_train = train_df.drop(columns=['label_7'])
y_train = train_df['label_7']

X_val = val_df.drop(columns=['label_7'])
y_val = val_df['label_7']

X_test = test_df.drop(columns=['label_7'])
y_test = test_df['label_7']

SyntaxError: invalid syntax (3017569119.py, line 10)

In [9]:
"""Logistic Regression, solver = 'sag', penalty = 'l2', max_iter=2000, random_state=seed, class_weight is unbalanced for now"""

model = LogisticRegression(solver = 'sag', penalty='l2', max_iter=2000, random_state = seed)

model.fit(X_train, y_train)

# Calculate training accuracy and error 
tr_err = 1 - model.score(X_train, y_train)
tr_acc = model.score(X_train, y_train)

# Calculate validation accuracy and error
val_acc = model.score(X_val, y_val)  # Accuracy on validation set
val_err = 1 - val_acc  

# Calculate testing accuracy and error
test_acc = model.score(X_test, y_test)  
test_err = 1 - test_acc 



In [10]:
print(f'Label 7 Training Accuracy:{tr_acc}')
print(f'Label 7 Training Error:{tr_err}')
#cm_train = metrics.confusion_matrix(y_train, y_train_pred, labels=labels)
#cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm_train, display_labels = labels).plot()

Label 7 Training Accuracy:0.6696401843980123
Label 7 Training Error:0.3303598156019877


In [11]:
print(f'Label 7 Validation Accuracy:{val_acc}')
print(f'Label 7 Validation Error:{val_err}')
#cm_val = metrics.confusion_matrix(y_val, y_val_pred, labels=labels)
#cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm_val, display_labels = labels).plot()

Label 7 Validation Accuracy:0.6883793330539424
Label 7 Validation Error:0.3116206669460576


In [12]:
print(f'Label 7 Test Accuracy:{test_acc}')
print(f'Label 7 Test Error:{test_err}')
#cm_val = metrics.confusion_matrix(y_val, y_val_pred, labels=labels)
#cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm_val, display_labels = labels).plot()

Label 7 Test Accuracy:0.6622099335908895
Label 7 Test Error:0.33779006640911047
