<h1 style="text-align: center" >Stock Prediction Using Logistic Regression</h1>

## Import Libraries

In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Load Data

In [2]:
def load_data(directory):
    stock_data = []

    files = [f for f in os.listdir(directory) if f.endswith(".txt")]

    for filename in tqdm(files, desc="Loading data"):
        file_path = os.path.join(directory, filename)
        if os.stat(file_path).st_size != 0:
            symbol = filename.split(".")[0]
            stock_df = pd.read_csv(file_path, parse_dates=["Date"])
            stock_df["Symbol"] = symbol
            stock_data.append(stock_df)

    combined_df = pd.concat(stock_data)
    return combined_df

In [3]:
data = load_data("data/Stocks/")

Loading data:   0%|          | 0/7195 [00:00<?, ?it/s]

## Preprocess Data

In [4]:
def preprocess_data(data, days_ahead=30):
    data["Label"] = data["Close"].shift(-days_ahead) > data["Close"]
    data["Label"] = data["Label"].astype(int)
    data["30_day_moving_average"] = data["Close"].rolling(window=30).mean()
    data["Date"] = data["Date"].apply(lambda x: x.toordinal())
    
    data.dropna(inplace=True)
    return data

In [5]:
data = preprocess_data(data)

In [6]:
le = LabelEncoder()
data["Symbol"] = le.fit_transform(data["Symbol"])

In [7]:
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,Symbol,Label,30_day_moving_average
29,730119,53.666,53.966,51.473,52.188,2046399,0,0,0,32.633967
30,730122,53.161,53.286,45.483,48.602,4953196,0,0,1,33.263967
31,730123,45.984,46.495,43.708,44.89,5049339,0,0,1,33.851733
32,730124,44.722,44.89,40.71,42.109,6102164,0,0,1,34.2653
33,730125,41.596,41.854,39.232,40.502,2685624,0,0,1,34.7153


In [8]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,Symbol,Label,30_day_moving_average
569,736639,10.42,11.54,10.42,11.19,977948,0,7162,0,9.496
570,736640,11.3,11.42,10.67,10.83,451210,0,7162,0,9.662333
571,736641,10.7,11.06,10.35,10.9,336449,0,7162,0,9.819333
572,736642,11.0,11.8563,10.97,11.6,463067,0,7162,0,9.891333
573,736643,11.68,13.15,11.3043,12.46,885587,0,7162,0,10.028


## Train and Test Models

In [9]:
# features = [
#     "Open",
#     "High",
#     "Low",
#     "Close",
#     "Volume",
#     "OpenInt",
#     "30_day_moving_average",
#     "Symbol",
# ]
# target = "Label"

X = data.drop(columns="Label")
y = data["Label"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

#### Training

In [12]:
lr = LogisticRegression(max_iter=1000)
scores = cross_val_score(lr, X_train, y_train, scoring="accuracy", cv=5)

In [13]:
print("Cross-validation scores:", scores)
print("Average Train Accuracy:", scores.mean())

Cross-validation scores: [0.54778125 0.54777706 0.54778377 0.54777645 0.54778106]
Average Train Accuracy: 0.5477799193758506


#### Testing

In [14]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [15]:
print("Test Accuracy:", accuracy_score(y_test, y_pred))
# print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# print("Classification report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.547252284445352
