<h1 style="text-align: center" >Stock Prediction Using Logistic Regression</h1>

## Import Libraries

In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Load Data

In [2]:
def load_data(directory):
    stock_data = []

    files = [f for f in os.listdir(directory) if f.endswith(".txt")]

    for filename in tqdm(files, desc="Loading data"):
        file_path = os.path.join(directory, filename)
        if os.stat(file_path).st_size != 0:
            symbol = filename.split(".")[0]
            stock_df = pd.read_csv(file_path, parse_dates=["Date"])
            stock_df.set_index("Date", inplace=True)
            stock_df["Symbol"] = symbol
            stock_data.append(stock_df)

    combined_df = pd.concat(stock_data)
    return combined_df

In [3]:
data = load_data("data/Stocks/")

Loading data:   0%|          | 0/7195 [00:00<?, ?it/s]

## Preprocess Data

In [4]:
def preprocess_data(data, days_ahead=30):
    data["Label"] = data["Close"].shift(-days_ahead) > data["Close"]
    data["Label"] = data["Label"].astype(int)
    data["30_day_moving_average"] = data["Close"].rolling(window=30).mean()
    
    data.dropna(inplace=True)
    return data

In [5]:
data = preprocess_data(data)

In [6]:
le = LabelEncoder()
data["Symbol"] = le.fit_transform(data["Symbol"])

In [7]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OpenInt,Symbol,Label,30_day_moving_average
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-12-21,105.75,105.77,105.13,105.13,9034,0,2452,1,100.559177
2016-12-22,105.45,105.82,105.155,105.155,4604,0,2452,1,100.722677
2016-12-23,105.31,105.75,105.31,105.75,14041,0,2452,1,100.831677
2016-12-27,105.85,105.94,105.651,105.78,3701,0,2452,1,101.069677
2016-12-28,104.92,104.93,104.73,104.73,3020,0,2452,1,101.27901


In [8]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,OpenInt,Symbol,Label,30_day_moving_average
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-11-06,82.847,83.574,82.299,82.887,452189,0,5541,0,78.687333
2017-11-07,82.887,82.887,82.011,82.14,828976,0,5541,0,78.8928
2017-11-08,82.16,82.75,81.78,82.19,1230839,0,5541,0,79.104567
2017-11-09,82.05,82.9,81.97,82.72,597998,0,5541,0,79.335667
2017-11-10,82.32,83.07,81.52,83.06,374875,0,5541,0,79.5499


## Train and Test Models

In [9]:
X = data.drop(columns=["Label", "Symbol"])
y = data["Label"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

#### Training

In [12]:
lr = LogisticRegression(max_iter=1000)
scores = cross_val_score(lr, X_train, y_train, scoring="accuracy", cv=5)

In [13]:
print("Cross-validation scores:", scores)
print("Average Train Accuracy:", scores.mean())

Cross-validation scores: [0.54789502 0.54796387 0.54794708 0.54791121 0.54791834]
Average Train Accuracy: 0.5479271052769075


#### Testing

In [14]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [15]:
print("Test Accuracy:", accuracy_score(y_test, y_pred))
# print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# print("Classification report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.5481483297554213
