<h1 style="text-align: center" >Stock Prediction Using Logistic Regression</h1>

## Import Libraries

In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Load Data

In [2]:
def load_data(directory):
    stock_data = []

    files = [f for f in os.listdir(directory) if f.endswith(".csv")]

    for filename in tqdm(files, desc="Loading data"):
        file_path = os.path.join(directory, filename)
        if os.stat(file_path).st_size != 0:
            symbol = filename.split(".")[0]
            stock_df = pd.read_csv(
                file_path,
                parse_dates=["Date"],
                dayfirst=True,
                on_bad_lines="skip",
            )
            stock_df.set_index("Date", inplace=True)
            stock_df["Symbol"] = symbol
            stock_data.append(stock_df)

    combined_df = pd.concat(stock_data)
    return combined_df

In [3]:
data = load_data("nasdaq/")

Loading data:   0%|          | 0/1564 [00:00<?, ?it/s]

## Preprocess Data

In [4]:
def preprocess_data(data, days_ahead=30):
    data["Label"] = data["Close"].shift(-days_ahead) > data["Close"]
    data["Label"] = data["Label"].astype(int)
    data["30_day_moving_average"] = data["Close"].rolling(window=30).mean()
    
    data.dropna(inplace=True)
    return data

In [5]:
data = preprocess_data(data)

In [6]:
le = LabelEncoder()
data["Symbol"] = le.fit_transform(data["Symbol"])

In [7]:
data.head()

Unnamed: 0_level_0,Low,Open,Volume,High,Close,Adjusted Close,Symbol,Label,30_day_moving_average
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-11-07 00:00:00,28.379999,29.0,486900.0,29.43,28.93,27.273512,0,1,22.847
2005-11-08 00:00:00,29.0,29.1,1395000.0,29.700001,29.43,27.744884,0,1,23.184667
2005-11-09 00:00:00,29.43,29.43,1312300.0,31.299999,31.299999,29.507811,0,1,23.544666
2005-11-10 00:00:00,31.15,31.15,1948900.0,33.959999,32.799999,30.921915,0,1,23.964333
2005-11-11 00:00:00,33.0,33.049999,1183900.0,33.950001,33.349998,31.440432,0,1,24.375666


In [8]:
data.tail()

Unnamed: 0_level_0,Low,Open,Volume,High,Close,Adjusted Close,Symbol,Label,30_day_moving_average
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-12-06 00:00:00,22.1,22.66,515300.0,23.049999,22.52,22.52,1562,0,22.981
2022-12-07 00:00:00,22.059999,22.379999,494000.0,23.18,22.99,22.99,1562,0,22.995
2022-12-08 00:00:00,22.690001,23.02,370400.0,23.549999,23.51,23.51,1562,0,23.026667
2022-12-09 00:00:00,22.120001,23.09,427400.0,23.09,22.700001,22.700001,1562,0,23.044333
2022-12-12 00:00:00,22.290001,22.74,48442.0,22.74,22.645,22.645,1562,0,23.043833


## Train and Test Models

In [9]:
X = data.drop(columns="Label")
y = data["Label"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

#### Training

In [12]:
lr = LogisticRegression(max_iter=1000)
scores = cross_val_score(lr, X_train, y_train, scoring="accuracy", cv=5)

In [13]:
print("Cross-validation scores:", scores)
print("Average Train Accuracy:", scores.mean())

Cross-validation scores: [0.52241795 0.5226437  0.52353722 0.52364281 0.52301011]
Average Train Accuracy: 0.5230503587080625


#### Testing

In [14]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5229886132369805


### Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf = RandomForestClassifier(n_estimators=75, max_depth=5)
scores = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=5)

In [18]:
print("Cross-validation scores:", scores)
print("Average Train Accuracy:", scores.mean())

Cross-validation scores: [0.53075404 0.52893705 0.53010819 0.52964611 0.52996027]
Average Train Accuracy: 0.5298811335632346


In [19]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [20]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5299029652809518
