<h1 style="text-align: center" >Stock Prediction Using Logistic Regression</h1>

## Import Libraries

In [1]:
import os
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Load Data

In [2]:
def load_data(directory):
    stock_data = []

    files = [f for f in os.listdir(directory) if f.endswith(".csv")]

    for filename in tqdm(files, desc="Loading data"):
        file_path = os.path.join(directory, filename)
        if os.stat(file_path).st_size != 0:
            symbol = filename.split(".")[0]
            stock_df = pd.read_csv(
                file_path,
                parse_dates=["Date"],
                dayfirst=True,
                on_bad_lines="skip",
            )
            stock_df.set_index("Date", inplace=True)
            stock_df["Symbol"] = symbol
            stock_data.append(stock_df)

    combined_df = pd.concat(stock_data)
    return combined_df

In [3]:
data = load_data("nasdaq/")

Loading data:   0%|          | 0/1564 [00:00<?, ?it/s]

## Preprocess Data

In [4]:
def preprocess_data(data, days_ahead=30):
    data["Label"] = data["Close"].shift(-days_ahead) > data["Close"]
    data["Label"] = data["Label"].astype(int)
    data["30_day_moving_average"] = data["Close"].rolling(window=30).mean()
    
    data.dropna(inplace=True)
    return data

In [5]:
data = preprocess_data(data)

In [6]:
le = LabelEncoder()
data["Symbol"] = le.fit_transform(data["Symbol"])

In [7]:
data.head()

Unnamed: 0_level_0,Low,Open,Volume,High,Close,Adjusted Close,Symbol,Label,30_day_moving_average
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1996-08-29 00:00:00,8.875,8.9375,145600.0,9.0,8.875,7.718067,1100,1,7.504167
1996-08-30 00:00:00,8.6875,9.0,129800.0,9.0625,9.0625,7.881122,1100,1,7.6
1996-09-03 00:00:00,8.8125,8.8125,108600.0,9.25,9.0625,7.881122,1100,1,7.702083
1996-09-04 00:00:00,9.0625,9.25,58800.0,9.25,9.0625,7.881122,1100,1,7.829167
1996-09-05 00:00:00,9.0625,9.125,150200.0,9.125,9.125,8.438284,1100,1,7.966667


In [8]:
data.tail()

Unnamed: 0_level_0,Low,Open,Volume,High,Close,Adjusted Close,Symbol,Label,30_day_moving_average
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-12-06 00:00:00,115.260002,118.160004,75700.0,118.209999,117.279999,117.279999,921,0,116.012
2022-12-07 00:00:00,116.220001,117.779999,89200.0,118.68,116.5,116.5,921,0,116.281
2022-12-08 00:00:00,114.720001,116.190002,81800.0,117.620003,116.029999,116.029999,921,0,116.493667
2022-12-09 00:00:00,113.849998,116.059998,200600.0,118.129997,116.790001,116.790001,921,0,116.733667
2022-12-12 00:00:00,116.43,116.699997,6956.0,117.730003,117.730003,117.730003,921,0,116.912667


## Train and Test Models

In [9]:
X = data.drop(columns=["Label", "Symbol"])
y = data["Label"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

#### Training

In [12]:
lr = LogisticRegression(max_iter=1000)
scores = cross_val_score(lr, X_train, y_train, scoring="accuracy", cv=5)

In [13]:
print("Cross-validation scores:", scores)
print("Average Train Accuracy:", scores.mean())

Cross-validation scores: [0.52367739 0.52297893 0.52406096 0.5230761  0.52351192]
Average Train Accuracy: 0.5234610608229989


#### Testing

In [14]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5235344830105528


### Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rf = RandomForestClassifier(n_estimators=75, max_depth=5)
scores = cross_val_score(rf, X_train, y_train, scoring="accuracy", cv=5)

In [18]:
print("Cross-validation scores:", scores)
print("Average Train Accuracy:", scores.mean())

Cross-validation scores: [0.53057209 0.52993574 0.53002122 0.5298751  0.53007129]
Average Train Accuracy: 0.5300950850964666


In [19]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [20]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5300216550792997
