In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from ydata_profiling import ProfileReport

In [20]:
stocks = pd.read_csv('../datasets/stock_data.csv')

# Convert Date to numerical values
stocks['Date'] = pd.to_datetime(stocks['Date'])
stocks['Year'] = stocks['Date'].dt.year
stocks['Month'] = stocks['Date'].dt.month
stocks['Day'] = stocks['Date'].dt.day
stocks['DayOfWeek'] = stocks['Date'].dt.dayofweek

# Drop Date and target variable
X = stocks.drop(columns=['Close', 'Date'])
Y = stocks['Close']
# ProfileReport(stocks, title="Stock Data Profiling Report", explorative=True).to_file(
#     output_file='stock-data-profiling-report.html')
stocks

Unnamed: 0,Date,Company,Sector,Open,High,Low,Close,Volume,Market_Cap,PE_Ratio,Dividend_Yield,Volatility,Sentiment_Score,Trend,Year,Month,Day,DayOfWeek
0,2022-01-01,Uber,Technology,100.000000,101.036120,97.477809,100.000000,171958,5.156585e+11,24.253218,0.163090,0.047484,0.939232,Bearish,2022,1,1,5
1,2022-01-02,Tesla,Automotive,100.071106,102.037960,97.152675,100.071106,196867,9.755658e+11,18.602848,0.288515,0.022472,0.469417,Bearish,2022,1,2,6
2,2022-01-03,Panasonic,Finance,99.857828,101.517497,98.108097,99.857828,181932,4.595300e+11,10.728933,2.221827,0.019991,0.399193,Bullish,2022,1,3,0
3,2022-01-04,Tencent,Automotive,98.851663,101.303783,96.998218,98.851663,153694,5.577481e+11,14.582696,1.377740,0.036166,0.705853,Stable,2022,1,4,1
4,2022-01-05,Wells Fargo,Automotive,98.391118,99.990486,96.230707,98.391118,169879,8.607714e+11,37.491114,3.110198,0.034770,-0.768354,Stable,2022,1,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2024-09-22,Pfizer,Energy,121.428608,123.693983,119.425495,121.428608,54757,4.469189e+10,22.313632,0.083782,0.018820,0.627218,Stable,2024,9,22,6
996,2024-09-23,Panasonic,Consumer Goods,122.302376,124.731488,120.182416,122.302376,69097,1.116972e+11,22.945209,3.395996,0.017438,0.429906,Bullish,2024,9,23,0
997,2024-09-24,Goldman Sachs,Energy,120.638302,122.485538,118.135033,120.638302,191467,5.989330e+11,21.854227,0.382841,0.048033,-0.502406,Bearish,2024,9,24,1
998,2024-09-25,Lyft,Aerospace,120.228806,123.124765,117.959012,120.228806,197313,9.591669e+10,27.240528,2.103806,0.018396,0.820433,Bullish,2024,9,25,2


In [10]:
# Scale numerical values
numeric_features = ['Open', 'High', 'Low', 'Volume', 'Market_Cap', 'PE_Ratio', 'Dividend_Yield', 'Volatility', 'Sentiment_Score']
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

In [11]:
# One-hot encode categorical values
categorical_features = ['Company', 'Sector', 'Trend']
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), categorical_features)],
    remainder='passthrough')

X = column_transformer.fit_transform(X)

In [12]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [13]:
# Train Linear Regression Model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [14]:
# Predict
Y_pred = regressor.predict(X_test)

In [15]:
# Evaluate
r2 = r2_score(y_test, Y_pred) * 100
print(f"R² Score: {r2}")

R² Score: 99.99999995286439
