In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [37]:
# Read the raw stock-market CSV into `df` and preview its first five rows.
df = pd.read_csv('stock_market_data_large.csv')
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI,MACD,Sentiment,Target
0,2010-01-01,100.309172,102.741242,99.822837,101.38133,743835,75.9058,0.0,0.058089,0
1,2010-01-04,101.147509,101.681257,99.244474,100.396541,2693069,81.444127,-0.078559,0.751219,1
2,2010-01-05,101.567491,103.949668,101.509347,103.012156,3027306,71.544018,0.069441,0.979723,1
3,2010-01-06,104.247814,106.652559,102.760082,105.846343,4341338,73.683548,0.410693,-0.858008,0
4,2010-01-07,105.33944,105.983592,103.517541,104.721465,2106182,72.675179,0.583641,0.264456,0


In [38]:
# Per-column count of missing values.
df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
RSI          0
MACD         0
Sentiment    0
Target       0
dtype: int64

In [39]:
# Count rows that exactly repeat an earlier row (first occurrence is not counted).
df.duplicated(keep='first').sum()

np.int64(0)

In [40]:
# Mean-impute missing values in the numeric columns.
#
# `num_cols` keeps every numeric column (same value as before, in case later
# cells rely on it). The imputation itself skips 'Target': it holds 0/1 class
# labels, and filling a missing label with the column mean would create a
# fractional value that is not a valid class. Rows with a missing label, if
# any ever appear, must be handled explicitly rather than silently averaged.
#
# Duplicate removal (df.drop_duplicates) is not applied in this cell.
num_cols = df.select_dtypes(include='number').columns
impute_cols = num_cols.drop('Target', errors='ignore')  # no error if 'Target' is absent
df[impute_cols] = df[impute_cols].fillna(df[impute_cols].mean())



In [41]:
# Re-count missing values per column after the mean-fill step.
df.isna().sum()


Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
RSI          0
MACD         0
Sentiment    0
Target       0
dtype: int64

In [44]:
# Feature engineering: derive extra columns from the raw OHLC data.

# Parse the Date column into datetime values.
# Note: no day-of-week feature is created here (df['Date'].dt.dayofweek is
# not computed in this cell).
df['Date'] = pd.to_datetime(df['Date'])

# Price_diff: close minus open for each row.
df['Price_diff'] = df['Close'].sub(df['Open'])

# Volatility: high minus low for each row.
df['Volatility'] = df['High'].sub(df['Low'])

# MA5: rolling mean of Close over up to 5 rows in the current row order;
# min_periods=1 means the first rows average over however many rows exist
# so far instead of producing NaN.
close_window = df['Close'].rolling(window=5, min_periods=1)
df['MA5'] = close_window.mean()


In [45]:
# Preview the first ten rows, including the engineered columns.
df.head(n=10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI,MACD,Sentiment,Target,Price_diff,Volatility,MA5
0,2010-01-01,100.309172,102.741242,99.822837,101.38133,743835,75.9058,0.0,0.058089,0,1.072158,2.918405,101.38133
1,2010-01-04,101.147509,101.681257,99.244474,100.396541,2693069,81.444127,-0.078559,0.751219,1,-0.750968,2.436783,100.888936
2,2010-01-05,101.567491,103.949668,101.509347,103.012156,3027306,71.544018,0.069441,0.979723,1,1.444665,2.440321,101.596676
3,2010-01-06,104.247814,106.652559,102.760082,105.846343,4341338,73.683548,0.410693,-0.858008,0,1.598528,3.892477,102.659093
4,2010-01-07,105.33944,105.983592,103.517541,104.721465,2106182,72.675179,0.583641,0.264456,0,-0.617975,2.466051,103.071567
5,2010-01-08,103.706622,106.241324,103.106346,104.405219,3688429,62.470552,0.687263,-0.063076,1,0.698597,3.134978,103.676345
6,2010-01-11,106.970913,108.267172,105.139529,106.597955,3909135,65.151281,0.935535,0.0784,1,-0.372958,3.127643,104.916627
7,2010-01-12,108.463029,111.081403,107.759084,109.60306,1756061,55.483691,1.359113,-0.59656,0,1.140031,3.322318,106.234808
8,2010-01-13,107.276693,109.4982,106.060857,107.79397,915340,46.913708,1.531172,0.788708,1,0.517277,3.437343,106.624334
9,2010-01-14,108.830229,110.679572,106.818216,109.00393,3990597,46.387851,1.745048,-0.426418,0,0.173701,3.861356,107.480827


In [46]:
# Normalization: rescale the numeric feature columns with min-max scaling.
# 'Target' and 'Date' are not in the list, so they are left unscaled.
num_features = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'RSI', 'MACD', 'Sentiment',
    'Price_diff', 'Volatility', 'MA5',
]

# Learn each column's min/max from every row of df, then overwrite the
# columns with their rescaled values.
# NOTE(review): the scaler is fit on all rows; if a train/test split is done
# later, test-row statistics will have influenced the scaling - confirm
# whether the scaler should be fit on the training portion only.
scaler = MinMaxScaler()
scaler.fit(df[num_features])
df[num_features] = scaler.transform(df[num_features])

# Alternative (not applied): StandardScaler could be used on the same
# columns in place of MinMaxScaler.


In [47]:
# Preview the first ten rows after scaling.
df.head(n=10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,RSI,MACD,Sentiment,Target,Price_diff,Volatility,MA5
0,2010-01-01,0.415504,0.41886,0.419937,0.418462,0.131342,0.777604,0.503257,0.528935,0,0.770116,0.480082,0.417678
1,2010-01-04,0.417705,0.416073,0.41842,0.415885,0.529175,0.834341,0.497902,0.875645,1,0.311792,0.35853,0.416365
2,2010-01-05,0.418807,0.422037,0.424361,0.422729,0.597391,0.732921,0.507989,0.989945,1,0.863762,0.359423,0.418252
3,2010-01-06,0.425842,0.429144,0.427642,0.430146,0.865582,0.754839,0.531247,0.070695,0,0.902443,0.725917,0.421084
4,2010-01-07,0.428707,0.427385,0.429629,0.427202,0.409393,0.744509,0.543034,0.632162,0,0.345226,0.365917,0.422184
5,2010-01-08,0.424422,0.428062,0.428551,0.426375,0.732325,0.639969,0.550096,0.468327,1,0.676205,0.53474,0.423797
6,2010-01-11,0.43299,0.433389,0.433884,0.432112,0.77737,0.667431,0.567017,0.539095,1,0.406821,0.532889,0.427104
7,2010-01-12,0.436906,0.440788,0.440756,0.439976,0.337934,0.568393,0.595886,0.201473,0,0.787179,0.582021,0.430618
8,2010-01-13,0.433792,0.436626,0.436301,0.435242,0.166345,0.4806,0.607612,0.894398,1,0.630622,0.611051,0.431657
9,2010-01-14,0.43787,0.439732,0.438288,0.438408,0.793996,0.475212,0.622189,0.28658,0,0.544249,0.718063,0.43394
