In [15]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras import backend as K
import gc, os, datetime
import plotly.express as px
import plotly.graph_objects as go



In [4]:
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:(i + seq_length), :])
        y.append(data[i + seq_length, 0])  
    return np.array(X), np.array(y)

In [5]:
spark = SparkSession.builder \
        .appName("LSTM_StockML_Incremental") \
        .config("spark.jars.packages", "org.postgresql:postgresql:42.2.23") \
        .config("spark.driver.memory", "6g") \
        .getOrCreate()

:: loading settings :: url = jar:file:/media/bazzi/Bazzi/SSI_PROJECT/venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/bazzi/.ivy2/cache
The jars for the packages stored in: /home/bazzi/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c6956478-99b3-42a8-b894-f125d87b5e15;1.0
	confs: [default]
	found org.postgresql#postgresql;42.2.23 in central
	found org.checkerframework#checker-qual;3.5.0 in central
:: resolution report :: resolve 146ms :: artifacts dl 5ms
	:: modules in use:
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.2.23 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------

In [6]:
db_url = "jdbc:postgresql://localhost:5432/stock_ml"
db_props = {
        "user": "bazzi",
        "password": "bazzi123",
        "driver": "org.postgresql.Driver"
    }

In [7]:
feature_df = spark.read.jdbc(url=db_url, table="ml_data.feature_data", properties=db_props)
pandas_df = feature_df.toPandas()
spark.stop()

                                                                                

In [8]:
print(pandas_df.head())

   id    symbol tradingdate last_price avg_price ref_price     total_val  \
0  79       A32  2025-10-28   33000.00  33000.00  33400.00    3300000.00   
1  80       BBS  2025-10-31   13200.00  13100.00  13000.00    2620000.00   
2  81       C21  2025-10-17   15100.00  15050.00  17300.00    3010000.00   
3  83       CBI  2025-10-29   14800.00  14800.00  14800.00  148000000.00   
4  85  CFPT2509  2025-10-28     120.00    110.00    140.00    7319000.00   

     change  ratio_change   highest    lowest                 created_at  
0   -400.00         -1.20  33000.00  33000.00 2025-11-01 22:45:10.092509  
1    200.00          1.54  13200.00  13000.00 2025-11-01 22:45:10.092509  
2  -2200.00        -12.72  15100.00  15000.00 2025-11-01 22:45:10.092509  
3      0.00          0.00  14800.00  14800.00 2025-11-01 22:45:10.092509  
4    -20.00        -14.29    140.00    110.00 2025-11-01 22:45:10.092509  


In [9]:
symbol = "VCB"
df_symbol = pandas_df[pandas_df['symbol'] == symbol].sort_values('tradingdate')

fig = px.line(df_symbol, x='tradingdate', y=['last_price', 'avg_price'], title=f'Giá cổ phiếu {symbol}')
fig.show()

In [10]:
symbol_target = 'VCB'
pandas_df['tradingdate'] = pd.to_datetime(pandas_df['tradingdate'])
pandas_df = pandas_df.sort_values(by=['symbol', 'tradingdate'])

In [11]:
df = pandas_df[pandas_df['symbol'] == symbol_target].copy()
if len(df) < 5:
    print("Dữ liệu quá ít để train.")
    

features = ['last_price', 'avg_price', 'ref_price', 'total_val',
                'change', 'ratio_change', 'highest', 'lowest']

In [12]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df[features].values)

seq_len = 2
X, y = create_sequences(scaled_data, seq_len)
print(f"Train {symbol_target}: {X.shape[0]} mẫu, {X.shape[1]} bước.")

Train VCB: 6 mẫu, 2 bước.


In [13]:
model = Sequential([
    LSTM(32, input_shape=(seq_len, len(features))),
    Dropout(0.1),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')

# --- Train ---
model.fit(X, y, epochs=15, batch_size=1, verbose=0)


E0000 00:00:1762364989.828744   47481 cuda_executor.cc:1309] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1762364989.838880   47481 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



<keras.src.callbacks.history.History at 0x797e04455760>

In [14]:
predicted_scaled = model.predict(X)
predicted_full = np.zeros((len(predicted_scaled), scaled_data.shape[1]))
predicted_full[:, 0] = predicted_scaled.flatten()
predicted_prices = scaler.inverse_transform(predicted_full)[:, 0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step


In [19]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['tradingdate'][seq_len:],
    y=df['last_price'][seq_len:],
    mode='lines+markers',
    name='Thực tế',
    line=dict(color='blue')
))

fig.add_trace(go.Scatter(
    x=df['tradingdate'][seq_len:],
    y=predicted_prices,
    mode='lines+markers',
    name='Dự đoán',
    line=dict(color='orange', dash='dash')
))

fig.update_layout(
    title=f"Dự đoán giá cổ phiếu {symbol_target} bằng LSTM",
    xaxis_title="Ngày giao dịch",
    yaxis_title="Giá (VNĐ)",
    legend=dict(x=0.02, y=0.98, bgcolor='rgba(255,255,255,0.5)'),
    template="plotly_white",
    hovermode="x unified",
    width=900,
    height=500
)

fig.add_trace(go.Scatter(
    x=[df['tradingdate'].iloc[-1]],
    y=[predicted_prices[-1]],
    mode="markers+text",
    text=[f"{predicted_prices[-1]:,.0f}"],
    textposition="top center",
    name="Giá dự đoán hôm sau",
    marker=dict(color="red", size=10, symbol="circle")
))

fig.show()