In [None]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from keras import backend as K
import gc, os, datetime
import plotly.express as px
import plotly.graph_objects as go
from dotenv import load_dotenv
import os



In [None]:
def create_sequences(data, seq_length, target_col=0):
    """Slice a time-ordered feature matrix into supervised (window, next-value) pairs.

    For every position ``i`` the input sample is rows ``i .. i + seq_length - 1``
    (all columns) and the target is the value of ``target_col`` in row
    ``i + seq_length``.

    Args:
        data: Array-like of shape ``(n_rows, n_features)``. A 1-D input is
            treated as a single feature column.
        seq_length: Number of consecutive rows in each input window (>= 1).
        target_col: Column index used as the prediction target. Defaults to 0,
            which matches the previous hard-coded behaviour.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_rows - seq_length, seq_length, n_features)`` and ``y`` has shape
        ``(n_rows - seq_length,)``. When there are not enough rows to build a
        single window, both arrays are empty but keep the correct trailing
        dimensions, so downstream ``X.shape[1]`` lookups do not fail.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    values = np.asarray(data)
    if values.ndim == 1:
        # Promote a plain series to a single-feature matrix.
        values = values.reshape(-1, 1)

    n_samples = len(values) - seq_length
    if n_samples <= 0:
        # Not enough rows for even one window: return well-shaped empties.
        empty_X = np.empty((0, seq_length) + values.shape[1:], dtype=values.dtype)
        empty_y = np.empty((0,) + values.shape[2:], dtype=values.dtype)
        return empty_X, empty_y

    X = np.stack([values[start:start + seq_length] for start in range(n_samples)])
    # Copy so the targets do not alias the caller's array.
    y = values[seq_length:, target_col].copy()
    return X, y

In [None]:
# Build (or reuse) the Spark session used to read the feature table over JDBC.
# `spark.jars.packages` is set to the PostgreSQL JDBC driver coordinate and the
# driver is given 6g of memory.
spark = (
    SparkSession.builder
    .appName("LSTM_StockML_Incremental")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.23")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

In [None]:
# Load variables from a local .env file (if present) into the process
# environment. Without this call the lookups below only see variables that were
# already exported before the kernel started.
load_dotenv()

_required_env_vars = ("RAW_DB_URL", "DB_USER", "DB_PASSWORD", "DB_DRIVER")
_missing_env_vars = [name for name in _required_env_vars if os.getenv(name) is None]
if _missing_env_vars:
    # Fail here with a readable message instead of handing None to the JDBC reader.
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

# JDBC connection URL and the connection properties passed to spark.read.jdbc.
db_url = os.getenv("RAW_DB_URL")
db_props = {
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "driver": os.getenv("DB_DRIVER"),
}

In [None]:
# Read the whole feature table through JDBC, collect it to the driver as a
# pandas DataFrame, then stop the Spark session.
feature_df = spark.read.jdbc(
    url=db_url,
    table="ml_data.feature_data",
    properties=db_props,
)
pandas_df = feature_df.toPandas()
spark.stop()

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich table
# display instead of the plain-text repr that print() produces.
pandas_df.head()

In [None]:
# Quick visual check: last price and average price over time for one ticker.
symbol = "VCB"
df_symbol = pandas_df.loc[pandas_df['symbol'] == symbol].sort_values(by='tradingdate')

fig = px.line(
    df_symbol,
    x='tradingdate',
    y=['last_price', 'avg_price'],
    title=f'Giá cổ phiếu {symbol}',
)
fig.show()

In [None]:
# Ticker the model is trained on.
symbol_target = 'VCB'

# Parse trading dates into datetimes, then order rows by symbol and date.
pandas_df = (
    pandas_df
    .assign(tradingdate=pd.to_datetime(pandas_df['tradingdate']))
    .sort_values(by=['symbol', 'tradingdate'])
)

In [None]:
# Rows for the target ticker only; .copy() keeps this frame independent of pandas_df.
df = pandas_df[pandas_df['symbol'] == symbol_target].copy()
if len(df) < 5:
    # Stop the run here. Previously the message was only printed and the
    # following cells went on to scale and train on the too-small frame.
    raise ValueError("Dữ liệu quá ít để train.")

# Model input columns. The first entry ('last_price') is the prediction target,
# because create_sequences takes its target from column 0.
features = ['last_price', 'avg_price', 'ref_price', 'total_val',
            'change', 'ratio_change', 'highest', 'lowest']

In [None]:
# Number of consecutive rows fed to the model for each prediction.
seq_len = 2

# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows of df; there is no train/test split here.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df[features].to_numpy())

# Build (window, next-value) training pairs from the scaled matrix.
X, y = create_sequences(scaled_data, seq_len)
print(f"Train {symbol_target}: {X.shape[0]} mẫu, {X.shape[1]} bước.")

In [None]:
# Small LSTM regressor: one 32-unit LSTM layer, light dropout, and a single
# linear output unit.
model = Sequential()
model.add(LSTM(32, input_shape=(seq_len, len(features))))
model.add(Dropout(0.1))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# --- Train ---
# batch_size=1 updates the weights after every sample; verbose=0 silences the log.
model.fit(X, y, batch_size=1, epochs=15, verbose=0)


In [None]:
# In-sample predictions, still in the scaler's [0, 1] space.
predicted_scaled = model.predict(X)

# The scaler was fitted on every feature column, so it expects that many
# columns back. Place the predictions in column 0 of a zero matrix, invert the
# scaling, and keep only column 0.
predicted_full = np.zeros(shape=(predicted_scaled.shape[0], scaled_data.shape[1]))
predicted_full[:, 0] = predicted_scaled.ravel()
predicted_prices = scaler.inverse_transform(predicted_full)[:, 0]

In [None]:
# One-step-ahead forecast. Feed the most recent `seq_len` scaled rows to the
# model so the "next day" marker shows a genuine out-of-sample prediction.
# Previously the marker re-plotted the last in-sample prediction on the last
# observed date.
n_feature_cols = scaled_data.shape[1]
last_window = scaled_data[-seq_len:].reshape(1, seq_len, n_feature_cols)
next_scaled = model.predict(last_window, verbose=0)

# Same inverse-scaling trick as for the in-sample predictions: pad to the
# scaler's column count, invert, keep column 0.
next_full = np.zeros((1, n_feature_cols))
next_full[0, 0] = next_scaled[0, 0]
next_day_price = scaler.inverse_transform(next_full)[0, 0]

# Next business day after the last observation.
# NOTE(review): exchange holidays are not accounted for; the date is an approximation.
next_trading_date = df['tradingdate'].iloc[-1] + pd.offsets.BDay(1)

# The first `seq_len` rows have no prediction (they only serve as model input),
# so both series start at position `seq_len`.
plot_dates = df['tradingdate'].iloc[seq_len:]

fig = go.Figure()

# Observed prices.
fig.add_trace(go.Scatter(
    x=plot_dates,
    y=df['last_price'].iloc[seq_len:],
    mode='lines+markers',
    name='Thực tế',
    line=dict(color='blue')
))

# In-sample model predictions for the same dates.
fig.add_trace(go.Scatter(
    x=plot_dates,
    y=predicted_prices,
    mode='lines+markers',
    name='Dự đoán',
    line=dict(color='orange', dash='dash')
))

# Forecast for the day after the last observation.
fig.add_trace(go.Scatter(
    x=[next_trading_date],
    y=[next_day_price],
    mode="markers+text",
    text=[f"{next_day_price:,.0f}"],
    textposition="top center",
    name="Giá dự đoán hôm sau",
    marker=dict(color="red", size=10, symbol="circle")
))

fig.update_layout(
    title=f"Dự đoán giá cổ phiếu {symbol_target} bằng LSTM",
    xaxis_title="Ngày giao dịch",
    yaxis_title="Giá (VNĐ)",
    legend=dict(x=0.02, y=0.98, bgcolor='rgba(255,255,255,0.5)'),
    template="plotly_white",
    hovermode="x unified",
    width=900,
    height=500
)

fig.show()