### Install Packages (If any)

from tensorflow.keras.callbacks import EarlyStopping

from gensim import downloader

model_glove = downloader.load("glove-wiki-gigaword-50")

### Imports and Seed

In [221]:
import re
import numpy as np
import pandas as pd

SEED = 42
np.random.seed(SEED)

### Load data

In [222]:
df = pd.read_csv("stock_trend.csv")

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nDtypes:\n", df.dtypes)

print("\nMissing values:\n", df.isna().sum())

# Show 10 lines (useful for rubric and sanity check)
df.head(10)

Shape: (24388, 6)

Columns: ['Title', 'Time', 'Name', 'Quote', 'Before', 'After']

Dtypes:
 Title      object
Time       object
Name       object
Quote       int64
Before    float64
After     float64
dtype: object

Missing values:
 Title     0
Time      0
Name      0
Quote     0
Before    0
After     0
dtype: int64


Unnamed: 0,Title,Time,Name,Quote,Before,After
0,100 startups participate in Maxis' Market Acce...,2019-12-12T23:50:12+08:00,MAXIS,6012,5.16,5.11
1,16.89% stake in Subur Tiasa traded off-market,2020-02-20T22:41:12+08:00,SUBUR,6904,0.61,0.61
2,Najib wanted 1MDB's Genting Sanyen deal sped u...,2020-07-16T17:42:30+08:00,GENTING,3182,4.08,4.06
3,"25bps OPR cut likely in 2H20, says Manulife",2020-01-16T17:03:43+08:00,MANULFE,1058,2.42,2.42
4,A 25-month extension on concession pushes Phar...,2019-11-11T10:49:58+08:00,PHARMA,7081,2.62,2.43
5,3.7% of Yong Tai transacted off-market,2019-09-05T19:55:54+08:00,YONGTAI,7066,0.195,0.2
6,"3A, Ruberex, Thriven, Kanger, UniWall",2020-03-04T13:23:15+08:00,KANGER,170,0.125,0.12
7,40% stake in IWH-CREC may cost Ekovest RM1.5b,2020-09-16T10:00:00+08:00,EKOVEST,8877,0.61,0.58
8,4.41% Kronologi Asia shares traded off-market,2020-02-12T23:54:51+08:00,KRONO,176,0.895,0.89
9,4.96% of MMAG traded off market,2019-07-23T22:42:17+08:00,MMAG,34,0.245,0.235


### Data Cleaning

In [223]:
df = df.drop_duplicates()

df["Before"] = pd.to_numeric(df["Before"], errors="coerce")
df["After"]  = pd.to_numeric(df["After"],  errors="coerce")

df = df.dropna(subset=["Title", "Before", "After"])
df = df[df["Before"] > 0].copy()

print("After cleaning shape:", df.shape)

# Show 10 lines
df.head(10)

After cleaning shape: (24387, 6)


Unnamed: 0,Title,Time,Name,Quote,Before,After
0,100 startups participate in Maxis' Market Acce...,2019-12-12T23:50:12+08:00,MAXIS,6012,5.16,5.11
1,16.89% stake in Subur Tiasa traded off-market,2020-02-20T22:41:12+08:00,SUBUR,6904,0.61,0.61
2,Najib wanted 1MDB's Genting Sanyen deal sped u...,2020-07-16T17:42:30+08:00,GENTING,3182,4.08,4.06
3,"25bps OPR cut likely in 2H20, says Manulife",2020-01-16T17:03:43+08:00,MANULFE,1058,2.42,2.42
4,A 25-month extension on concession pushes Phar...,2019-11-11T10:49:58+08:00,PHARMA,7081,2.62,2.43
5,3.7% of Yong Tai transacted off-market,2019-09-05T19:55:54+08:00,YONGTAI,7066,0.195,0.2
6,"3A, Ruberex, Thriven, Kanger, UniWall",2020-03-04T13:23:15+08:00,KANGER,170,0.125,0.12
7,40% stake in IWH-CREC may cost Ekovest RM1.5b,2020-09-16T10:00:00+08:00,EKOVEST,8877,0.61,0.58
8,4.41% Kronologi Asia shares traded off-market,2020-02-12T23:54:51+08:00,KRONO,176,0.895,0.89
9,4.96% of MMAG traded off market,2019-07-23T22:42:17+08:00,MMAG,34,0.245,0.235


### Target Label (Trend)

Using relative price change

In [224]:
df["rel_change"] = (df["After"] - df["Before"]) / df["Before"]

def label_trend(x):
    if x > 0.03:
        return "uptrend"
    elif x < -0.03:
        return "downtrend"
    else:
        return "flat"

df["trend"] = df["rel_change"].apply(label_trend)

print(df["trend"].value_counts())

# Show 10 lines
df[["Title", "Before", "After", "rel_change", "trend"]].head(10)

trend
flat         16648
uptrend       4618
downtrend     3121
Name: count, dtype: int64


Unnamed: 0,Title,Before,After,rel_change,trend
0,100 startups participate in Maxis' Market Acce...,5.16,5.11,-0.00969,flat
1,16.89% stake in Subur Tiasa traded off-market,0.61,0.61,0.0,flat
2,Najib wanted 1MDB's Genting Sanyen deal sped u...,4.08,4.06,-0.004902,flat
3,"25bps OPR cut likely in 2H20, says Manulife",2.42,2.42,0.0,flat
4,A 25-month extension on concession pushes Phar...,2.62,2.43,-0.072519,downtrend
5,3.7% of Yong Tai transacted off-market,0.195,0.2,0.025641,flat
6,"3A, Ruberex, Thriven, Kanger, UniWall",0.125,0.12,-0.04,downtrend
7,40% stake in IWH-CREC may cost Ekovest RM1.5b,0.61,0.58,-0.04918,downtrend
8,4.41% Kronologi Asia shares traded off-market,0.895,0.89,-0.005587,flat
9,4.96% of MMAG traded off market,0.245,0.235,-0.040816,downtrend


### Text Normalization

In [225]:
def clean_text(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"http\S+|www\.\S+", " ", s)
    s = re.sub(r"[^a-z\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["text"] = df["Title"].apply(clean_text)

# Show 10 lines
df[["Title", "text", "trend"]].head(10)

Unnamed: 0,Title,text,trend
0,100 startups participate in Maxis' Market Acce...,startups participate in maxis market access day,flat
1,16.89% stake in Subur Tiasa traded off-market,stake in subur tiasa traded off market,flat
2,Najib wanted 1MDB's Genting Sanyen deal sped u...,najib wanted mdb s genting sanyen deal sped up...,flat
3,"25bps OPR cut likely in 2H20, says Manulife",bps opr cut likely in h says manulife,flat
4,A 25-month extension on concession pushes Phar...,a month extension on concession pushes pharman...,downtrend
5,3.7% of Yong Tai transacted off-market,of yong tai transacted off market,flat
6,"3A, Ruberex, Thriven, Kanger, UniWall",a ruberex thriven kanger uniwall,downtrend
7,40% stake in IWH-CREC may cost Ekovest RM1.5b,stake in iwh crec may cost ekovest rm b,downtrend
8,4.41% Kronologi Asia shares traded off-market,kronologi asia shares traded off market,flat
9,4.96% of MMAG traded off market,of mmag traded off market,downtrend


### Encode Labels

downtrend = 0 // flat = 1 // uptrend = 2

In [226]:
label2id = {"downtrend": 0, "flat": 1, "uptrend": 2}
id2label = {v: k for k, v in label2id.items()}

df["label"] = df["trend"].map(label2id)

print(df["label"].value_counts())

# Show 10 lines
df[["text", "trend", "label"]].head(10)

label
1    16648
2     4618
0     3121
Name: count, dtype: int64


Unnamed: 0,text,trend,label
0,startups participate in maxis market access day,flat,1
1,stake in subur tiasa traded off market,flat,1
2,najib wanted mdb s genting sanyen deal sped up...,flat,1
3,bps opr cut likely in h says manulife,flat,1
4,a month extension on concession pushes pharman...,downtrend,0
5,of yong tai transacted off market,flat,1
6,a ruberex thriven kanger uniwall,downtrend,0
7,stake in iwh crec may cost ekovest rm b,downtrend,0
8,kronologi asia shares traded off market,flat,1
9,of mmag traded off market,downtrend,0


### Split Data

Train 70%, Validation 20%, Test 10%

In [227]:
from sklearn.model_selection import train_test_split

final_df = df[["text", "label", "trend", "Time", "Name", "Quote", "Before", "After", "rel_change"]].copy()

train_df, temp_df = train_test_split(
    final_df,
    test_size=0.30,
    random_state=SEED,
    stratify=final_df["label"]
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=(1/3),   # 10% out of total = 1/3 of the 30%
    random_state=SEED,
    stratify=temp_df["label"]
)

train_df = train_df.reset_index(drop=True)
val_df   = val_df.reset_index(drop=True)
test_df  = test_df.reset_index(drop=True)

print("Train:", train_df.shape)
print("Val:  ", val_df.shape)
print("Test: ", test_df.shape)

print("\nTrain dist:\n", train_df["trend"].value_counts(normalize=True))
print("\nVal dist:\n", val_df["trend"].value_counts(normalize=True))
print("\nTest dist:\n", test_df["trend"].value_counts(normalize=True))

# Show 10 lines from each
train_df.head(10), val_df.head(10), test_df.head(10)

Train: (17070, 9)
Val:   (4878, 9)
Test:  (2439, 9)

Train dist:
 trend
flat         0.682660
uptrend      0.189338
downtrend    0.128002
Name: proportion, dtype: float64

Val dist:
 trend
flat         0.682657
uptrend      0.189422
downtrend    0.127921
Name: proportion, dtype: float64

Test dist:
 trend
flat         0.682657
uptrend      0.189422
downtrend    0.127921
Name: proportion, dtype: float64


(                                                text  label      trend  \
 0  parkson s buy of property in wuxi will cut ren...      1       flat   
 1  bursa malaysia cycle and carriage bintang syar...      0  downtrend   
 2  eupe s parc in cheras to be launched by mid oc...      1       flat   
 3  guocoland klk batu kawan mbm msm pchem petgas ...      1       flat   
 4  top glove scientex poh kong aeon credit bumi a...      1       flat   
 5      umw toyota to sell new toyota harrier from rm      1       flat   
 6  ekovest iwcity kelington boustead plantation d...      1       flat   
 7  klci pares gains scomi group up on fresh lifeline      0  downtrend   
 8             meda inc seeks rm m damages from pr ma      1       flat   
 9  sapura energy kumpulan perangsang selangor opc...      0  downtrend   
 
                         Time     Name  Quote  Before  After  rel_change  
 0  2018-03-21T11:18:00+08:00  PARKSON   5657   0.455  0.445   -0.021978  
 1  2020-07-29T00:41:41

### Prepare Data

In [228]:
y_train = train_df["label"].values
y_val   = val_df["label"].values
y_test  = test_df["label"].values

print(y_train[:10])

[1 0 1 1 1 1 1 0 1 0]


## LSTM : GloVe via gensim + TweetTokenizer

### Setting parameter

In [229]:
sentence_length = 25
n_embedding = 50
n_output = 3
batch_size = 4
epochs = 20

### Tokenize using TweetTokenizer

In [230]:
import nltk
nltk.download("punkt", quiet=True)

from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer()

def tokenize_sentence(s: str):
    s = s.lower()
    return tweet_tokenizer.tokenize(s)

### OOV handling

Concert token -> embedding vectors

In [231]:
def tokens_to_vectors(tokens):
    vectors = []
    for tok in tokens:
        if tok in model_glove:
            vectors.append(model_glove[tok])
        else:
            vectors.append(np.zeros(n_embedding, dtype=np.float32))  # OOV
    return vectors

### Normalize Vector

In [232]:
def normalize_vectors(vectors):
    out = []
    for v in vectors:
        norm = np.linalg.norm(v)
        if norm > 0:
            out.append((v / norm).astype(np.float32))
        else:
            out.append(v.astype(np.float32))
    return out

### Build X-arrays

In [233]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def texts_to_padded_embeddings(text_series, do_normalize=True):
    seqs = []
    for s in text_series:
        tokens = tokenize_sentence(s)
        vecs = tokens_to_vectors(tokens)
        if do_normalize:
            vecs = normalize_vectors(vecs)
        seqs.append(vecs)
    
    X = pad_sequences(
        seqs,
        maxlen=sentence_length,
        dtype="float32",
        padding="post",
        truncating="post"
    )
    return X

X_train = texts_to_padded_embeddings(train_df["text"])
X_val   = texts_to_padded_embeddings(val_df["text"])
X_test  = texts_to_padded_embeddings(test_df["text"])

print("X_train shape:", X_train.shape)
print("X_val shape:  ", X_val.shape)
print("X_test shape: ", X_test.shape)

# Show 10 samples (short preview)
X_train[:10].shape

X_train shape: (17070, 25, 50)
X_val shape:   (4878, 25, 50)
X_test shape:  (2439, 25, 50)


(10, 25, 50)

### Build LSTM model

In [234]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Flatten
from tensorflow.keras.models import Model

tf.random.set_seed(SEED)

inputs = Input(shape=(sentence_length, n_embedding))
lstm = LSTM(2, return_sequences=True, return_state=True)
outputs_seq, state_h, state_c = lstm(inputs)

flat = Flatten()(outputs_seq)
outputs = Dense(n_output, activation="softmax")(flat)

model = Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

### Train model

In [237]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    min_delta=0.01,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/20
[1m4268/4268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step - accuracy: 0.6830 - loss: 0.8180 - val_accuracy: 0.6839 - val_loss: 0.8193
Epoch 2/20
[1m4268/4268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step - accuracy: 0.6856 - loss: 0.8107 - val_accuracy: 0.6845 - val_loss: 0.8169
Epoch 3/20
[1m4268/4268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - accuracy: 0.6858 - loss: 0.8079 - val_accuracy: 0.6859 - val_loss: 0.8149


### Model evaluation

In [239]:
from sklearn.metrics import classification_report, accuracy_score

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(
    y_test, y_pred,
    target_names=["downtrend", "flat", "uptrend"]
))

[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test Accuracy: 0.6851168511685117

Classification Report:

              precision    recall  f1-score   support

   downtrend       0.00      0.00      0.00       312
        flat       0.69      1.00      0.81      1665
     uptrend       0.58      0.02      0.05       462

    accuracy                           0.69      2439
   macro avg       0.42      0.34      0.29      2439
weighted avg       0.58      0.69      0.56      2439



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Confusion Matrix

In [240]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

array([[   0,  309,    3],
       [   0, 1660,    5],
       [   0,  451,   11]])

In [None]:
model.save("lstm_stock_trend_model.keras")

# Save splits if needed
train_df.to_csv("train_split.csv", index=False)
val_df.to_csv("val_split.csv", index=False)
test_df.to_csv("test_split.csv", index=False)