In [19]:
import os

from vnstock import Listing, Quote

# API handles used by the cells below; both are bound to the 'VCI' source.
quote = Quote(symbol='VCI', source='VCI')    # quote client for the 'VCI' ticker
listing = Listing(source='VCI')              # listing client used to look up symbol groups

In [3]:
# Look up the members of the 'VN30' group through the Listing client.
# NOTE(review): presumably the result is an iterable of ticker strings (the
# download loop iterates over it directly) — confirm the exact return type.
vn30 = listing.symbols_by_group('VN30')

In [4]:
from vnstock import Quote
import pandas as pd
from datetime import datetime, timedelta

def fetch_history(symbol, start_date, end_date, source='VCI'):
    """Download the price history of one ticker and tag it with its symbol.

    The function is deliberately best-effort: every failure is reported on
    stdout and signalled with a ``None`` return value instead of an exception,
    so a batch download can carry on with the remaining tickers.

    Args:
        symbol: Ticker code handed to ``Quote`` and written into the
            ``symbol`` column of the result.
        start_date: Start of the requested range, forwarded unchanged as
            ``Quote.history(start=...)``.
        end_date: End of the requested range, forwarded unchanged as
            ``Quote.history(end=...)``.
        source: Data source name handed to ``Quote``. Defaults to ``'VCI'``,
            the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        The object returned by ``Quote.history`` with an extra ``symbol``
        column, or ``None`` when the result is ``None``/empty or when any
        step raised.
    """
    try:
        client = Quote(symbol=symbol, source=source)
        df = client.history(start=start_date, end=end_date)

        if df is None or len(df) == 0:
            print(f"[WARN] Không có dữ liệu cho {symbol}")
            return None

        # Tag every row so frames from different tickers can be stacked later.
        df["symbol"] = symbol
        return df

    except Exception as e:
        # Intentionally broad: one failing ticker must not abort a whole batch.
        print(f"[ERROR] {symbol}: {e}")
        return None


In [5]:
# Date window requested for every ticker in the download loop.
start_date, end_date = "2000-01-01", "2025-12-10"

print("Start:", start_date, "End:", end_date)


Start: 2000-01-01 End: 2025-12-10


In [6]:
all_data = []        # one history frame per successfully downloaded ticker
failed_symbols = []  # tickers for which fetch_history returned None

for sym in vn30:
    print("Đang tải:", sym)
    df = fetch_history(sym, start_date, end_date)

    # fetch_history signals "no data" and "error" with None. Keep those out of
    # all_data so the list only ever holds real frames, and remember which
    # tickers are missing instead of letting them disappear silently.
    if df is None:
        failed_symbols.append(sym)
        continue

    all_data.append(df)

if failed_symbols:
    print(f"[WARN] Không tải được dữ liệu cho: {', '.join(map(str, failed_symbols))}")


Đang tải: ACB
Đang tải: BCM
Đang tải: BID
Đang tải: CTG
Đang tải: DGC
Đang tải: FPT
Đang tải: GAS
Đang tải: GVR
Đang tải: HDB
Đang tải: HPG
Đang tải: LPB
Đang tải: MBB
Đang tải: MSN
Đang tải: MWG
Đang tải: PLX
Đang tải: SAB
Đang tải: SHB
Đang tải: SSB
Đang tải: SSI
Đang tải: STB
Đang tải: TCB
Đang tải: TPB
Đang tải: VCB
Đang tải: VHM
Đang tải: VIB
Đang tải: VIC
Đang tải: VJC
Đang tải: VNM
Đang tải: VPB
Đang tải: VRE


In [7]:
# Stack the per-ticker frames row-wise into one long table with a fresh
# 0..n-1 index, then preview the first five rows.
df_all = pd.concat(objs=all_data, axis=0, ignore_index=True)
df_all.head(5)


Unnamed: 0,time,open,high,low,close,volume,symbol
0,2006-11-21,3.86,4.83,3.83,4.34,56500,ACB
1,2006-11-22,4.61,4.61,4.18,4.34,62300,ACB
2,2006-11-23,4.67,4.67,4.18,4.6,69900,ACB
3,2006-11-24,4.51,4.89,4.51,4.83,63600,ACB
4,2006-11-27,4.72,5.19,4.34,4.57,42400,ACB


In [8]:
# Print a structural summary of df_all: row count, per-column non-null
# counts, dtypes and memory usage.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94109 entries, 0 to 94108
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   time    94109 non-null  datetime64[ns]
 1   open    94109 non-null  float64       
 2   high    94109 non-null  float64       
 3   low     94109 non-null  float64       
 4   close   94109 non-null  float64       
 5   volume  94109 non-null  int64         
 6   symbol  94109 non-null  object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 5.0+ MB


In [9]:
# Count how many rows each ticker contributed to df_all.
rows_per_symbol = df_all.groupby("symbol").size()
sample_count = rows_per_symbol.reset_index(name="num_samples")

print(sample_count)

   symbol  num_samples
0     ACB         4749
1     BCM         1946
2     BID         2964
3     CTG         4098
4     DGC         2817
5     FPT         4735
6     GAS         3389
7     GVR         1927
8     HDB         1981
9     HPG         4506
10    LPB         2036
11    MBB         3524
12    MSN         4019
13    MWG         2854
14    PLX         2162
15    SAB         2253
16    SHB         4156
17    SSB         1180
18    SSI         4723
19    STB         4844
20    TCB         1883
21    TPB         1912
22    VCB         4110
23    VHM         3307
24    VIB         2223
25    VIC         4547
26    VJC         2199
27    VNM         4961
28    VPB         2080
29    VRE         2024


In [10]:
# Earliest and latest timestamp present in df_all for each ticker.
time_by_symbol = df_all.groupby("symbol")["time"]
date_range = time_by_symbol.agg(min_date="min", max_date="max").reset_index()

print(date_range)

   symbol   min_date   max_date
0     ACB 2006-11-21 2025-12-10
1     BCM 2018-02-21 2025-12-10
2     BID 2014-01-24 2025-12-10
3     CTG 2009-07-16 2025-12-10
4     DGC 2014-08-26 2025-12-10
5     FPT 2006-12-13 2025-12-10
6     GAS 2012-05-21 2025-12-10
7     GVR 2018-03-21 2025-12-10
8     HDB 2018-01-05 2025-12-10
9     HPG 2007-11-15 2025-12-10
10    LPB 2017-10-05 2025-12-10
11    MBB 2011-11-01 2025-12-10
12    MSN 2009-11-05 2025-12-10
13    MWG 2014-07-14 2025-12-10
14    PLX 2017-04-21 2025-12-10
15    SAB 2016-12-06 2025-12-10
16    SHB 2009-04-20 2025-12-10
17    SSB 2021-03-24 2025-12-10
18    SSI 2006-12-15 2025-12-10
19    STB 2006-07-12 2025-12-10
20    TCB 2018-06-04 2025-12-10
21    TPB 2018-04-19 2025-12-10
22    VCB 2009-06-30 2025-12-10
23    VHM 2011-11-10 2025-12-10
24    VIB 2017-01-09 2025-12-10
25    VIC 2007-09-19 2025-12-10
26    VJC 2017-02-28 2025-12-10
27    VNM 2006-01-19 2025-12-10
28    VPB 2017-08-17 2025-12-10
29    VRE 2017-11-06 2025-12-10


In [26]:
from pathlib import Path

# Project root, taken as the parent of the current working directory.
# NOTE(review): this assumes the kernel runs from a first-level subfolder of
# the project — confirm, or replace with an explicitly configured path.
BASE_DIR = Path.cwd().parent

output_path = BASE_DIR / "data" / "timeseries" / "vn30_history.csv"

# to_csv does not create missing folders; make sure the target directory
# exists so a fresh checkout does not fail at the very last step.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_all.to_csv(output_path, index=False, encoding="utf-8")

print("Lưu dữ liệu vào:", output_path)

Lưu dữ liệu vào: /Users/anhoaithai/Documents/AHT/2. AREAS/UEH/Kì 5/Data Mining/Project/Vile/VN_StockAnalytics/data/timeseries/vn30_history.csv


# Macro data

In [42]:
print(BASE_DIR)

# Macro-economic workbook stored next to the price history export.
macro_xlsx = f"{BASE_DIR}/data/timeseries/macro.xlsx"
df_macro = pd.read_excel(macro_xlsx)

/Users/anhoaithai/Documents/AHT/2. AREAS/UEH/Kì 5/Data Mining/Project/Vile/VN_StockAnalytics


In [None]:
# Preview the first rows of the macro table.
# NOTE(review): the recorded output shows INF/GDP/DC as values like "2,95%"
# (comma decimal plus percent sign), so these columns are presumably text and
# would need conversion before any numeric use — confirm the dtypes.
print(df_macro.head())

   Year  Quarter    INF    GDP      DC
0  2020        1  2,95%  3,82%  11,64%
1  2020        2  2,45%  1,81%   9,72%
2  2020        3  1,97%  2,12%  10,21%
3  2020        4  0,99%  2,91%  12,17%
4  2021        1  0,73%  4,48%  13,99%


In [None]:
#df_macro = pd.read_excel(f"{BASE_DIR}/data/timeseries/macro.xlsx")