In [12]:
import pandas as pd
import sys
sys.path.append("../utils")

from utils import load_csv, save_data

In [13]:
# Load the raw FPT price file and preview its first rows.
fpt_raw_path = "../../data/raw/fpt.csv"
df = load_csv(fpt_raw_path)
df.head()

2025-06-14 10:19:11,300 - INFO - ✅ Load CSV thành công: ../../data/raw/fpt.csv | shape = (4565, 6)


Unnamed: 0,date,open,high,low,close,volume
0,,,,,,
1,13/06/2025,116000.0,116500.0,114900.0,115400.0,6161700.0
2,12/06/2025,117600.0,117700.0,116500.0,117000.0,4153100.0
3,11/06/2025,115115.0,117197.0,115115.0,116900.0,6554301.0
4,10/06/2025,116007.0,116404.0,114718.0,114718.0,3854500.0


In [14]:
# Inspect the dtype pandas assigned to each column of the raw frame.
df.dtypes

date      object
open      object
high      object
low       object
close     object
volume    object
dtype: object

In [15]:
def _to_float(series: pd.Series) -> pd.Series:
    """Convert a column of numbers or number-like text to float64.

    Text values may use "," as a thousands separator and may carry
    surrounding whitespace. Missing values and blank strings become NaN;
    any other text that cannot be parsed raises ValueError.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)

    text = series.astype(str).str.replace(",", "", regex=False).str.strip()
    # Mask real gaps *after* stringifying: astype(str) turns None/NaN into
    # literal text such as "None"/"nan", which must not reach the parser.
    text = text.mask(series.isna() | text.eq(""))
    return pd.to_numeric(text, errors="raise").astype(float)


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw OHLCV price table and return a new, typed DataFrame.

    Steps, in order:
      1. Drop rows that are entirely missing or entirely blank text.
      2. Normalise column names (trimmed, lower-case, spaces -> "_").
      3. Drop exact duplicate rows.
      4. Parse ``date`` (``dd/mm/YYYY``) to datetime; convert ``open``,
         ``high``, ``low`` and ``close`` to float, divide by 1000 and round
         to 2 decimals; convert ``volume`` to a number.
      5. Fill remaining gaps: numeric columns with the column mean, all
         other columns (including ``date``) with the most frequent value.
      6. Cast ``volume`` to a 64-bit integer.

    The input frame is not modified. The original row index is preserved.

    Args:
        df: Raw table containing at least the columns date, open, high, low,
            close and volume (matched after name normalisation).

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If a required column is absent.
        ValueError: If a date or numeric cell cannot be parsed, or if
            ``volume`` has no usable value at all (nothing to fill with).
    """
    # Explicit copy: every later column assignment targets a frame we own,
    # so the caller's data is untouched and pandas emits no
    # SettingWithCopyWarning.
    df = df.dropna(how='all').copy()

    # Normalise column names
    df.columns = [str(col).strip().lower().replace(" ", "_") for col in df.columns]

    required = ['date', 'open', 'high', 'low', 'close', 'volume']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"preprocess: missing required column(s): {missing}")

    # Drop rows in which every cell is empty/whitespace-only text
    # (column-wise string ops instead of a Python-level lambda per row).
    as_text = df.astype(str).apply(lambda col: col.str.strip())
    blank_rows = as_text.eq('').all(axis=1)
    df = df.loc[~blank_rows]

    # Drop exact duplicates
    df = df.drop_duplicates().copy()

    # Normalise dtypes
    df['date'] = pd.to_datetime(df['date'], format="%d/%m/%Y")

    for col in ['open', 'high', 'low', 'close']:
        # Prices are scaled down by 1000 and kept to 2 decimals.
        df[col] = (_to_float(df[col]) / 1000).round(2)

    # Keep volume as float for now: an integer column cannot hold NaN, so
    # the integer cast has to wait until missing values have been filled.
    df['volume'] = _to_float(df['volume'])

    # Fill missing values
    for col in df.columns:
        if not df[col].isna().any():
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].mean())
        else:
            modes = df[col].mode()
            # mode() is empty when the whole column is missing; nothing to
            # fill with in that case, so leave the column as it is.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # A mean-filled volume may be fractional; round before the integer cast.
    df['volume'] = df['volume'].round().astype('int64')

    return df


In [16]:
# Tickers to process; each one is read from ../../data/raw/<symbol>.csv
symbols = ["fpt", "hpg", "vnm"]
dfs = {}

# Load and clean every ticker, storing the cleaned frame under its symbol.
# The raw frame gets its own name (`raw_df`) instead of reusing `df`, so the
# frame loaded and inspected in the earlier cells is not silently replaced by
# whichever ticker happens to be loaded last here.
for symbol in symbols:
    raw_path = f"../../data/raw/{symbol}.csv"
    raw_df = load_csv(raw_path)
    df_clean = preprocess(raw_df)
    dfs[symbol] = df_clean
    print(f"✅ {symbol.upper()} - Rows: {df_clean.shape[0]}, Columns: {df_clean.shape[1]}")


2025-06-14 10:19:11,380 - INFO - ✅ Load CSV thành công: ../../data/raw/fpt.csv | shape = (4565, 6)


2025-06-14 10:19:12,553 - INFO - ✅ Load CSV thành công: ../../data/raw/hpg.csv | shape = (4362, 6)


✅ FPT - Rows: 4564, Columns: 6


2025-06-14 10:19:13,627 - INFO - ✅ Load CSV thành công: ../../data/raw/vnm.csv | shape = (4797, 6)


✅ HPG - Rows: 4361, Columns: 6
✅ VNM - Rows: 4796, Columns: 6


In [17]:
# Preview the first five cleaned rows of every ticker, one table per symbol.
for symbol in symbols:
    print(f"\n{symbol.upper()} - Dữ liệu sau xử lý:")
    preview = dfs[symbol].head()
    display(preview)


FPT - Dữ liệu sau xử lý:


Unnamed: 0,date,open,high,low,close,volume
1,2025-06-13,116.0,116.5,114.9,115.4,6161700
2,2025-06-12,117.6,117.7,116.5,117.0,4153100
3,2025-06-11,115.12,117.2,115.12,116.9,6554301
4,2025-06-10,116.01,116.4,114.72,114.72,3854500
5,2025-06-09,114.52,115.91,114.02,115.21,4474400



HPG - Dữ liệu sau xử lý:


Unnamed: 0,date,open,high,low,close,volume
1,2025-06-13,26.6,26.75,26.3,26.4,38769800
2,2025-06-12,26.15,27.1,26.15,26.8,52217900
3,2025-06-11,26.3,26.35,26.0,26.1,16548100
4,2025-06-10,26.5,26.85,26.2,26.2,29410900
5,2025-06-09,26.1,26.4,25.95,26.3,24987900



VNM - Dữ liệu sau xử lý:


Unnamed: 0,date,open,high,low,close,volume
1,2025-06-13,55.5,55.7,55.2,55.5,3545400
2,2025-06-12,55.6,56.2,55.5,55.9,3282700
3,2025-06-11,55.3,55.6,55.1,55.2,1720700
4,2025-06-10,55.5,56.1,55.1,55.3,2999200
5,2025-06-09,55.4,56.3,55.3,55.3,3035700


In [18]:
# Write each cleaned frame to ../../data/clean/<symbol>.csv
for symbol in symbols:
    path = f"../../data/clean/{symbol}.csv"
    # Pass the path built above rather than rebuilding the same string, so
    # the destination is defined in exactly one place.
    save_data(dfs[symbol], path)

2025-06-14 10:19:14,848 - INFO - 💾 Đã lưu dữ liệu thành công: ../../data/clean/fpt.csv
2025-06-14 10:19:14,868 - INFO - 💾 Đã lưu dữ liệu thành công: ../../data/clean/hpg.csv
2025-06-14 10:19:14,889 - INFO - 💾 Đã lưu dữ liệu thành công: ../../data/clean/vnm.csv
