In [1]:
import pandas as pd
import sys
sys.path.append("../utils")

from utils import load_csv, save_data

In [2]:
# Load the raw FPT price file with the project's load_csv helper and preview
# the first rows (the bare last expression is what the notebook displays).
df = load_csv("../../data/raw/fpt.csv")
df.head()

2025-06-30 02:29:47,411 - INFO - ✅ Load CSV thành công: ../../data/raw/fpt.csv | shape = (4618, 6)


Unnamed: 0,date,open,high,low,close,volume
0,,,,,,
1,27/06/2025,117000.0,118300.0,116900.0,117300.0,5263200.0
2,26/06/2025,118000.0,118300.0,116600.0,117000.0,4132300.0
3,25/06/2025,117000.0,119400.0,117000.0,117100.0,6000700.0
4,24/06/2025,116300.0,117000.0,116300.0,116500.0,4271900.0


In [3]:
# Inspect the column dtypes of the freshly loaded frame before any cleaning.
df.dtypes

date      object
open      object
high      object
low       object
close     object
volume    object
dtype: object

In [4]:
def _to_number(series: pd.Series) -> pd.Series:
    """Convert a column to float64, accepting text with ',' thousands separators.

    Already-numeric columns pass straight through. For text / mixed columns
    every present value is rendered as text, stripped of commas and surrounding
    whitespace, and parsed. Missing values stay NaN; text that is not a number
    raises ValueError (same failure mode as a plain ``astype(float)``).
    """
    if not pd.api.types.is_numeric_dtype(series):
        # astype(str) keeps numeric cells of a mixed object column (a bare
        # ``.str`` accessor would turn them into NaN). ``where`` then restores
        # real missing values, which astype(str) rendered as "nan"/"None".
        series = (
            series.astype(str)
            .str.replace(",", "", regex=False)
            .str.strip()
            .where(series.notna())
        )
    return pd.to_numeric(series).astype(float)


def preprocess(
    df: pd.DataFrame,
    date_format: str = "%d/%m/%Y",
    price_divisor: float = 1000,
) -> pd.DataFrame:
    """Clean one raw OHLCV price table and return it as a new DataFrame.

    Steps, in order:
      1. Drop rows that are entirely NaN or entirely blank text.
      2. Normalise column labels: stripped, lower-cased, spaces -> underscores.
      3. Drop exact duplicate rows.
      4. Parse ``date`` with ``date_format``; convert ``open``/``high``/``low``/
         ``close`` to float, divide by ``price_divisor`` and round to 2 decimals;
         convert ``volume`` to int64 (missing volumes are filled with the
         rounded column mean first, because an integer column cannot hold NaN).
      5. Fill remaining missing values: column mean for numeric columns,
         most frequent value for everything else.

    The caller's frame is not modified and the original row index is kept.

    Args:
        df: Raw table containing, after label normalisation, the columns
            ``date``, ``open``, ``high``, ``low``, ``close`` and ``volume``.
        date_format: ``strptime`` format of the ``date`` column.
        price_divisor: Value the four price columns are divided by.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a date or numeric cell cannot be parsed, or if
            ``volume`` consists solely of missing values.
    """
    # 1. Remove fully empty rows (all NaN), then rows where every cell is blank text.
    df = df.dropna(how='all')
    as_text = df.astype(str)
    blank_rows = pd.Series(True, index=df.index, dtype=bool).to_numpy(copy=True)
    for _, column in as_text.items():
        blank_rows &= column.str.strip().eq('').to_numpy(dtype=bool)
    # Explicit copy: the column assignments below must never write into a view
    # of the caller's frame (and must not trigger SettingWithCopyWarning).
    df = df.loc[~blank_rows].copy()

    # 2. Normalise column labels (str() tolerates non-string labels).
    df.columns = [str(col).strip().lower().replace(" ", "_") for col in df.columns]

    # 3. Remove duplicate rows.
    df = df.drop_duplicates().copy()

    # 4. Normalise dtypes.
    df['date'] = pd.to_datetime(df['date'], format=date_format)

    for col in ['open', 'high', 'low', 'close']:
        df[col] = (_to_number(df[col]) / price_divisor).round(2)

    volume = _to_number(df['volume'])
    if volume.isna().any():
        # Must happen before the integer cast, which rejects NaN.
        volume = volume.fillna(volume.mean())
    # Fixed-width int64 so the result does not depend on the platform's C long.
    df['volume'] = volume.round().astype('int64')

    # 5. Fill whatever is still missing.
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        if pd.api.types.is_numeric_dtype(column):
            df[col] = column.fillna(column.mean())
        else:
            modes = column.mode()
            # mode() is empty when the whole column is missing; nothing to fill with.
            if not modes.empty:
                df[col] = column.fillna(modes.iloc[0])

    return df


In [5]:
# Tickers to process; each has a raw CSV at ../../data/raw/<symbol>.csv
symbols = ["fpt", "hpg", "vnm"]
dfs = {}

# Load and clean every ticker, keyed by symbol. The loop uses its own names
# (raw_df / clean_df) so the notebook-level `df` inspected in earlier cells is
# not silently replaced by the last ticker's raw data.
for symbol in symbols:
    raw_path = f"../../data/raw/{symbol}.csv"
    raw_df = load_csv(raw_path)
    clean_df = preprocess(raw_df)
    dfs[symbol] = clean_df
    print(f"✅ {symbol.upper()} - Rows: {clean_df.shape[0]}, Columns: {clean_df.shape[1]}")


2025-06-30 02:29:47,491 - INFO - ✅ Load CSV thành công: ../../data/raw/fpt.csv | shape = (4618, 6)
2025-06-30 02:29:48,642 - INFO - ✅ Load CSV thành công: ../../data/raw/hpg.csv | shape = (4389, 6)


✅ FPT - Rows: 4617, Columns: 6


2025-06-30 02:29:49,712 - INFO - ✅ Load CSV thành công: ../../data/raw/vnm.csv | shape = (4844, 6)


✅ HPG - Rows: 4388, Columns: 6
✅ VNM - Rows: 4843, Columns: 6


In [6]:
# Preview the first five rows of each cleaned frame, one ticker at a time.
for symbol in symbols:
    preview = dfs[symbol].head()
    print(f"\n{symbol.upper()} - Dữ liệu sau xử lý:")
    display(preview)


FPT - Dữ liệu sau xử lý:


Unnamed: 0,date,open,high,low,close,volume
1,2025-06-27,117.0,118.3,116.9,117.3,5263200
2,2025-06-26,118.0,118.3,116.6,117.0,4132300
3,2025-06-25,117.0,119.4,117.0,117.1,6000700
4,2025-06-24,116.3,117.0,116.3,116.5,4271900
5,2025-06-23,115.5,116.3,114.5,115.9,4587300



HPG - Dữ liệu sau xử lý:


Unnamed: 0,date,open,high,low,close,volume
1,2025-06-27,22.95,22.95,22.75,22.85,16481800
2,2025-06-26,23.05,23.15,22.75,22.8,38371000
3,2025-06-25,22.58,22.88,22.46,22.67,45638801
4,2025-06-24,22.54,22.67,22.46,22.5,27870000
5,2025-06-23,22.38,22.5,22.17,22.38,24710800



VNM - Dữ liệu sau xử lý:


Unnamed: 0,date,open,high,low,close,volume
1,2025-06-27,56.8,58.6,56.3,57.8,8895400
2,2025-06-26,56.8,57.0,56.3,56.4,3118900
3,2025-06-25,56.5,57.8,56.4,56.8,6268200
4,2025-06-24,56.2,56.5,56.1,56.4,2803800
5,2025-06-23,56.0,56.4,55.8,55.9,3022700


In [7]:
# Write each cleaned frame to ../../data/clean/<symbol>.csv
for symbol in symbols:
    path = f"../../data/clean/{symbol}.csv"
    # Use the path built above rather than re-formatting the same string.
    save_data(dfs[symbol], path)

2025-06-30 02:29:50,933 - INFO - 💾 Đã lưu dữ liệu thành công: ../../data/clean/fpt.csv
2025-06-30 02:29:50,955 - INFO - 💾 Đã lưu dữ liệu thành công: ../../data/clean/hpg.csv
2025-06-30 02:29:50,975 - INFO - 💾 Đã lưu dữ liệu thành công: ../../data/clean/vnm.csv
