In [1]:
import pandas as pd
import numpy as np
import sys, os

In [2]:
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
   sys.path.append(project_root)

from source.utils.minio_helper import read_df_from_minio

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) # Agar list panjang tidak terpotong

BUCKET_NAME = "mlbb-lakehouse"

In [4]:
def validate_bronze_schema(df, name):
    print(f"\n{'='*60}")
    print(f"üõ°Ô∏è VALIDASI BRONZE: {name}")
    print(f"{'='*60}")
    
    if df is None:
        print("‚ùå Data Kosong/Gagal Load")
        return

    # 1. Cek Metadata Wajib
    required_meta = ['ingested_at', 'data_source']
    missing_meta = [col for col in required_meta if col not in df.columns and 'region' not in df.columns] 
    
    if not missing_meta:
        print("‚úÖ Metadata Check: OK (ingested_at exists)")
    else:
        print(f"‚ö†Ô∏è Metadata Check: Missing {missing_meta}")

    # 2. Cek Tipe Data Kompleks (Array/List)
    list_cols = [c for c in df.columns if 'pick' in c or 'ban' in c]
    if list_cols:
        sample_val = df[list_cols[0]].iloc[0]
        val_type = type(sample_val)
        print(f"\nüß© Cek Struktur Array ({list_cols[0]}):")
        print(f"   - Tipe Data Python: {val_type}")
        print(f"   - Contoh Isi: {sample_val}")
        
        # Validasi apakah ini list atau numpy array
        if isinstance(sample_val, (list, np.ndarray)):
            print("   ‚úÖ VALID: Data tersimpan sebagai List/Array.")
        else:
            print("   ‚ùå INVALID: Data tersimpan sebagai String (Perlu fix di transformation).")

    # 3. Cek Normalisasi (Raw vs Normalized)
    # Cari pasangan kolom _raw dan _normalized
    raw_cols = [c for c in df.columns if '_raw' in c]
    if raw_cols:
        print("\nDATA COMPARISON (Raw vs Normalized):")
        
        # --- PERBAIKAN DISINI UNTUK MENGATASI ERROR ---
        col_base = raw_cols[0].replace('_raw', '')
        col_norm = f"{col_base}_normalized"
        
        if col_norm in df.columns:
            # Kita copy dulu 3 baris pertama
            preview_df = df[[raw_cols[0], col_norm]].head(3).copy()
            
            # Kita ubah jadi string agar to_markdown tidak error saat ketemu List
            for c in preview_df.columns:
                preview_df[c] = preview_df[c].astype(str)
                
            print(preview_df.to_markdown(index=False))
        else:
            print(f"‚ö†Ô∏è Kolom normalized untuk {col_base} tidak ditemukan.")

    # 4. Statistik Ringkas
    print(f"\nüìê Dimensi: {df.shape}")

In [5]:
df_stats = read_df_from_minio(BUCKET_NAME, "bronze/hero_stats/bronze_hero_stats.parquet", file_format='parquet')
validate_bronze_schema(df_stats, "Hero Stats")

# Deep Dive: Cek Tipe Data Angka
if df_stats is not None:
    print("\nüî¢ Cek Tipe Data Numerik:")
    print(df_stats[['win_rate', 'pick_rate']].dtypes)
    
    # Pastikan tidak ada object
    if df_stats['win_rate'].dtype == 'float64':
        print("‚úÖ Win Rate sukses dikonversi ke Float.")
    else:
        print("‚ùå Win Rate masih String/Object.")


üõ°Ô∏è VALIDASI BRONZE: Hero Stats
‚úÖ Metadata Check: OK (ingested_at exists)

üß© Cek Struktur Array (pick_rate):
   - Tipe Data Python: <class 'numpy.float64'>
   - Contoh Isi: 1.04
   ‚ùå INVALID: Data tersimpan sebagai String (Perlu fix di transformation).

DATA COMPARISON (Raw vs Normalized):
| hero_name_raw   | hero_name_normalized   |
|:----------------|:-----------------------|
| Aamon           | aamon                  |
| Akai            | akai                   |
| Aldous          | aldous                 |

üìê Dimensi: (130, 8)

üî¢ Cek Tipe Data Numerik:
win_rate     float64
pick_rate    float64
dtype: object
‚úÖ Win Rate sukses dikonversi ke Float.


In [None]:
df_mpl = read_df_from_minio(BUCKET_NAME, "bronze/tournament_matches/bronze_mpl_matches.parquet", file_format='parquet')
validate_bronze_schema(df_mpl, "MPL Matches (ID + PH + MY)")

if df_mpl is not None:
    print("\nüåç Validasi Union (Region Distribution):")
    print(df_mpl['region'].value_counts())
    
    print("\nüìÇ Validasi Source File:")
    print(df_mpl['source_file'].value_counts())
    
    print("\nüïµÔ∏è Intip Normalisasi Array:")
    # Kita lihat apakah tahun '2024' benar-benar hilang di kolom normalized
    cols_check = ['left_picks_raw', 'left_picks_normalized']
    if all(c in df_mpl.columns for c in cols_check):
        print(df_mpl[cols_check].head(3))
        display(df_mpl[cols_check].head(3))
    else:
        print('tidak sesuai')


üõ°Ô∏è VALIDASI BRONZE: MPL Matches (ID + PH)
‚úÖ Metadata Check: OK (ingested_at exists)

üß© Cek Struktur Array (left_bans_raw):
   - Tipe Data Python: <class 'numpy.ndarray'>
   - Contoh Isi: ['wanwan' 'yi sun shin' 'fanny' 'selena' 'uranus']
   ‚úÖ VALID: Data tersimpan sebagai List/Array.

DATA COMPARISON (Raw vs Normalized):
| left_bans_raw                                         | left_bans_normalized                                  |
|:------------------------------------------------------|:------------------------------------------------------|
| ['wanwan' 'yi sun shin' 'fanny' 'selena' 'uranus']    | ['wanwan' 'yi sun shin' 'fanny' 'selena' 'uranus']    |
| ['phoveus' 'fanny' 'arlott' 'uranus' 'hilda']         | ['phoveus' 'fanny' 'arlott' 'uranus' 'hilda']         |
| ['wanwan' 'lancelot' 'yi sun shin' 'selena' 'kadita'] | ['wanwan' 'lancelot' 'yi sun shin' 'selena' 'kadita'] |

üìê Dimensi: (414, 17)

üåç Validasi Union (Region Distribution):
region
ID    170
PH    14

Unnamed: 0,left_picks_raw,left_picks_normalized
0,"[cici, joy, pharsa, claude, hylos]","[cici, joy, pharsa, claude, hylos]"
1,"[cici, lancelot, pharsa, granger, gatotkaca]","[cici, lancelot, pharsa, granger, gatotkaca]"
2,"[uranus, fanny, pharsa, moskov, gatotkaca]","[uranus, fanny, pharsa, moskov, gatotkaca]"


In [7]:
# 1. Meta Tier
df_meta = read_df_from_minio(BUCKET_NAME, "bronze/meta/bronze_hero_meta.parquet", file_format='parquet')
validate_bronze_schema(df_meta, "Meta Tier")

if df_meta is not None:
    print("\nüèÜ Cek Konversi Tier ke Score:")
    # Tampilkan sample unik untuk memverifikasi mapping
    print(df_meta[['tier_raw', 'tier_score']].drop_duplicates().sort_values('tier_score', ascending=False).to_markdown(index=False))

# 2. Counter
df_counter = read_df_from_minio(BUCKET_NAME, "bronze/counter_hero/bronze_hero_counter.parquet", file_format='parquet')
validate_bronze_schema(df_counter, "Hero Counter")


üõ°Ô∏è VALIDASI BRONZE: Meta Tier
‚úÖ Metadata Check: OK (ingested_at exists)

DATA COMPARISON (Raw vs Normalized):
| hero_name_raw   | hero_name_normalized   |
|:----------------|:-----------------------|
| Gloo            | gloo                   |
| Aamon           | aamon                  |
| Angela          | angela                 |

üìê Dimensi: (130, 9)

üèÜ Cek Konversi Tier ke Score:
| tier_raw   |   tier_score |
|:-----------|-------------:|
| SS         |            5 |
| S          |            4 |
| A          |            3 |
| B          |            2 |
| C          |            1 |
| D          |            0 |

üõ°Ô∏è VALIDASI BRONZE: Hero Counter
‚úÖ Metadata Check: OK (ingested_at exists)

DATA COMPARISON (Raw vs Normalized):
| hero_name_raw   | hero_name_normalized   |
|:----------------|:-----------------------|
| Miya            | miya                   |
| Miya            | miya                   |
| Miya            | miya                   |

üìê Dimensi