In [1]:
# script_0_exploration.py
import pandas as pd
import numpy as np
import glob
import os


In [None]:
import os
import glob

def simplify_filename(data_folder):
    """
    Normalize the names of the regular files directly inside ``data_folder``.

    A file name is normalized by lower-casing it (extension included),
    turning spaces and hyphens into underscores and collapsing every run of
    consecutive underscores into a single one. Sub-directories are skipped.
    One status line is printed per file.

    A file is never renamed onto a *different* existing file: such a
    collision is reported and the file is left untouched, so no data is
    overwritten.

    Args:
        data_folder: Path of the folder whose files should be renamed.

    Returns:
        None. The renames happen on disk.
    """
    # glob.escape keeps folder names that contain glob metacharacters
    # ("[", "?", "*") from being interpreted as patterns.
    all_files = glob.glob(os.path.join(glob.escape(data_folder), "*"))

    # Sorted so that the processing order (and therefore which file wins a
    # name collision) is deterministic.
    for file_path in sorted(all_files):
        if not os.path.isfile(file_path):
            continue

        original_name = os.path.basename(file_path)
        directory = os.path.dirname(file_path)
        name_without_ext, file_ext = os.path.splitext(original_name)

        normalized_name = name_without_ext.lower()
        normalized_name = normalized_name.replace(' ', '_').replace('-', '_')
        # Loop until stable: a fixed number of replace() calls leaves
        # doubles behind for long runs (e.g. "a  -  b" -> "a_____b").
        while '__' in normalized_name:
            normalized_name = normalized_name.replace('__', '_')
        normalized_name += file_ext.lower()

        if original_name == normalized_name:
            print(f"Already normalized: {original_name}")
            continue

        new_file_path = os.path.join(directory, normalized_name)
        try:
            # os.rename silently replaces an existing target on POSIX.
            # samefile() lets case-only renames through on
            # case-insensitive filesystems, where the "existing" target is
            # the source file itself.
            if os.path.exists(new_file_path) and not os.path.samefile(
                file_path, new_file_path
            ):
                print(
                    f"Error renaming {original_name}: "
                    f"{normalized_name} already exists"
                )
                continue
            os.rename(file_path, new_file_path)
        except OSError as e:
            print(f"Error renaming {original_name}: {e}")
        else:
            print(f"Renamed: {original_name} → {normalized_name}")

# Usage
if __name__ == "__main__":
    # Built with os.path.join so the relative path also resolves on
    # non-Windows systems, where a backslash is an ordinary file-name
    # character rather than a separator.
    data_folder = os.path.join('data', 'data_iklim_2023')
    simplify_filename(data_folder)
    print("🎉 File normalization complete!")

File normalization complete!


In [None]:

def explore_raw_data(data_folder):
    """
    Step 0: Understand the raw data structure before cleaning.

    Prints the first 15 raw lines of the first ``iklim_*.csv`` file found in
    ``data_folder`` and, if a ``*scraping*.csv`` file exists, the shape,
    column names and first three rows of the first one.

    Args:
        data_folder: Folder containing the raw CSV files.

    Returns:
        None. Everything is reported on standard output.
    """
    print("=== PHASE 1: DATA EXPLORATION ===")

    # 1. Explore Climate Data Structure
    # Sorted so that the sampled file is the same on every run (glob order
    # is filesystem-dependent).
    iklim_files = sorted(glob.glob(os.path.join(data_folder, "iklim_*.csv")))
    print(f"Found {len(iklim_files)} climate files")

    # Check first file structure
    if iklim_files:
        sample_file = iklim_files[0]
        print(f"\n--- Structure of {os.path.basename(sample_file)} ---")

        # Read without skipping rows to see original structure.
        # errors='replace' keeps this preview usable on raw files that are
        # not valid UTF-8: undecodable bytes show up as U+FFFD instead of
        # aborting the exploration.
        with open(sample_file, 'r', encoding='utf-8', errors='replace') as f:
            # Iterate lazily and stop after 15 lines instead of loading the
            # whole file into memory.
            for i, line in enumerate(f):
                if i >= 15:
                    break
                print(f"Line {i}: {line.strip()}")

    # 2. Explore Scraping Data Structure
    scraping_files = sorted(glob.glob(os.path.join(data_folder, "*scraping*.csv")))
    if scraping_files:
        scraping_file = scraping_files[0]
        print("\n--- Structure of scraping data ---")
        scraping_df = pd.read_csv(scraping_file)
        print(f"Shape: {scraping_df.shape}")
        print(f"Columns: {list(scraping_df.columns)}")
        print("First few rows:")
        print(scraping_df.head(3))

def check_data_quality(data_folder, months=('januari', 'februari', 'maret'),
                       skiprows=8):
    """
    Check data quality issues before processing.

    For every month name in ``months`` the first CSV file in ``data_folder``
    whose name contains that month is loaded and a short report is printed:
    row count, min/max of the ``TANGGAL`` column, columns with missing
    values and the distinct values of the ``RR`` column (when present).

    Args:
        data_folder: Folder containing the monthly CSV files.
        months: Month-name fragments to look for in the file names. Pass a
            longer sequence to cover more months.
        skiprows: Number of leading lines to skip before the CSV header
            row, forwarded to ``pandas.read_csv``.

    Returns:
        None. Everything is reported on standard output.
    """
    print("\n=== DATA QUALITY CHECK ===")

    # Check for each month file
    for month in months:
        file_pattern = os.path.join(data_folder, f"*{month}*.csv")
        # Sorted so the chosen file is deterministic when several match.
        files = sorted(glob.glob(file_pattern))

        if not files:
            continue

        file_path = files[0]
        df = pd.read_csv(file_path, skiprows=skiprows, encoding='utf-8')

        print(f"\n--- {os.path.basename(file_path)} ---")
        print(f"Rows: {len(df)}")
        # A file without the expected column is reported rather than
        # aborting the checks for the remaining months with a KeyError.
        if 'TANGGAL' in df.columns:
            print(f"Date range: {df['TANGGAL'].min()} to {df['TANGGAL'].max()}")
        else:
            print("Date range: unavailable (no TANGGAL column)")

        # Check for missing values
        missing = df.isnull().sum()
        if missing.sum() > 0:
            print("Missing values:")
            print(missing[missing > 0])

        # Check for anomalous values in RR
        if 'RR' in df.columns:
            unique_rr = df['RR'].unique()
            try:
                ordered_rr = sorted(unique_rr)
            except TypeError:
                # Mixed types (e.g. text markers next to NaN/numbers) are
                # not mutually comparable; order by their text form so the
                # anomalous values are still listed.
                ordered_rr = sorted(unique_rr, key=str)
            print(f"Unique RR values: {ordered_rr}")

# Run exploration
if __name__ == "__main__":
    data_folder = "path/to/your/raw/data/folder"
    # Structure overview first, then the per-month quality report.
    for step in (explore_raw_data, check_data_quality):
        step(data_folder)