In [1]:
# script_0_exploration.py
import pandas as pd
import numpy as np
import glob
import os


In [None]:
def simplify_filename(data_folder):
    all_files = glob.glob(os.path.join(data_folder, "*"))
    
    if not all_files:
        return
    
    renamed_count = 0
    skipped_count = 0
    error_count = 0
    
    for file_path in all_files:
        if not os.path.isfile(file_path):
            continue
            
        original_name = os.path.basename(file_path)
        directory = os.path.dirname(file_path)
        file_ext = os.path.splitext(original_name)[1]
        name_without_ext = os.path.splitext(original_name)[0]

        normalized_name = name_without_ext.lower()
        normalized_name = normalized_name.replace(' ', '_')
        normalized_name = normalized_name.replace('-', '_')
   
        while '__' in normalized_name:
            normalized_name = normalized_name.replace('__', '_')

        normalized_name = normalized_name.strip('_')

        normalized_name += file_ext.lower()

        new_file_path = os.path.join(directory, normalized_name)

        if original_name == normalized_name:
            skipped_count += 1
            continue
        
        try:
            if os.path.exists(new_file_path):
                error_count += 1
                continue
                
            os.rename(file_path, new_file_path)
            renamed_count += 1
            
        except Exception as e:
            error_count += 1

if __name__ == "__main__":
    # Update path to go up 2 levels from src/exp_data_structure
    data_folder = r'..\..\data\data_iklim_2023'
    
    print(f"Current working directory: {os.getcwd()}")
    print(f"Attempting to rename files in: {data_folder}\n")
    
    simplify_filename(data_folder)
    print("\n✓ File normalization complete!")

Current working directory: d:\Kuliah\Semester 7\Data Wrangling\UTS\surabaya_climate_flood_fusion\src\exp_data_structure
Attempting to rename files in: ..\..\data\data_iklim_2023

✓ Folder found: d:\Kuliah\Semester 7\Data Wrangling\UTS\surabaya_climate_flood_fusion\data\data_iklim_2023
✓ Found 12 items

✓ Renamed: 'Tanjung Perak 1 - Agustus 2023.xlsx' → 'tanjung_perak_1_agustus_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - April 2023.xlsx' → 'tanjung_perak_1_april_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - Desember 2023.xlsx' → 'tanjung_perak_1_desember_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - Februari 2023.xlsx' → 'tanjung_perak_1_februari_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - Januari 2023.xlsx' → 'tanjung_perak_1_januari_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - Juli 2023.xlsx' → 'tanjung_perak_1_juli_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - Juni 2023.xlsx' → 'tanjung_perak_1_juni_2023.xlsx'
✓ Renamed: 'Tanjung Perak 1 - Maret 2023.xlsx' → 'tanjung_perak_1_maret_2023.xlsx'
✓ Renamed: 'Tanj

In [12]:

def explore_raw_data(data_folder):
    """
    Step 0: Understand the raw data structure before cleaning
    """
    print("=== PHASE 1: DATA EXPLORATION ===")
    
    # 1. Explore Climate Data Structure
    iklim_files = glob.glob(os.path.join(data_folder, "Tanjung*.xlsx"))
    print(f"Found {len(iklim_files)} climate files")
    
    # Check first file structure
    if iklim_files:
        sample_file = iklim_files[0]
        print(f"\n--- Structure of {os.path.basename(sample_file)} ---")
        
        # Read without skipping rows to see original structure
        with open(sample_file, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f.readlines()[:15]):  # First 15 lines
                print(f"Line {i}: {line.strip()}")
    
    # 2. Explore Scraping Data Structure
    scraping_files = glob.glob(os.path.join(data_folder, "*scraping*.csv"))
    if scraping_files:
        scraping_file = scraping_files[0]
        print(f"\n--- Structure of scraping data ---")
        scraping_df = pd.read_csv(scraping_file)
        print(f"Shape: {scraping_df.shape}")
        print(f"Columns: {list(scraping_df.columns)}")
        print(f"First few rows:")
        print(scraping_df.head(3))

def check_data_quality(data_folder):
    """
    Check data quality issues before processing
    """
    print("\n=== DATA QUALITY CHECK ===")
    
    # Check for each month file
    for month in ['januari', 'februari', 'maret']:  # Add all months
        file_pattern = os.path.join(data_folder, f"*{month}*.csv")
        files = glob.glob(file_pattern)
        
        if files:
            file_path = files[0]
            df = pd.read_csv(file_path, skiprows=8, encoding='utf-8')
            
            print(f"\n--- {os.path.basename(file_path)} ---")
            print(f"Rows: {len(df)}")
            print(f"Date range: {df['TANGGAL'].min()} to {df['TANGGAL'].max()}")
            
            # Check for missing values
            missing = df.isnull().sum()
            if missing.sum() > 0:
                print("Missing values:")
                print(missing[missing > 0])
            
            # Check for anomalous values in RR
            if 'RR' in df.columns:
                unique_rr = df['RR'].unique()
                print(f"Unique RR values: {sorted(unique_rr)}")

# Run exploration
if __name__ == "__main__":
    data_folder = "path/to/your/raw/data/folder"
    explore_raw_data(data_folder)
    check_data_quality(data_folder)

=== PHASE 1: DATA EXPLORATION ===
Found 0 climate files

=== DATA QUALITY CHECK ===
