In [None]:
# script_0_exploration.py
import pandas as pd
import numpy as np
import glob
import os
import openpyxl

### Normalize Climate Data

In [44]:
def simplify_filename(data_folder):
    all_files = glob.glob(os.path.join(data_folder, "*"))
    
    if not all_files:
        return
    
    renamed_count = 0
    skipped_count = 0
    error_count = 0
    
    for file_path in all_files:
        if not os.path.isfile(file_path):
            continue
            
        original_name = os.path.basename(file_path)
        directory = os.path.dirname(file_path)
        file_ext = os.path.splitext(original_name)[1]
        name_without_ext = os.path.splitext(original_name)[0]

        normalized_name = name_without_ext.lower()
        normalized_name = normalized_name.replace(' ', '_')
        normalized_name = normalized_name.replace('-', '_')
   
        while '__' in normalized_name:
            normalized_name = normalized_name.replace('__', '_')

        normalized_name = normalized_name.strip('_')

        normalized_name += file_ext.lower()

        new_file_path = os.path.join(directory, normalized_name)

        if original_name == normalized_name:
            skipped_count += 1
            continue
        
        try:
            if os.path.exists(new_file_path):
                error_count += 1
                continue
                
            os.rename(file_path, new_file_path)
            renamed_count += 1
            
        except Exception as e:
            error_count += 1

if __name__ == "__main__":
    # Update path to go up 2 levels from src/exp_data_structure
    data_folder = r'..\..\data\data_iklim_2023'
    
    print(f"Current working directory: {os.getcwd()}")
    print(f"Attempting to rename files in: {data_folder}\n")
    
    simplify_filename(data_folder)
    print("\nFile renaming complete!")

Current working directory: d:\Kuliah\Semester 7\Data Wrangling\UTS\surabaya_climate_flood_fusion\src\exp_data_structure
Attempting to rename files in: ..\..\data\data_iklim_2023


File renaming complete!


In [46]:
folder_path = r"..\..\data\data_iklim_2023"
output_path = r"..\..\data\normalize"


files = glob.glob(os.path.join(folder_path, "*tanjung*.xlsx"))

if not files:
    raise SystemExit

for f in files:
    df_raw = pd.read_excel(f, header=None)

    header_row = df_raw[df_raw.eq("TANGGAL").any(axis=1)].index[0]

    df = pd.read_excel(f, header=header_row)

    df = df.drop(range(0, header_row + 1))

    ket_row = df[df.eq("KETERANGAN:").any(axis=1)].index

    if len(ket_row) > 0:
        stop_at = ket_row[0]
        df = df.loc[:stop_at - 1] 

    df = df.reset_index(drop=True)

    out_name = os.path.splitext(os.path.basename(f))[0] + "_clean.csv"
    df.to_csv(os.path.join(output_path, out_name), index=False)

print("File cleaned and saved.")


File cleaned and saved.


### Normalize X data

In [None]:

# Load file
df = pd.read_excel("../../data/data_iklim_2023/data_sraping_x_2023.xlsx")


# melt all tweet columns
tweet_cols = [col for col in df.columns if col.startswith("tweet")]

df_long = df.melt(a
    id_vars="TANGGAL",
    value_vars=tweet_cols,
    var_name="tweet_id",
    value_name="tweet_text"
)

# drop empty rows
df_long = df_long.dropna(subset=["tweet_text"])
df_long = df_long[df_long["tweet_text"].str.strip() != ""]

# sort by date
df_long = df_long.sort_values(["TANGGAL"])

# save to csv
df_long.to_csv(r"../../data/normalize/tweets_normalized.csv", index=False)

df_long.head()


Unnamed: 0,TANGGAL,tweet_id,tweet_text
17,2023-01-18,tweet1,hujan deres
1112,2023-01-18,tweet4,dengerin suara hujan
382,2023-01-18,tweet2,hujan petir suara pesawat deket sekali admin
1477,2023-01-18,tweet5,hati hati ya pulangnya sedia jas hujan
747,2023-01-18,tweet3,surabaya hujan


In [None]:
print("CWD:", os.getcwd())
print("List root:", os.listdir(os.getcwd()))


CWD: d:\Kuliah\Semester 7\Data Wrangling\UTS\surabaya_climate_flood_fusion\src\exp_data_structure
List root: ['explore_data_before_processing.ipynb']


In [None]:
print("CWD:", os.getcwd())
print("List parent:", os.listdir("../"))
print("List grandparent:", os.listdir("../../"))


CWD: d:\Kuliah\Semester 7\Data Wrangling\UTS\surabaya_climate_flood_fusion\src\exp_data_structure
List parent: ['data_normalize', 'exp_data_structure', 'processing']
List grandparent: ['.git', '.gitignore', '.python-version', '.venv', 'data', 'main.py', 'pyproject.toml', 'README.md', 'src', 'uv.lock']


In [None]:
print("Listing data/data_iklim_2023:")
print(os.listdir("../../data/data_iklim_2023"))


Listing data/data_iklim_2023:
['data_sraping_x_2023.xlsx', 'tanjung_perak_1_agustus_2023.xlsx', 'tanjung_perak_1_april_2023.xlsx', 'tanjung_perak_1_desember_2023.xlsx', 'tanjung_perak_1_februari_2023.xlsx', 'tanjung_perak_1_januari_2023.xlsx', 'tanjung_perak_1_juli_2023.xlsx', 'tanjung_perak_1_juni_2023.xlsx', 'tanjung_perak_1_maret_2023.xlsx', 'tanjung_perak_1_mei_2023.xlsx', 'tanjung_perak_1_november_2023.xlsx', 'tanjung_perak_1_oktober_2023.xlsx', 'tanjung_perak_1_september_2023.xlsx']
