## Generate Base Description

In [13]:
import json
import os
import re
from typing import Any, Dict, Optional, Tuple

import pandas as pd

def extract_params_from_title(title: str) -> Tuple[str, str, Optional[int]]:
    """
    Split a column title into its measurement name, unit, and year.

    Args:
        title: Full column header, e.g.
            "Kepadatan Penduduk Menurut Kecamatan di Kota Malang (jiwa/km2), Tahun 2015".

    Returns:
        A ``(clean_title, unit, year)`` tuple. ``unit`` is "" when no unit is
        found and ``year`` is ``None`` when the title has no "Tahun YYYY".
    """

    def extract_short_title(title: str) -> str:
        """
        Drop the trailing "Menurut Kecamatan ..." / "di Kota Malang ..." part
        of the title, together with everything that follows it.
        """
        patterns = [
            r"\s*Menurut Kecamatan.*",
            # Was "/s*di Kota Malang.*": a literal slash instead of the
            # whitespace class, so this pattern could never match.
            r"\s*di Kota Malang.*",
        ]

        for pattern in patterns:
            title = re.sub(pattern, "", title).strip()

        return title

    def extract_unit(title: str) -> str:
        """
        Return the text of the last top-level parenthesised group in the
        title. Nested parentheses are kept as part of that text.
        """
        if "(" not in title or ")" not in title:
            return ""

        stack = []
        last_unit = ""
        current_unit = ""
        in_parentheses = False

        for char in title:
            if char == '(':
                stack.append(char)
                if len(stack) == 1:
                    # A new top-level group starts: reset the buffer.
                    in_parentheses = True
                    current_unit = ""
                else:
                    current_unit += char
            elif char == ')':
                if not stack:
                    # Stray closing parenthesis: ignore it.
                    continue
                stack.pop()
                if stack:  # Still inside an outer group
                    current_unit += char
                else:  # Top-level group closed
                    in_parentheses = False
                    last_unit = current_unit
            elif in_parentheses:
                current_unit += char

        return last_unit.strip()

    def keyword_unit(text: str) -> str:
        """Map a spelled-out unit found anywhere in the text to its symbol, or return ""."""
        text_lower = text.lower()
        if "meter kubik" in text_lower:
            return "m³"
        if "meter persegi" in text_lower:
            return "m²"
        if "jiwa/km2" in text_lower:
            return "jiwa/km²"
        if "hektar" in text_lower:
            return "ha"
        return ""

    def extract_unit_flexible(title: str) -> str:
        """Prefer a known unit keyword; otherwise fall back to the parenthesised text."""
        return keyword_unit(title) or extract_unit(title)

    def normalize_unit(unit: str) -> str:
        """Rewrite ASCII unit spellings (m3, m2, km2) with superscripts and lowercase a trailing "ha"."""
        replacements = {
            r"^m3$": "m³",
            r"^m2$": "m²",
            r"km2$": "km²",
            r"ha$": "ha"
        }
        for pattern, repl in replacements.items():
            unit = re.sub(pattern, repl, unit.strip(), flags=re.IGNORECASE)
        return unit

    # Year: the four digits following the word "Tahun".
    year_match = re.search(r'\bTahun (\d{4})\b', title)
    year = int(year_match.group(1)) if year_match else None

    # Unit: keep the text exactly as written in the title as well, because the
    # normalised form (e.g. "km²") no longer matches the title's "(km2)".
    raw_unit = extract_unit(title)
    unit = normalize_unit(extract_unit_flexible(title))

    # Clean the title
    clean_title = extract_short_title(title)
    if year_match:
        clean_title = clean_title.replace(year_match.group(0), '').strip()
    # Remove the parenthetical as written, but only when it is really where
    # the unit came from (so an unrelated "(...)" qualifier is left alone).
    if raw_unit and normalize_unit(keyword_unit(raw_unit) or raw_unit) == unit:
        as_written = r"\(\s*" + re.escape(raw_unit) + r"\s*\)"
        clean_title = re.sub(as_written, "", clean_title).strip()
    if unit:
        clean_title = clean_title.replace(f'({unit})', '').strip()

    return clean_title, unit, year


def load_and_process_data(
    file_path: str,
    target_kota: str = "KOTA MALANG",
    encoding: Optional[str] = None,
) -> Tuple[pd.DataFrame, Dict, Any, str]:
    """
    Load a district-level CSV and separate the city-wide row from the districts.

    The second column's header is parsed for title, unit and year; the row
    whose 'Kecamatan' equals ``target_kota`` is pulled out as the city value.

    Args:
        file_path: Path to the CSV file.
        target_kota: Value in the 'Kecamatan' column that marks the city-wide row.
        encoding: Text encoding passed to ``pd.read_csv``. ``None`` keeps
            pandas' default, which is what this function used before.

    Returns:
        ``(df, params, kota_value, target_kota)`` where ``df`` holds the
        remaining rows indexed by 'Kecamatan', ``params`` has the keys
        'title', 'unit', 'year' and 'value_column', and ``kota_value`` is the
        value-column entry of the ``target_kota`` row.

    Raises:
        KeyError: If the file has no 'Kecamatan' column.
        IndexError: If the file has fewer than two columns or no row matches
            ``target_kota``.
    """
    df = pd.read_csv(file_path, encoding=encoding)
    value_column = df.columns[1]

    # Extract parameters from the value column's header
    title, unit, year = extract_params_from_title(value_column)

    # Take the city-wide value, then keep only the district rows
    is_city_row = df['Kecamatan'] == target_kota
    kota_value = df.loc[is_city_row, value_column].values[0]
    df = df[~is_city_row].set_index('Kecamatan')

    params = {
        'title': title,
        'unit': unit,
        'year': year,
        'value_column': value_column
    }
    return df, params, kota_value, target_kota


def define_adverb(title: str) -> str:
    """
    Pick the Indonesian quantity adverb ("sebanyak" or "sebesar") for a title.

    The choice is driven by the title's first word; unknown first words and
    empty or whitespace-only titles fall back to "sebanyak".
    """
    rules = {
        "Jumlah": "sebanyak",
        "Luas": "sebesar",
        "Produksi": "sebanyak",
        "Daya": "sebesar",
        "Nilai": "sebesar",
        "Pertumbuhan": "sebesar",
        "Kepadatan": "sebesar",
        "Rasio": "sebesar",
        "Banyaknya": "sebanyak"
    }
    default = "sebanyak"
    words = title.split()
    if not words:
        # An empty title used to raise IndexError on split()[0].
        return default
    return rules.get(words[0], default)


def generate_base_description(df: pd.DataFrame, params: Dict, kota_value: Any, target_kota: str) -> str:
    """
    Build a one-sentence Indonesian summary naming the districts with the
    highest and the lowest value.

    Args:
        df: Frame indexed by district name; ``params['value_column']`` holds
            the values, which may be strings and may use '-' for "no data".
        params: Needs the keys 'title', 'unit' and 'value_column'.
        kota_value: City-wide value. Not used by the current sentence; kept so
            existing callers keep working.
        target_kota: City row label. Not used by the current sentence; kept so
            existing callers keep working.

    Returns:
        The summary sentence.

    Raises:
        ValueError: If the value column has no numeric entry to rank.
    """
    # NOTE: an earlier, longer narrative (per-district listing, missing-data
    # note, city total) lived here as commented-out code; it was the only
    # user of kota_value and target_kota.

    def format_value(value: Any) -> str:
        """Format a number Indonesian-style ('.' thousands, ',' decimals), with an "Rp" prefix for rupiah units."""
        if isinstance(value, str):
            if value == '-':
                return value
            try:
                value = float(value)
            except ValueError:
                return value
        else:
            value = float(value)

        # Swap the separators produced by the "," format spec, then drop
        # trailing zeros and a dangling decimal comma.
        formatted = f"{value:,.2f}"
        formatted = formatted.replace(",", "X").replace(".", ",").replace("X", ".")
        formatted = formatted.rstrip("0").rstrip(",")

        if "Rp" in params['unit']:
            return f"Rp{formatted}"
        return formatted

    kata_keterangan = define_adverb(params['title'])
    unit = params['unit']
    # Rupiah values carry their unit as a prefix; an empty unit must not
    # leave a stray space before the following punctuation.
    unit_suffix = "" if (not unit or "Rp" in unit) else f" {unit}"

    # Rank numerically. A '-' placeholder turns the whole column into strings,
    # and sorting those directly is lexicographic ("9" would outrank "10").
    # Entries that are not numbers ('-' included) become NaN and are left out.
    numeric = pd.to_numeric(df[params['value_column']], errors='coerce')
    ranked = numeric.dropna().sort_values(ascending=False)
    if ranked.empty:
        raise ValueError(
            f"No numeric values to rank in column {params['value_column']!r}"
        )

    highest_name, highest_value = ranked.index[0], ranked.iloc[0]
    lowest_name, lowest_value = ranked.index[-1], ranked.iloc[-1]

    desc = (
        f"Dengan demikian, Kecamatan {highest_name} memiliki {params['title']} tertinggi {kata_keterangan} "
        f"{format_value(highest_value)}{unit_suffix}, "
        f"sedangkan Kecamatan {lowest_name} memiliki nilai terendah "
        f"{kata_keterangan} {format_value(lowest_value)}{unit_suffix} di Kota Malang."
    )

    return desc

In [14]:
def detect_encoding(file_path):
    """
    Return the first candidate encoding that decodes the whole file.

    Candidates are tried from strictest to most permissive: 'utf-8', then
    'cp1252', then 'latin1'. 'latin1' maps every byte value, so it acts as the
    last-resort fallback; it used to sit ahead of 'cp1252', which made
    'cp1252' unreachable.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The name of the matching encoding, or ``None`` if no candidate
        decodes the file.
    """
    candidates = ('utf-8', 'cp1252', 'latin1')

    # Read the bytes once and decode in memory instead of reopening the file
    # for every candidate.
    with open(file_path, 'rb') as f:
        raw = f.read()

    for encoding in candidates:
        try:
            raw.decode(encoding)
        except UnicodeDecodeError:
            continue
        return encoding
    return None

def process_folder(input_folder_path: str, output_folder_path: str) -> None:
    """
    Generate a base description for every ``.csv`` file in a folder.

    For each file (in sorted name order) a ``<name>.json`` with metadata plus
    description and a ``<name>.txt`` with the description alone are written to
    ``output_folder_path``, which is created if missing. A failure on one file
    is printed and the loop moves on to the next file.
    """
    os.makedirs(output_folder_path, exist_ok=True)
    csv_files = sorted(
        name for name in os.listdir(input_folder_path) if name.endswith('.csv')
    )

    for file_name in csv_files:
        try:
            input_file = os.path.join(input_folder_path, file_name)

            # Skip files that none of the candidate encodings can decode
            encoding = detect_encoding(input_file)
            if not encoding:
                print(f"Could not detect encoding for {file_name}")
                continue

            # NOTE(review): this frame is replaced on the next line, and
            # load_and_process_data reads the file again without the detected
            # encoding, so the detection only affects this first read and the
            # success message.
            df = pd.read_csv(input_file, encoding=encoding)
            df, params, kota_value, target_kota = load_and_process_data(input_file)
            base_description = generate_base_description(df, params, kota_value, target_kota)

            base_filename, _ = os.path.splitext(file_name)
            metadata = {key: params[key] for key in ("title", "unit", "year")}
            result = {
                base_filename: {
                    "metadata": metadata,
                    "base_description": base_description,
                }
            }

            # JSON output: metadata together with the description
            output_file = os.path.join(output_folder_path, f'{base_filename}.json')
            with open(output_file, 'w', encoding='utf-8') as json_out:
                json.dump(result, json_out, ensure_ascii=False, indent=2)

            # Plain-text output: the description on its own
            output_txt_file = os.path.join(output_folder_path, f'{base_filename}.txt')
            with open(output_txt_file, 'w', encoding='utf-8') as txt_out:
                txt_out.write(base_description)

            print(f"Successfully processed {file_name} with {encoding} encoding")

        except Exception as e:
            # Best effort per file: report the problem and keep going
            print(f"Error processing {file_name}: {e}")

In [21]:
# Run the batch over the table dataset folder and write the .json/.txt
# results to the output folder. Both paths are absolute and specific to the
# author's machine; adjust them before running elsewhere.
process_folder(
    input_folder_path='C:/Users/ASUS/OneDrive - Politeknik Statistika STIS/Dokumen/KULIAH/SEMESTER 7/[2] SKRIPSI/[4] PENGOLAHAN/Table Dataset',
    output_folder_path='C:/Users/ASUS/OneDrive - Politeknik Statistika STIS/Dokumen/KULIAH/SEMESTER 7/[2] SKRIPSI/[4] PENGOLAHAN/Generate Descriptions Version 2/base_tambahan_reasoning'
)

Successfully processed T0001.csv with utf-8 encoding
Successfully processed T0002.csv with utf-8 encoding
Successfully processed T0003.csv with utf-8 encoding
Successfully processed T0004.csv with utf-8 encoding
Successfully processed T0005.csv with utf-8 encoding
Successfully processed T0006.csv with utf-8 encoding
Successfully processed T0007.csv with utf-8 encoding
Successfully processed T0008.csv with utf-8 encoding
Successfully processed T0009.csv with utf-8 encoding
Successfully processed T0010.csv with utf-8 encoding
Successfully processed T0011.csv with utf-8 encoding
Successfully processed T0012.csv with utf-8 encoding
Successfully processed T0013.csv with utf-8 encoding


Successfully processed T0014.csv with utf-8 encoding
Successfully processed T0015.csv with utf-8 encoding
Successfully processed T0016.csv with utf-8 encoding
Successfully processed T0017.csv with utf-8 encoding
Successfully processed T0018.csv with utf-8 encoding
Successfully processed T0019.csv with utf-8 encoding
Successfully processed T0020.csv with utf-8 encoding
Successfully processed T0021.csv with utf-8 encoding
Successfully processed T0022.csv with utf-8 encoding
Successfully processed T0023.csv with utf-8 encoding
Successfully processed T0024.csv with utf-8 encoding
Successfully processed T0025.csv with utf-8 encoding
Successfully processed T0026.csv with utf-8 encoding
Successfully processed T0027.csv with utf-8 encoding
Successfully processed T0028.csv with utf-8 encoding
Successfully processed T0029.csv with utf-8 encoding
Successfully processed T0030.csv with utf-8 encoding
Successfully processed T0031.csv with utf-8 encoding
Successfully processed T0032.csv with utf-8 en

In [22]:
import json

def view_json_file(file_path):
    """
    Open a JSON file and print its contents.

    The data is printed with a 4-space indent and non-ASCII characters left
    as-is. Problems are reported with a printed message instead of an
    exception.

    Args:
        file_path: Path of the JSON file to open.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.loads(handle.read())

        rendered = json.dumps(payload, indent=4, ensure_ascii=False)
        print(rendered)

    except FileNotFoundError:
        print(f"File tidak ditemukan: {file_path}")
    except json.JSONDecodeError:
        print(f"File bukan JSON yang valid: {file_path}")
    except Exception as e:
        print(f"Terjadi kesalahan: {e}")

In [24]:
# Spot-check one generated result (T0234) by printing its JSON.
# The path is absolute and specific to the author's machine.
file_path = 'C:/Users/ASUS/OneDrive - Politeknik Statistika STIS/Dokumen/KULIAH/SEMESTER 7/[2] SKRIPSI/[4] PENGOLAHAN/Generate Descriptions Version 2/base_tambahan_reasoning/T0234.json'
view_json_file(file_path)

{
    "T0234": {
        "metadata": {
            "title": "Kepadatan Penduduk",
            "unit": "jiwa/km²",
            "year": 2015
        },
        "base_description": "Dengan demikian, Kecamatan Klojen memiliki Kepadatan Penduduk tertinggi sebesar 11.792 jiwa/km², sedangkan Kecamatan Kedungkandang memiliki nilai terendah sebesar 4.665 jiwa/km² di Kota Malang."
    }
}
