# Import Libraries

In [1]:
import json
import pandas as pd
import re

# Data Preparation Functions

### Data Gathering

In [2]:
def read_json(file_path):
    '''
    Load and parse a JSON file.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's default locale encoding.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        data (list): Parsed JSON content. The caller in this notebook expects
            a list of dictionaries.

    Raises:
        FileNotFoundError: If file_path does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    '''
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)

def load_json_to_dataframe(file_path):
    '''
    Flatten the nested topic/subtopic JSON file into a DataFrame.

    Each entry of every item's 'subtopics' list becomes one row, carrying the
    parent item's 'topic' alongside the subtopic's own fields.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        df (pd.DataFrame): DataFrame with the columns 'topic', 'subtopic',
            'content' and 'url', one row per subtopic.
    '''
    # Pair every subtopic with its parent item, preserving file order
    pairs = [
        (item, subtopic)
        for item in read_json(file_path)
        for subtopic in item['subtopics']
    ]

    return pd.DataFrame({
        'topic': [item['topic'] for item, _ in pairs],
        'subtopic': [subtopic['subtopic'] for _, subtopic in pairs],
        'content': [subtopic['content'] for _, subtopic in pairs],
        'url': [subtopic['subtopic_url'] for _, subtopic in pairs],
    })

### Text Cleaning

In [None]:
def get_cleaned_text_data(df):
    '''
    Clean the 'topic', 'subtopic' and 'content' text columns.

    Every character outside printable ASCII (0x20-0x7E), except the newline,
    is replaced with a space, and each run of two or more spaces is then
    collapsed into a single space. Values that are not strings are left
    untouched. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): DataFrame containing the data to be cleaned.

    Returns:
        df_prepared (pd.DataFrame): DataFrame with cleaned text data.
    '''
    # Anything that is neither printable ASCII nor a newline
    unwanted_chars = re.compile(r"[^\x20-\x7E\n]")
    # A whole run of spaces, so one pass collapses runs of any length
    space_run = re.compile(r" {2,}")

    def _clean(text):
        # Leave missing or non-text values as they are
        if not isinstance(text, str):
            return text
        return space_run.sub(" ", unwanted_chars.sub(" ", text))

    df_prepared = df.copy()

    # Column-wise mapping is positional, so duplicated index labels are safe
    for column in ('topic', 'subtopic', 'content'):
        df_prepared[column] = df_prepared[column].map(_clean)
    return df_prepared

def get_prepared_content_page(df):
    '''
    Prefix every content entry with its topic and subtopic labels.

    Args:
        df (pd.DataFrame): DataFrame containing the data to be prepared.
    Returns:
        df_prepared (pd.DataFrame): Copy of df whose 'content' column starts
            with the 'Topik: ..., Subtopik: ...' header line.
    '''
    result = df.copy()

    # Build the label header first, then attach the original content below it
    header = 'Topik: ' + result['topic'] + ', Subtopik: ' + result['subtopic']
    result['content'] = header + ' \n ' + result['content']
    return result

def get_prepared_data(df):
    '''
    Run the full preparation pipeline: text cleaning, then content prefixing.

    Args:
        df (pd.DataFrame): DataFrame containing the data to be prepared.
    Returns:
        df_prepared (pd.DataFrame): DataFrame with prepared data.
    '''
    # Clean first so the topic/subtopic header is built from cleaned text
    return get_prepared_content_page(get_cleaned_text_data(df))

In [4]:
# Run the pipeline: load the raw JSON, clean the text, and prefix each content with its topic and subtopic
df = load_json_to_dataframe('../data/cvd.json')
df_prepared = get_prepared_data(df)

df_prepared.head()

Unnamed: 0,topic,subtopic,content,url
0,Hipertensi,Pengertian Hipertensi,"Topik: Hipertensi, Subtopik: Pengertian Hipert...",https://www.alodokter.com/hipertensi
1,Hipertensi,Penyebab Hipertensi,"Topik: Hipertensi, Subtopik: Penyebab Hiperten...",https://www.alodokter.com/hipertensi/penyebab
2,Hipertensi,Penyebab Hipertensi: Faktor Risiko Hipertensi,"Topik: Hipertensi, Subtopik: Penyebab Hiperten...",https://www.alodokter.com/hipertensi/penyebab
3,Hipertensi,Gejala Hipertensi,"Topik: Hipertensi, Subtopik: Gejala Hipertensi...",https://www.alodokter.com/hipertensi/gejala
4,Hipertensi,Gejala Hipertensi: Kapan Harus ke Dokter,"Topik: Hipertensi, Subtopik: Gejala Hipertensi...",https://www.alodokter.com/hipertensi/gejala


In [6]:
# Count how often each non-ASCII character (code point > 127) occurs in the prepared text
unicode_count = {}

# Combine the text columns into a standalone Series so df_prepared itself is never modified
combined_content = df_prepared['topic'] + ' ' + df_prepared['subtopic'] + ' ' + df_prepared['content']
for text in combined_content:
    for char in text:
        if ord(char) > 127:
            unicode_count[char] = unicode_count.get(char, 0) + 1

# Most frequent characters first
sorted_unicode_count = dict(sorted(unicode_count.items(), key=lambda item: item[1], reverse=True))

sorted_unicode_count

{}

In [7]:
# Display the prepared DataFrame
df_prepared

Unnamed: 0,topic,subtopic,content,url
0,Hipertensi,Pengertian Hipertensi,"Topik: Hipertensi, Subtopik: Pengertian Hipert...",https://www.alodokter.com/hipertensi
1,Hipertensi,Penyebab Hipertensi,"Topik: Hipertensi, Subtopik: Penyebab Hiperten...",https://www.alodokter.com/hipertensi/penyebab
2,Hipertensi,Penyebab Hipertensi: Faktor Risiko Hipertensi,"Topik: Hipertensi, Subtopik: Penyebab Hiperten...",https://www.alodokter.com/hipertensi/penyebab
3,Hipertensi,Gejala Hipertensi,"Topik: Hipertensi, Subtopik: Gejala Hipertensi...",https://www.alodokter.com/hipertensi/gejala
4,Hipertensi,Gejala Hipertensi: Kapan Harus ke Dokter,"Topik: Hipertensi, Subtopik: Gejala Hipertensi...",https://www.alodokter.com/hipertensi/gejala
...,...,...,...,...
641,Broken Heart Syndrome,Pengobatan Broken Heart Syndrome,"Topik: Broken Heart Syndrome, Subtopik: Pengob...",https://www.alodokter.com/broken-heart-syndrome
642,Broken Heart Syndrome,Pengobatan Broken Heart Syndrome: Pemberian ob...,"Topik: Broken Heart Syndrome, Subtopik: Pengob...",https://www.alodokter.com/broken-heart-syndrome
643,Broken Heart Syndrome,Pengobatan Broken Heart Syndrome: Pola hidup s...,"Topik: Broken Heart Syndrome, Subtopik: Pengob...",https://www.alodokter.com/broken-heart-syndrome
644,Broken Heart Syndrome,Komplikasi Broken Heart Syndrome,"Topik: Broken Heart Syndrome, Subtopik: Kompli...",https://www.alodokter.com/broken-heart-syndrome


### Data Saving

In [9]:
# Save the prepared DataFrame to a JSON file as a list of row records.
# The records are built before the file is opened, so a conversion error
# cannot leave behind a truncated output file.
records = df_prepared.to_dict('records')
with open('../data/cvd_prepared.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=4)