# Import Libraries

In [1]:
import json
import pandas as pd

# Data Preparation Functions

### Data Gathering

In [2]:
def read_json(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII text is read the same way on every machine.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)

def load_json_to_dataframe(file_path):
    """Flatten the topic/subtopic JSON file into one DataFrame row per subtopic.

    The JSON is read with ``read_json`` and iterated as a sequence of items,
    each providing a ``'topic'`` value and a ``'subtopics'`` sequence whose
    entries provide ``'subtopic'``, ``'content'`` and ``'subtopic_url'``.

    Args:
        file_path: Path of the JSON file to load.

    Returns:
        A DataFrame with the columns ``topic``, ``subtopic``, ``content``
        and ``url``.
    """
    # One (topic, subtopic, content, url) tuple per subtopic, in file order.
    rows = [
        (entry['topic'], sub['subtopic'], sub['content'], sub['subtopic_url'])
        for entry in read_json(file_path)
        for sub in entry['subtopics']
    ]

    column_names = ('topic', 'subtopic', 'content', 'url')
    return pd.DataFrame({
        name: [row[position] for row in rows]
        for position, name in enumerate(column_names)
    })

### Text Cleaning

In [3]:
def _repair_text(value):
    """Repair one cell: undo Latin-1/UTF-8 mojibake, else normalise NBSPs."""
    if not isinstance(value, str):
        # Missing or non-text cells (None, NaN, numbers) are passed through
        # unchanged instead of raising AttributeError.
        return value
    try:
        # Text whose UTF-8 bytes were decoded as Latin-1 round-trips back to
        # the intended characters; plain ASCII is returned unchanged.
        return value.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not mojibake: only replace non-breaking spaces with regular spaces.
        return value.replace('\u00a0', ' ')


def get_cleaned_text_data(df):
    """Return a copy of ``df`` with cleaned ``topic``/``subtopic``/``content`` text.

    Each string cell in those three columns is re-encoded as Latin-1 and
    decoded as UTF-8. When that round trip fails, the original text is kept
    and its non-breaking spaces (U+00A0) are replaced by regular spaces.
    Other columns are left as they are, and the input frame is not modified.

    The cleaning is applied column-wise, so it does not depend on the index
    labels being unique.

    Args:
        df: DataFrame containing ``topic``, ``subtopic`` and ``content`` columns.

    Returns:
        A new DataFrame with the three text columns cleaned.

    Raises:
        KeyError: If one of the three text columns is missing.
    """
    df_prepared = df.copy()
    for column in ('topic', 'subtopic', 'content'):
        df_prepared[column] = df_prepared[column].map(_repair_text)
    return df_prepared

def get_prepared_content_page(df):
    """Return a copy of ``df`` whose ``content`` is prefixed with topic and subtopic.

    Each ``content`` value becomes
    ``'Topik: <topic>, Subtopik: <subtopic> \\n <content>'``.
    The input frame is left unchanged.

    Args:
        df: DataFrame with ``topic``, ``subtopic`` and ``content`` columns.

    Returns:
        A new DataFrame with the rewritten ``content`` column.
    """
    result = df.copy()
    # Header line naming the topic and subtopic, followed by the separator.
    header = 'Topik: ' + result['topic'] + ', Subtopik: ' + result['subtopic'] + ' \n '
    result['content'] = header + result['content']
    return result

def get_prepared_data(df):
    """Run the full preparation pipeline on ``df``.

    The text columns are cleaned with ``get_cleaned_text_data`` first, then
    the ``content`` column is rebuilt with ``get_prepared_content_page``.

    Args:
        df: DataFrame with ``topic``, ``subtopic`` and ``content`` columns.

    Returns:
        The prepared DataFrame returned by the second step.
    """
    return get_prepared_content_page(get_cleaned_text_data(df))

In [5]:
# Load the raw topic/subtopic data, then clean it and build the final content.
df = load_json_to_dataframe('../data/cvd.json')
df_prepared = df.pipe(get_prepared_data)

# Preview the first five prepared rows.
df_prepared.head(5)

Unnamed: 0,topic,subtopic,content,url
0,Hipertensi,Pengertian Hipertensi,"Topik: Hipertensi, Subtopik: Pengertian Hipert...",https://www.alodokter.com/hipertensi
1,Hipertensi,Penyebab Hipertensi,"Topik: Hipertensi, Subtopik: Penyebab Hiperten...",https://www.alodokter.com/hipertensi/penyebab
2,Hipertensi,Penyebab Hipertensi: Faktor Risiko Hipertensi,"Topik: Hipertensi, Subtopik: Penyebab Hiperten...",https://www.alodokter.com/hipertensi/penyebab
3,Hipertensi,Gejala Hipertensi,"Topik: Hipertensi, Subtopik: Gejala Hipertensi...",https://www.alodokter.com/hipertensi/gejala
4,Hipertensi,Gejala Hipertensi: Kapan Harus ke Dokter,"Topik: Hipertensi, Subtopik: Gejala Hipertensi...",https://www.alodokter.com/hipertensi/gejala


### Data Saving

In [6]:
# Write the prepared rows to disk as a JSON array with one object per row.
with open('../data/cvd_prepared.json', 'w') as f:
    json.dump(obj=df_prepared.to_dict(orient='records'), fp=f, indent=4)