In [1]:
import pandas as pd
import numpy as np

In [None]:
# Read the two CSV exports: the paragraph-level table that is labeled in this
# notebook, and the companion "other" material table.
# NOTE(review): the paragraph CSV path is the same path the "Final Save" cell
# writes to later in this notebook — confirm that round-trip is intended.
paragraph_csv_path = 'labeled_paskah_final_bastian.csv'
table_csv_path = 'material_book_1_other.csv'

df_data_paragraph = pd.read_csv(paragraph_csv_path)
df_data_table = pd.read_csv(table_csv_path)

# Preview the first five paragraph rows.
df_data_paragraph.head(n=5)

In [None]:
# Remove the "Unnamed: 0" column (presumably the index column written by an
# earlier to_csv call — confirm).
# errors="ignore" keeps this cell re-runnable: without it, a second execution
# raises KeyError because the column has already been dropped.
df_data_paragraph.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

In [None]:
# Preview the first five rows after the column drop.
df_data_paragraph.head(n=5)

In [None]:
# Create (or reset) the two labeling columns: 'label_id' starts at 0 and
# 'label' starts empty for every row.
# NOTE(review): if the loaded CSV already contains these columns, their values
# are overwritten here.
initial_label_values = {'label_id': 0, 'label': None}
for column_name, initial_value in initial_label_values.items():
    df_data_paragraph[column_name] = initial_value
df_data_paragraph.head(n=5)

### Sermon Labeling (automatic)
Use text patterns to distinguish sermons from announcements:
- A sermon always starts with the title "Khotbah" in the documents (matched in the code as upper-case "KHOTBAH").
- A sermon ends with a bracketed term such as '[AWi]' (matched with a regex).

In [None]:
# Locate the rows that delimit each sermon.
import re

# End marker: the whole paragraph is a bracketed tag such as "[AWi]".
# Raw string: the previous non-raw literal relied on invalid escape sequences
# ("\[", "\w", "\s"), which newer Python versions warn about. The compiled
# pattern text is unchanged.
SERMON_END_PATTERN = re.compile(r"^\[\w{1,3}\s*\w{1,3}]$")

# Start marker: any paragraph whose text contains "KHOTBAH".
# na=False treats missing text as "no match" so the mask stays purely boolean.
is_sermon_start = df_data_paragraph.VALUE.str.contains("KHOTBAH", na=False)

# Non-string cells (e.g. NaN) cannot match and are skipped instead of raising
# TypeError inside re.match.
is_sermon_end = df_data_paragraph.VALUE.apply(
    lambda text: isinstance(text, str) and bool(SERMON_END_PATTERN.match(text))
)

marker_start_shermon = df_data_paragraph.loc[is_sermon_start]
marker_end_shermon = df_data_paragraph.loc[is_sermon_end]

In [None]:
# Pair every sermon start marker with the first end marker that comes after it.
# Each entry is [start_index, end_index]; the end is omitted ([start_index])
# when no end marker follows that start.
end_marker_indices = list(marker_end_shermon.index)

lst_sermon_idx = []
for start_index in marker_start_shermon.index:
    first_end_after = next(
        (end_index for end_index in end_marker_indices if end_index > start_index),
        None,
    )
    if first_end_after is None:
        lst_sermon_idx.append([start_index])
    else:
        lst_sermon_idx.append([start_index, first_end_after])
lst_sermon_idx

In [None]:
# Mark every paragraph between a sermon start marker and its end marker as
# Sermon (label_id 3). The range is half-open: the end-marker row itself is
# not labeled.
for sermon_bounds in lst_sermon_idx:
    if len(sermon_bounds) < 2:
        # A start marker with no following end marker has no second element;
        # indexing [1] here used to raise IndexError.
        print(f"Skipping sermon start {sermon_bounds[0]}: no end marker found")
        continue
    sermon_start, sermon_end = sermon_bounds[0], sermon_bounds[1]
    # One mask per sermon instead of one full-index comparison per row.
    in_sermon = df_data_paragraph.index.isin(range(sermon_start, sermon_end))
    df_data_paragraph.loc[in_sermon, 'label_id'] = 3
df_data_paragraph.loc[(df_data_paragraph['label_id'] == 3)]

In [None]:
# Give every row carrying label_id 3 the text label 'Sermon', then show them.
sermon_rows = df_data_paragraph['label_id'].eq(3)
df_data_paragraph.loc[sermon_rows, 'label'] = 'Sermon'
df_data_paragraph.loc[sermon_rows]

### Worship Labeling
Semi-automatic labeling based on the paragraph style attributes (`style_left`, `style_hanging`).

In [None]:
# Count rows per (style_left, style_hanging) combination and show the 20 most
# frequent combinations.
style_columns = ['style_left', 'style_hanging']
style_pair_counts = df_data_paragraph.groupby(style_columns).size()
style_pair_counts.nlargest(20)

In [None]:
# Look up a known worship lyric to inspect the style values of its row(s).
# Combinations noted during inspection (presumably style_left - style_hanging; confirm):
# 993 - 426
# 426 - 426
worship_lyric = 'jangan sesat dengar sabda-Nya'
lyric_rows = df_data_paragraph.VALUE.str.contains(worship_lyric)
df_data_paragraph.loc[lyric_rows]

In [None]:
# Auto-label Worship (label_id 4): style_hanging equals '426' and style_left
# is either '426' or '993'. Both columns are compared against string values.
hanging_is_426 = df_data_paragraph['style_hanging'].eq('426')
left_is_426_or_993 = df_data_paragraph['style_left'].isin(['426', '993'])
df_data_paragraph.loc[hanging_is_426 & left_is_426_or_993, 'label_id'] = 4

# Attach the text label to every row that now carries label_id 4.
worship_rows = df_data_paragraph['label_id'].eq(4)
df_data_paragraph.loc[worship_rows, 'label'] = 'Worship'

In [None]:
# Show every row currently labeled with label_id 4 (Worship).
is_worship = df_data_paragraph['label_id'].eq(4)
df_data_paragraph.loc[is_worship]

### Checking & Labeling each rows
Checking manually by row index.

In [None]:
# Find the row(s) whose text contains this sentence, to learn their index.
search_text = 'Apakah relevansi nilai kemanusiaan baru Yesus bagi sikap tindakan Gereja di tengah krisis pandemik'
matching_rows = df_data_paragraph.VALUE.str.contains(search_text)
df_data_paragraph.loc[matching_rows]

In [None]:
# Inspect the single row whose index label is 3032.
is_target_row = df_data_paragraph.index == 3032
df_data_paragraph.loc[is_target_row]

### Announcement Checking & Labeling
With Label_id = 1 and label = Announcement

In [None]:
# Every row still carrying the initial label_id 0 defaults to label_id 1.
still_unlabeled = df_data_paragraph['label_id'].eq(0)
df_data_paragraph.loc[still_unlabeled, 'label_id'] = 1

In [None]:
# Attach the text label 'Announcement' to every row with label_id 1.
announcement_rows = df_data_paragraph['label_id'].eq(1)
df_data_paragraph.loc[announcement_rows, 'label'] = 'Announcement'

### Prayer Checking & Labeling
With Label_id = 2 and label = Prayer

In [None]:
# Manual Prayer labeling (label_id 2).
# Previously only the last range was applied by code; the other ranges existed
# only as comments and had to be entered by hand and re-run one at a time, so
# a fresh "Restart & Run All" lost them. All documented ranges are now data.
# Each tuple is (first_index, last_index), both inclusive.
PRAYER_INDEX_RANGES = [
    # Liturgy material
    (689, 696),    # Ash Wednesday <Votum & Greeting?> <p. 132>
    (709, 711),    # Ash Wednesday <Confession of Sins> <p. 133>
    (801, 806),    # Pre-Easter Sunday I <Votum & Greeting?> <p. 140>
    (817, 819),    # Pre-Easter Sunday I <Confession of Sins> <p. 141>
    (822, 823),    # Pre-Easter Sunday I <Confession of Sins> <p. 142>
    (825, 826),    # Pre-Easter Sunday I <Confession of Sins> <p. 142>
    (878, 879),    # Pre-Easter Sunday I <Blessing> <p. 146>
    (891, 896),    # Pre-Easter Sunday II <Votum & Greeting?> <p. 148>
    (903, 904),    # Pre-Easter Sunday II <Confession of Sins> <p. 149>
    (962, 964),    # Pre-Easter Sunday II <Blessing> <p. 152>
    (983, 988),    # Pre-Easter Sunday III <Votum & Greeting?> <p. 154>
    (999, 1001),   # Pre-Easter Sunday III <Confession of Sins> <p. 155>
    (1066, 1067),  # Pre-Easter Sunday III <Offertory Prayer> <p. 159>
    (1086, 1088),  # Pre-Easter Sunday III <Blessing> <p. 160-161>
    (1112, 1117),  # Pre-Easter Sunday IV <Votum & Greeting?> <p. 164>
    (1126, 1135),  # Pre-Easter Sunday IV <Confession of Sins> <p. 164-165>
    (1205, 1207),  # Pre-Easter Sunday IV <Offertory Prayer> <p. 168>
    (1243, 1248),  # Pre-Easter Sunday V <Votum & Greeting?> <p. 172>
    (1264, 1264),  # Pre-Easter Sunday V <Litany of Confession> <p. 173-174>
    (1268, 1268),  # Pre-Easter Sunday V <Litany of Confession> <p. 173-174>
    (1272, 1272),  # Pre-Easter Sunday V <Litany of Confession> <p. 173-174>
    (1334, 1335),  # Pre-Easter Sunday V <Blessing> <p. 179>
]

for idx_start, idx_last in PRAYER_INDEX_RANGES:
    idx_end = idx_last + 1  # exclusive upper bound, as in the original range()
    in_range = df_data_paragraph.index.isin(range(idx_start, idx_end))
    df_data_paragraph.loc[in_range, 'label_id'] = 2
df_data_paragraph.loc[(df_data_paragraph['label_id'] == 2), 'label'] = 'Prayer'

# Spot check: idx_start now holds the start of the last range (1334).
df_data_paragraph.loc[(df_data_paragraph.index == idx_start+1)]

### Sermon Checking & Labeling
With label_id = 3 and label = Sermon

In [None]:
# Manual Sermon labeling (label_id 3).
# Previously only the last range was applied by code; the other ranges existed
# only as comments and had to be entered by hand and re-run one at a time, so
# a fresh "Restart & Run All" lost them. All documented ranges are now data.
# Each tuple is (first_index, last_index), both inclusive.
SERMON_INDEX_RANGES = [
    # Sermon material
    (87, 116),     # Introduction <p. 1-5>
    (130, 153),    # Ash Wednesday, 17 Feb <p. 11-18>
    (169, 202),    # Pre-Easter Sunday I <p. 19-30>
    (244, 265),    # Pre-Easter Sunday II <p. 31-38>
    (282, 307),    # Pre-Easter Sunday III <p. 39-48>
    (334, 354),    # Pre-Easter Sunday IV <p. 49-60>
    (388, 409),    # Pre-Easter Sunday V <p. 61-69>
    # Children's material (BA)
    (2113, 2123),  # BA 1 <p. 231-235>
    (2136, 2144),  # BA 1, early-childhood (PAUD) class <p. 235-237>
    (2153, 2160),  # BA 1, younger class <p. 237-239>
    (2168, 2176),  # BA 1, older class <p. 239-240>
    (2194, 2211),  # BA 2 <p. 245-247>
    (2224, 2233),  # BA 2, early-childhood (PAUD) class <p. 248-249>
    (2243, 2253),  # BA 2, younger class <p. 249-251>
    (2262, 2272),  # BA 2, older class <p. 252-253>
    (2284, 2295),  # BA 3 <p. 257-259>
    (2309, 2318),  # BA 3, early-childhood (PAUD) class <p. 260-261>
    (2329, 2338),  # BA 3, younger class <p. 262-263>
    (2349, 2357),  # BA 3, older class <p. 264-265>
    # Bible study material (PA)
    (2477, 2493),  # PA 1 <p. 285-287>
    (2500, 2515),  # PA 2 <p. 289-291>
    (2537, 2552),  # PA 3 <p. 293-296>
    # Prayer fellowship material (PD)
    (2701, 2713),  # PD 1 <p. 316-318>
    (2740, 2748),  # PD 2 <p. 322-324>
    (2782, 2784),  # PD 3 <p. 328>
    (2789, 2795),  # PD 3 <p. 328-329>
    # Discussion forum (Sarasehan) material
    (3032, 3086),  # Sarasehan <p. 351-369>
]

for idx_start, idx_last in SERMON_INDEX_RANGES:
    idx_end = idx_last + 1  # exclusive upper bound, as in the original range()
    in_range = df_data_paragraph.index.isin(range(idx_start, idx_end))
    df_data_paragraph.loc[in_range, 'label_id'] = 3
df_data_paragraph.loc[(df_data_paragraph['label_id'] == 3), 'label'] = 'Sermon'

# Spot check: idx_start now holds the start of the last range (3032).
df_data_paragraph.loc[(df_data_paragraph.index == idx_start+1)]

### Worship Checking & Labeling
With label_id = 4 and label = Worship

In [None]:
# Manual Worship labeling (label_id 4) for the liturgy and sermon material.
# Previously only the last range was applied by code; the other ranges existed
# only as comments and had to be entered by hand and re-run one at a time, so
# a fresh "Restart & Run All" lost them. All documented ranges are now data.
# Each tuple is (first_index, last_index), both inclusive.
WORSHIP_INDEX_RANGES = [
    (383, 385),    # Sermon material (Easter season, Sunday IV) <p. 59>
    # Liturgy material
    (677, 688),    # Ash Wednesday <p. 131>
    (700, 708),    # Ash Wednesday <p. 132>
    (727, 731),    # Ash Wednesday <p. 134>
    (758, 769),    # Ash Wednesday <p. 135-136>
    (774, 786),    # Ash Wednesday <p. 136>
    (796, 800),    # Pre-Easter Sunday I <p. 140>
    (810, 816),    # Pre-Easter Sunday I <p. 141>
    (820, 821),    # Pre-Easter Sunday I <p. 142>
    (824, 824),    # Pre-Easter Sunday I <p. 142>
    (827, 827),    # Pre-Easter Sunday I <p. 142>
    (831, 836),    # Pre-Easter Sunday I <p. 143>
    (863, 867),    # Pre-Easter Sunday I <p. 144-145>
    (872, 875),    # Pre-Easter Sunday I <p. 145>
    (881, 883),    # Pre-Easter Sunday I <p. 146>
    (888, 890),    # Pre-Easter Sunday II <p. 147>
    (900, 902),    # Pre-Easter Sunday II <p. 148>
    (906, 910),    # Pre-Easter Sunday II <p. 149>
    (916, 919),    # Pre-Easter Sunday II <p. 149-150>
    (946, 954),    # Pre-Easter Sunday II <p. 151>
    (959, 961),    # Pre-Easter Sunday II <p. 152>
    (973, 982),    # Pre-Easter Sunday III <p. 154>
    (995, 998),    # Pre-Easter Sunday III <p. 155>
    (1003, 1005),  # Pre-Easter Sunday III <p. 156>
    (1014, 1022),  # Pre-Easter Sunday III <p. 156-157>
    (1057, 1062),  # Pre-Easter Sunday III <p. 158-159>
    (1073, 1077),  # Pre-Easter Sunday III <p. 160>
    (1104, 1111),  # Pre-Easter Sunday IV <p. 163>
    (1122, 1124),  # Pre-Easter Sunday IV <p. 164>
    (1137, 1142),  # Pre-Easter Sunday IV <p. 165>
    (1151, 1153),  # Pre-Easter Sunday IV <p. 165-166>
    (1195, 1203),  # Pre-Easter Sunday IV <p. 167>
    (1211, 1223),  # Pre-Easter Sunday IV <p. 168>
    (1233, 1234),  # Pre-Easter Sunday IV <p. 169>
    (1239, 1242),  # Pre-Easter Sunday V <p. 171-172>
    (1253, 1257),  # Pre-Easter Sunday V <p. 173>
    (1262, 1263),  # Pre-Easter Sunday V, Psalm 31 <p. 173-174>
    (1266, 1267),  # Pre-Easter Sunday V, Psalm 31 <p. 173-174>
    (1270, 1271),  # Pre-Easter Sunday V, Psalm 31 <p. 173-174>
    (1274, 1275),  # Pre-Easter Sunday V, Psalm 31 <p. 173-174>
    (1284, 1288),  # Pre-Easter Sunday V <p. 175>
    (1292, 1293),  # Pre-Easter Sunday V <p. 176>
    (1318, 1321),  # Pre-Easter Sunday V <p. 177>
    (1327, 1331),  # Pre-Easter Sunday V <p. 178>
    # Adult fellowship material
    (2524, 2536),  # PA 3 <p. 293>
    # Prayer fellowship material (PD)
    (2690, 2693),  # PD 1 <p. 315>
    (2696, 2700),  # PD 1 <p. 315>
    (2715, 2716),  # PD 1 <p. 318>
    (2722, 2724),  # PD 1 <p. 319>
    (2728, 2732),  # PD 2 <p. 321>
    (2735, 2738),  # PD 2 <p. 321>
    (2750, 2753),  # PD 2 <p. 324>
    (2758, 2762),  # PD 2 <p. 325>
    (2766, 2774),  # PD 3 <p. 327>
    (2777, 2780),  # PD 3 <p. 328>
    (2785, 2788),  # PD 3 <p. 328>
    (2797, 2802),  # PD 3 <p. 330>
    (2807, 2816),  # PD 4 <p. 331>
]

for idx_start, idx_last in WORSHIP_INDEX_RANGES:
    idx_end = idx_last + 1  # exclusive upper bound, as in the original range()
    in_range = df_data_paragraph.index.isin(range(idx_start, idx_end))
    df_data_paragraph.loc[in_range, 'label_id'] = 4
df_data_paragraph.loc[(df_data_paragraph['label_id'] == 4), 'label'] = 'Worship'

# Spot check: idx_start now holds the start of the last range (2807).
df_data_paragraph.loc[(df_data_paragraph.index == idx_start + 1)]

In [None]:
# Temporary Save
# df_data_paragraph.to_csv('labeled_paskah_partial1.csv')

### Slicing Data
--- Menjadi Manusia Paska 2021---
Bahan Bastian:
Bahan Dasar	 1

Bahan Kotbah 
Rabu Abu [17 Februari 2021]	11
Minggu I Pra Paska [21 Februari 2021]	19
Minggu II Pra Paska [28 Februari 2021]	31
Minggu III Pra Paska [7 Maret 2021]	39
Minggu IV Pra Paska [14 Maret 2021]	49
Minggu V Pra Paska [21 Maret 2021]	61

Bahan Liturgi 
Rabu Abu [17 Februari 2021]	131
Minggu I Pra Paska [21 Februari 2021]	139
Minggu II Pra Paska [28 Februari 2021]	147
Minggu III Pra Paska [7 Maret 2021]	153
Minggu IV Pra Paska [14 Maret 2021]	163
Minggu V Pra Paska [21 Maret 2021]	171

Bahan Anak 
Bahan Anak 1 	231
Bahan Anak 2	245
Bahan Anak 3	 257

Bahan PA Dewasa
Bahan PA Dewasa 1 	285
Bahan PA Dewasa 2	289
Bahan PA Dewasa 3	 293

Bahan Persekutuan Doa 
Bahan Persekutuan Doa 1 	315
Bahan Persekutuan Doa 2	 321
Bahan Persekutuan Doa 3	 327

Bahan Sarasehan	 351

In [None]:
# Find the row(s) whose text contains this sentence, to learn their index.
search_text = 'Pertanyaan dapat ditambahi, diolah atau dikembangkan sesuai dengan situasi dan kondisi jemaat'
matching_rows = df_data_paragraph.VALUE.str.contains(search_text)
df_data_paragraph.loc[matching_rows]

In [None]:
# Inspect the single row whose index label is 3088.
is_target_row = df_data_paragraph.index == 3088
df_data_paragraph.loc[is_target_row]

In [None]:
# NOTE(review): the start (2688) is greater than the stop (1338), so this
# positional slice is always empty. The slicing cell below uses iloc[675:1338]
# and iloc[2688:2818]; one of those was probably intended — confirm.
df_data_paragraph.iloc[2688:1338]

In [None]:
# Positional (iloc) row windows kept for the final export; stops are exclusive.
# Presumably one window per section listed in the "Slicing Data" notes — confirm.
section_bounds = [
    (86, 130),     # Introduction ("Pengantar")
    (130, 426),
    (675, 1338),
    (2112, 2374),
    (2476, 2557),
    (2688, 2818),
    (3032, 3089),
]
df_1, df_2, df_3, df_4, df_5, df_6, df_7 = [
    df_data_paragraph.iloc[first:stop] for first, stop in section_bounds
]

In [None]:
# Stack the section slices, in order, into one frame.
lst_df = [
    df_1,
    df_2,
    df_3,
    df_4,
    df_5,
    df_6,
    df_7,
]
df_final_sliced = pd.concat(objs=lst_df)

In [None]:
# Write the sliced frame to CSV, including the index as the first column.
df_final_sliced.to_csv(
    path_or_buf='book_paskah_labeled_bastian.csv',
    index=True,
)

In [2]:
# Read the sliced export back to check what was written.
sliced_csv_path = 'book_paskah_labeled_bastian.csv'
df_test = pd.read_csv(sliced_csv_path)
df_test

Unnamed: 0.1,Unnamed: 0,TYPE,VALUE,style_type,style_left,style_hanging,numPr_type,numPr_ilvl,numPr_id,label_id,label
0,86,text,Pengantar,[],[],[],[],[],[],1,Announcement
1,87,text,"Bagi orang Kristen, hidup dimulai dari kebangk...",CT_Ind,[],[],[],[],[],3,Sermon
2,88,text,Kristus telah melewati kematian agar manusia b...,CT_Ind,[],[],[],[],[],3,Sermon
3,89,text,Gereja adalah persekutuan kebangkitan. Para mu...,CT_Ind,[],[],[],[],[],3,Sermon
4,90,text,"Melalui tema ""Menjadi Manusia Paska"" ini, LPP ...",CT_Ind,[],[],[],[],[],3,Sermon
5,91,text,"Covid, Allah yang Kalah?",[],[],[],[],[],[],3,Sermon
6,92,text,"Saat bahan dasar ini dibuat, seluruh warga dun...",[],[],[],[],[],[],3,Sermon
7,93,text,Covid menjadikan rasa damai hilang. Hilangnya ...,CT_Ind,[],[],[],[],[],3,Sermon
8,94,text,Situasi krisis ini melahirkan pertanyaan: apa ...,CT_Ind,[],[],[],[],[],3,Sermon
9,95,text,Peristiwa salib mengajak kita menghayati peris...,CT_Ind,[],[],[],[],[],3,Sermon


In [None]:
# Inspect the single row whose index label is 881.
# (Disabled alternative: reload the partial save instead.)
# df_data_paragraph2 = pd.read_csv('labeled_paskah_partial1.csv')
is_target_row = df_data_paragraph.index == 881
df_data_paragraph.loc[is_target_row]

In [None]:
# Final save of the fully labeled frame (the index is written as well, since
# `index` is left at its default).
# NOTE(review): this is the same path the load cell at the top reads from, so
# running this cell overwrites the notebook's own input — confirm intended.
df_data_paragraph.to_csv(path_or_buf='labeled_paskah_final_bastian.csv')

In [None]:
# Reload the final labeled CSV into a separate frame for verification.
final_csv_path = 'labeled_paskah_final_bastian.csv'
df_data_paragraph_final = pd.read_csv(final_csv_path)

In [None]:
# Show the reloaded rows labeled with label_id 4 (Worship).
is_worship_final = df_data_paragraph_final['label_id'].eq(4)
df_data_paragraph_final.loc[is_worship_final]