In [9]:
def separate_text(text, symbol, length):
    """Cut ``text`` in two after its first ``length`` ``symbol``-separated pieces.

    The text is split on every occurrence of ``symbol``. The first ``length``
    pieces are re-joined with ``symbol`` to form the head; the remaining
    pieces are re-joined the same way to form the tail.

    Args:
        text: String to divide.
        symbol: Separator used both for splitting and for re-joining.
        length: Number of leading pieces that belong to the head.

    Returns:
        A ``(head, tail)`` tuple of strings. When ``text`` contains
        ``length`` pieces or fewer, the tuple is ``(text, '')``.
    """
    pieces = text.split(symbol)

    # Not enough pieces to cut anything off: the whole text is the head.
    if len(pieces) <= length:
        return text, ''

    head = symbol.join(pieces[:length])
    tail = symbol.join(pieces[length:])
    return head, tail

In [10]:
import csv
import pandas as pd

# Source hadith collection and the destination of the split-up table
hadith_csv = './hadis/musnad_syafii.csv'
output_csv = './output_file.csv'

# Column-oriented accumulator: one list per output column
column_names = ('ID', 'Kitab', 'Sanad', 'Matan', 'Sanad_Arab', 'Matan_Arab')
data = {name: [] for name in column_names}

with open(hadith_csv, newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile, delimiter=';', quotechar='"'):
        id_value, translate_text, arabic_text = row['id'], row['terjemah'], row['arab']

        # Translation: everything before the first colon is the kitab; the
        # remainder is cut at the first double quote into sanad and matan.
        kitab, _, rest = (piece.strip() for piece in translate_text.partition(':'))
        sanad, _, matan = (piece.strip() for piece in rest.partition('"'))

        # Arabic: only the part after the first colon is kept.
        restArab = arabic_text.partition(':')[2].strip()

        # The number of colons found in the translated sanad decides how many
        # colon-separated pieces of the Arabic text are assigned to the sanad.
        sanad_arab, matan_arab = separate_text(restArab, ":", sanad.count(':'))

        record = (id_value, kitab, sanad, matan, sanad_arab, matan_arab)
        for name, value in zip(column_names, record):
            data[name].append(value)

# Build the table from the collected columns
df = pd.DataFrame(data)

display(df)

# Persist the table as a semicolon-separated CSV without the index column
df.to_csv(output_csv, index=False, sep=';')

Unnamed: 0,ID,Kitab,Sanad,Matan,Sanad_Arab,Matan_Arab
0,1,Musnad Syafi'i 1,Imam Abu Abdullah Muhammad bin Idris Asy-Syafi...,"Sesungguhnya kami akan menaiki perahu di laut,...",أَخْبَرَنَا الْإِمَامُ أَبُو عَبْدِ اللهِ مُحَ...,«هُوَ الطَّهُورُ مَاؤُهُ، الْحِلُّ مَيْتَتُهُ»
1,2,Musnad Syafi'i 2,Orang yang terpercaya memberitahukan kepada ka...,"Apabila air mencapai dua qullah, maka ia tidak...",أَنْبَأَنَا الثِّقَةُ، عَنِ الْوَلِيدِ بْنِ كَ...,«إِذَا كَانَ الْمَاءُ قُلَّتَيْنِ لَمْ يَحْمِ...
2,3,Musnad Syafi'i 3,Malik mengabarkan kepada kami dari Abu Az-Zina...,Apabila anjing minum dari wadah seseorang di a...,أَخْبَرَنَا مَالِكٌ، عَنْ أَبِي الزِّنَادِ، عَ...,«إِذَا شَرِبَ الْكَلْبُ مِنْ إِنَاءِ أَحَدِكُ...
3,4,Musnad Syafi'i 4,Sufyan bin Uyainah mengabarkan kepada kami dar...,Apabila seekor anjing menjilat wadah seseorang...,أَخْبَرَنَا سُفْيَانُ بْنُ عُيَيْنَةَ، عَنْ أَ...,«إِذَا وَلَغَ الْكَلْبُ فِي إِنَاءِ أَحَدِكُم...
4,5,Musnad Syafi'i 5,Ibnu Uyainah memberitahukan kepada kami dari A...,Apabila anjing menjilat dalam wadah seseorang ...,أَنْبَأَنَا ابْنُ عُيَيْنَةَ، عَنْ أَيُّوبَ بْ...,«إِذَا وَلَغَ الْكَلْبُ فِي إِنَاءِ أَحَدِكُم...
...,...,...,...,...,...,...
1795,1796,Musnad Syafi'i 1796,Sufyan mengabarkan kepada kami dari Hisyam bin...,"Apakah engkau tidak bermaksud melakukan haji?""...",أَخْبَرَنَا سُفْيَانُ، عَنْ هِشَامِ بْنِ عُرْو...,«أَمَا تُرِيدِينَ الْحَجَّ؟» قَالَتْ: إِنِّي ...
1796,1797,Musnad Syafi'i 1797,Sufyan mengabarkan kepada kami dari Hisyam bin...,"Wahai anak saudara perempuanku, apakah engkau ...",أَخْبَرَنَا سُفْيَانُ، عَنْ هِشَامِ بْنِ عُرْو...,قَالَتْ عَائِشَةُ: «يَا ابْنَ أُخْتِي، هَلْ ت...
1797,1798,Musnad Syafi'i 1798,Ibnu Ulayah mengabarkan kepada kami dari Abu H...,,أَخْبَرَنَا ابْنُ عُلَيَّةَ، عَنْ أَبِي حَمْزَ...,قُلْتُ: «كَانَ أَحَبَّ أَنْ يَكُونَ لِكُلِّ و...
1798,1799,Musnad Syafi'i 1799,"Pamanku, Muhammad bin Ali bin Syafi' mengabark...",Ketika aku bersama Utsman pada tumpukan harta ...,,أَخْبَرَنِي عَمِّي مُحَمَّدُ بْنُ عَلِيِّ بْنِ...


In [30]:
import csv

def extract_sanad_words(file_path, exceptions):
    """Read a word-per-line text file, leaving out the words in ``exceptions``.

    Each line is stripped of surrounding whitespace before it is compared
    with ``exceptions``; the comparison is exact (case-sensitive). A blank
    line therefore yields an empty-string entry unless ``''`` is itself
    listed in ``exceptions``.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        exceptions: Container of words that must not appear in the result.

    Returns:
        A list of the remaining words, in file order, duplicates included.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = [line.strip() for line in file]
    return [entry for entry in stripped_lines if entry not in exceptions]

# Dictionaries whose entries may act as connector words inside a sanad
verba_file = './kamus/verba.txt'
dasar_file = './kamus/dasar.txt'
# Name particles and honorifics that must never be treated as connector words
exceptions = ['bin', 'binti', 'Nabi', 'Rasul', 'Allah', 'Abu', 'Ummu', 'Imam']

verba_sanad_words = extract_sanad_words(verba_file, exceptions)
dasar_sanad_words = extract_sanad_words(dasar_file, exceptions)

# Single lookup set for both dictionaries: membership tests against the raw
# lists would rescan them for every token of every sanad.
sanad_vocabulary = set(verba_sanad_words) | set(dasar_sanad_words)
# Both lists are already filtered, but removing the exceptions here states the
# rule explicitly instead of relying on `and`/`or` precedence in a condition.
sanad_vocabulary.difference_update(exceptions)

input_csv = 'output_file.csv'
output_txt = 'extracted_words.txt'

extracted_words = set()  # Use a set to avoid duplicates

with open(input_csv, 'r', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file, delimiter=';')
    for row in csv_reader:
        # Keep every whitespace-separated token of the sanad that is a known
        # dictionary word.
        extracted_words.update(
            word for word in row['Sanad'].split() if word in sanad_vocabulary
        )

with open(output_txt, 'w', encoding='utf-8') as file:
    # Sorted so that re-running the cell produces a byte-identical file; plain
    # set iteration order for strings changes between interpreter runs.
    file.write('\n'.join(sorted(extracted_words)))

print(f'Extracted words have been written to {output_txt}.')

Extracted words have been written to extracted_words.txt.


In [32]:
import csv
import re

# Read the words from extracted_words.txt
extracted_words_file = 'extracted_words.txt'
with open(extracted_words_file, 'r', encoding='utf-8') as file:
    extracted_words = set(file.read().splitlines())

# Name particles and honorifics that must survive stop-word removal
protected_words = ['bin', 'binti', 'Nabi', 'Rasul', 'Allah', 'Abu', 'Ummu', 'Imam']


def load_word_set(file_path, protected):
    """Return the words of a word-per-line file as a set, minus ``protected``.

    Lines are stripped of surrounding whitespace and blank lines are skipped.
    A word is left out when it equals any entry of ``protected`` ignoring
    case. The comparison ignores case because the stop-word filter applied to
    the sanad looks tokens up through ``word.lower()``; a case-sensitive check
    would let a lowercase dictionary entry such as ``nabi`` remove ``Nabi``.

    Args:
        file_path: Path of the UTF-8 encoded dictionary file.
        protected: Iterable of words that must not end up in the result.

    Returns:
        A set of the remaining words, spelled as in the file.
    """
    protected_lower = {word.lower() for word in protected}
    with open(file_path, 'r', encoding='utf-8') as word_file:
        stripped = (line.strip() for line in word_file)
        return {
            word for word in stripped
            if word and word.lower() not in protected_lower
        }


# Read the Indonesian stop words: the union of all three dictionaries, so that
# every file contributes instead of the last one replacing the others.
stop_words = set()
for dictionary_path in ('./kamus/stop.txt', './kamus/dasar.txt', './kamus/nomina.txt'):
    stop_words |= load_word_set(dictionary_path, protected_words)

# Source table and the enriched copy that gains a "perawi" column
input_csv = 'output_file.csv'
output_csv = 'new_output_file.csv'

with open(input_csv, 'r', encoding='utf-8') as source:
    reader = csv.DictReader(source, delimiter=';')
    # Reading the header happens before the output file is opened.
    output_columns = reader.fieldnames + ['perawi']

    with open(output_csv, 'w', encoding='utf-8', newline='') as target:
        writer = csv.DictWriter(target, fieldnames=output_columns, delimiter=';')
        writer.writeheader()

        for row in reader:
            fragments = []
            inside_match_run = False

            # Each run of consecutive extracted words collapses into a single
            # comma; every other token is kept, preceded by a space.
            for token in row['Sanad'].split():
                if token not in extracted_words:
                    fragments.append(' ' + token)
                    inside_match_run = False
                elif not inside_match_run:
                    fragments.append(',')
                    inside_match_run = True

            # A kept token may itself end in a comma; squeeze repeated commas.
            collapsed = re.sub(r',+', ',', ''.join(fragments))

            # Drop tokens whose lowercase form is a stop word
            kept_tokens = [
                token for token in collapsed.split()
                if token.lower() not in stop_words
            ]

            row['perawi'] = ' '.join(kept_tokens).strip()
            writer.writerow(row)

print('New CSV file has been created: new_output_file.csv.')

New CSV file has been created: new_output_file.csv.
