In [13]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# the base url and the page parameter
base_url = 'http://e-masjid.jais.gov.my/index.php/ekhutbah'
page_param = '?page='

# create an empty list to store the results
khutbah_list = []

# loop through all the pages (1 to 228)
for i in range(1, 229):
    # make a request to the page
    response = requests.get(base_url + page_param + str(i))
    # parse the html content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # find the table with id "pemohon"
    table = soup.find('table', {'id': 'pemohon'})
    # loop through all the rows in the table (except the header row)
    for row in table.find_all('tr')[1:]:
        # extract the data from each cell
        cells = row.find_all('td')
        bil = cells[0].text.strip()
        # tarikh = pd.to_datetime(cells[1].text.strip(), format='%d-%m-%Y') # convert to datetime format
        tarikh = cells[1].text.strip()
        tajuk = cells[2].text.strip()
        versi = cells[3].text.strip()
        fail = cells[4].find('a')['href'].strip()
        # create a dictionary of the data
        khutbah = {
            'bil': bil,
            'tarikh': tarikh,
            'tajuk': tajuk,
            'versi': versi,
            'fail': fail
        }
        # add the dictionary to the list
        khutbah_list.append(khutbah)

# create a pandas dataframe from the list of dictionaries
df = pd.DataFrame(khutbah_list)

# extract year from tarikh column
df['year'] = df['tarikh'].str[-4:]

# format the fail column with the full URL
prefix = 'http://e-masjid.jais.gov.my/'
df['fail'] = prefix + df['fail'].str[2:]

# print the dataframe
df


Unnamed: 0,bil,tarikh,tajuk,versi,fail,year
0,1,07-04-2023,HIKMAH BULAN RAMADHAN,Rumi,http://e-masjid.jais.gov.my//uploads/uploads/0...,2023
1,2,07-04-2023,HIKMAH BULAN RAMADHAN,Jawi,http://e-masjid.jais.gov.my//uploads/uploads/0...,2023
2,3,07-04-2023,THE WISDOM OF RAMADAAN,English,http://e-masjid.jais.gov.my//uploads/uploads/H...,2023
3,4,07-04-2023,THE WISDOM OF RAMADAAN,Multimedia,http://e-masjid.jais.gov.my//uploads/uploads/H...,2023
4,5,07-04-2023,THE WISDOM OF RAMADAAN PDF,Multimedia,http://e-masjid.jais.gov.my//uploads/uploads/S...,2023
...,...,...,...,...,...,...
4538,4539,14-01-2011,PERANAN ULAMA,Multimedia,http://e-masjid.jais.gov.my//uploads/uploads/K...,2011
4539,4540,07-01-2011,MENCARI MAKANAN YANG HALAL ADALAH WAJIB,Rumi,http://e-masjid.jais.gov.my//uploads/uploads/K...,2011
4540,4541,07-01-2011,MENCARI MAKANAN YANG HALAL ADALAH WAJIB,Jawi,http://e-masjid.jais.gov.my//uploads/uploads/K...,2011
4541,4542,07-01-2011,MENCARI MAKANAN YANG HALAL ADALAH WAJIB,Multimedia,http://e-masjid.jais.gov.my//uploads/uploads/K...,2011


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4543 entries, 0 to 4542
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   bil     4543 non-null   object
 1   tarikh  4543 non-null   object
 2   tajuk   4543 non-null   object
 3   versi   4543 non-null   object
 4   fail    4543 non-null   object
dtypes: object(5)
memory usage: 177.6+ KB


In [29]:
import os
import re
import requests
from datetime import datetime

# Define function to create folder structure
def create_folder_structure():
    for year in range(2011, 2024):
        year_folder = str(year)
        os.makedirs(os.path.join(year_folder, 'BM'))
        os.makedirs(os.path.join(year_folder, 'English'))

# Create folder structure
create_folder_structure()

# Filter the DataFrame to only include rows where VERSI KHUTBAH is 'Rumi' or 'English'
df_filtered = df[df['versi'].isin(['Rumi', 'English'])]

# Iterate over the rows in the filtered DataFrame
for i, (index, row) in enumerate(df_filtered.iterrows(), 1):
    # Check if the 'Fail Khutbah' column contains a PDF link
    if '.pdf' in row['fail']:
        # Download the PDF file and save it to disk
        pdf_url = row['fail']
        response = requests.get(pdf_url)
        # Extract the language from the 'versi' column
        language = row['versi']
        # Create a filename based on the 'tarikh', 'tajuk', and 'versi' columns
        # Remove any symbol characters from the 'tajuk' column using regular expressions
        tajuk = re.sub(r'[^\w\s]', '', row['tajuk'])
        # Convert the 'tarikh' string to a datetime object
        tarikh = datetime.strptime(row['tarikh'], '%d-%m-%Y')
        filename = f"{tarikh.year}_{tarikh.month}_{tarikh.day}_{tajuk}_{row['versi']}.pdf"
        # Save the file to the appropriate folder
        if language == 'Rumi':
            folder = os.path.join(str(tarikh.year), 'BM')
        else:
            folder = os.path.join(str(tarikh.year), 'English')
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(response.content)
        # Print progress
        print(f"File {i} downloaded and saved to {os.path.join(folder, filename)}")


File 1 downloaded and saved to 2023\BM\2023_4_7_HIKMAH BULAN RAMADHAN_Rumi.pdf
File 2 downloaded and saved to 2023\English\2023_4_7_THE WISDOM OF RAMADAAN_English.pdf
File 3 downloaded and saved to 2023\BM\2023_3_31_PUASA MENINGKATKAN KETAKWAAN UMAT ISLAM_Rumi.pdf
File 4 downloaded and saved to 2023\English\2023_3_31_FASTING INCREASES THE TAQWA OF THE MUSLIM UMMAH_English.pdf
File 5 downloaded and saved to 2023\BM\2023_3_24_LGBT TABIAT SONGSANG_Rumi.pdf
File 6 downloaded and saved to 2023\English\2023_3_24_LGBT IS A PERVERTED HABIT_English.pdf
File 7 downloaded and saved to 2023\BM\2023_3_17_EKSTREMIS AGAMA_Rumi.pdf
File 8 downloaded and saved to 2023\English\2023_3_17_RELIGIOUS EXTREMISTS_English.pdf
File 9 downloaded and saved to 2023\BM\2023_3_10_TAULIAH MENGAJAR SUATU KEPERLUAN_Rumi.pdf
File 10 downloaded and saved to 2023\English\2023_3_10_TEACHING CREDENTIAL IS A NECESSITY_English.pdf
File 11 downloaded and saved to 2023\BM\2023_3_3_AJARAN SEMPURNA ANAKANAK TERPELIHARA_Rumi.pdf
F

In [30]:
df_filtered.shape

(1270, 6)