In [1]:
import pandas as pd
from datetime import date
import PyPDF2
from pdfminer.high_level import extract_text

In [2]:
attendance_start = date(2022,12,19)
current_session = "session_15"

In [4]:
ss = pd.read_csv(f'sessions/{current_session}/{current_session}.csv',parse_dates=['date'], infer_datetime_format=True)
ss.date = pd.to_datetime(ss.date).dt.date
ss = ss[ss.date >= attendance_start]
sessions = ss.session.tolist()
session_date = dict(zip(ss.session,ss.date))

In [5]:
mp = pd.read_csv(f'sessions/{current_session}/mp_session_15.csv') #,usecols=['seat_code','seat','mp'])
mp = mp.iloc[:, :].astype("string")
mp['seat_search'] = ['(' + ''.join(area.split()).lower() + ')' for area in mp.seat.tolist()]

In [6]:
df = pd.DataFrame(columns=['date'] + mp.seat_code.tolist())
df.head()

Unnamed: 0,date,P001,P002,P003,P004,P005,P006,P007,P008,P009,...,P213,P214,P215,P216,P217,P218,P219,P220,P221,P222


### Strategy
- Step 1: Use the phrase **"Senarai Kehadiran"** to find the page where the present list starts
- Step 2: Use the phrase **"Tidak Hadir"** to find the page where the absent list starts
- Step 3: Extract text from these pages, join, and remove anything after the **"tidak hadir"** phrase
- Step 4: Encode everyone as absent; encode as present if in string from Step 3

In [8]:
def find_MP(seat,string): return 1 if seat in string else 0

In [9]:
from tqdm.notebook import tqdm

for session in tqdm(sessions):
    #print(session)
    pdf_active = PyPDF2.PdfReader(open(f'src_hansard/{current_session}/hansard_' + session + '.pdf', 'rb', ),strict=False)
    n_pages = len(pdf_active.pages)
    extract_start = 0
    start_set = 0
    extract_end = 0
    # create a for loop to find start and end page for MPs attendance
    for page in range(n_pages):
        page_active = ''.join(pdf_active.pages[page].extract_text().split()).lower()
        if start_set == 0 and ('senaraikehadiran' in page_active or 'ahliyanghadir' in page_active):
            extract_start = page
            start_set = 1 # ensure first instance is taken and frozen
        if 'yangtidakhadir' in page_active: extract_end = page
        if extract_start > 0 and extract_end > 0: break # break the moment we find the end of the section

    res = extract_text(f'src_hansard/{current_session}/hansard_' + session + '.pdf',page_numbers=[x for x in range(extract_start,extract_end+1)])
    res = ''.join(res.split()).lower()
    res = res.replace('(johorbaru)','(johorbahru)')
    hadir = res.split('yangtidakhadir')[0] #only get list name that attend

    # find MP attendance
    attendance = [find_MP(area,hadir) for area in mp.seat_search.tolist()]
    # add date and attendance by seats
    df.loc[len(df)] = [session_date[session]] + attendance

  0%|          | 0/33 [00:00<?, ?it/s]

In [10]:
df = df.set_index('date').transpose()
df['total'] = df.sum(axis=1)
session_dates = list(df.columns)
df = df.reset_index().rename(columns={'index':'seat_code'})
df = pd.merge(df,mp,on=['seat_code'],how='left')
df = df[['seat_code','seat','mp'] + session_dates] # reorder the column

In [11]:
df.head(2)

Unnamed: 0,seat_code,seat,mp,2022-12-19,2022-12-20,2023-02-13,2023-02-14,2023-02-15,2023-02-16,2023-02-20,...,2023-03-21,2023-03-22,2023-03-23,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-04-03,2023-04-04,total
0,P001,Padang Besar,RUSHDAN BIN RUSMI,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,33
1,P002,Kangar,ZAKRI BIN HASSAN,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,33


In [12]:
df.to_csv(f'sessions/{current_session}/attendance_{current_session}.csv',index=False)

### Analysis for absence

In [13]:
x_df = df.drop("total", axis=1).replace(0, 2)
x_df.iloc[:, 3:] = x_df.iloc[:,3:] - 1
x_df['total'] = x_df.sum(axis=1)

  x_df['total'] = x_df.sum(axis=1)


In [14]:
x_df.head(5)

Unnamed: 0,seat_code,seat,mp,2022-12-19,2022-12-20,2023-02-13,2023-02-14,2023-02-15,2023-02-16,2023-02-20,...,2023-03-21,2023-03-22,2023-03-23,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-04-03,2023-04-04,total
0,P001,Padang Besar,RUSHDAN BIN RUSMI,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,P002,Kangar,ZAKRI BIN HASSAN,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,P003,Arau,SHAHIDAN BIN KASSIM,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,P004,Langkawi,MOHD SUHAIMI BIN ABDULLAH,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,P005,Jerlun,ABDUL GHANI BIN AHMAD,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7


In [15]:
x_df.to_csv(f'sessions/{current_session}/absence_{current_session}.csv',index=False)