In [2]:
from tabula import read_pdf
import tabula
import os, glob, re
import pandas as pd
import numpy as np

# Raise the display limit so tables of up to 300 rows are shown without truncation.
pd.options.display.max_rows = 300

### Ścieżki

In [49]:
# Directories that hold the input PDF files, resolved against the current
# working directory (so the notebook must be started from the project root).
_cwd = os.getcwd()
FIRST_1, FIRST_2, SECOND_1 = (
    os.path.join(_cwd, subdir)
    for subdir in ("1st/1_round/", "1st/2_round/", "2nd/1_round/")
)

# Processing order used by the extraction step.
paths = [FIRST_1, FIRST_2, SECOND_1]

In [50]:
def organize_columns(df):
    """Drop the bookkeeping columns and make the points column numeric.

    Args:
        df: Table containing at least the columns 'lp.', 'Decyzja' and
            'Liczba punktów'.

    Returns:
        A new DataFrame without 'lp.' and 'Decyzja', in which
        'Liczba punktów' is converted to float (commas are replaced by dots
        first, so decimal-comma strings parse). The input frame is not
        modified.
    """
    points_col = 'Liczba punktów'
    trimmed = df.drop(columns=['lp.', 'Decyzja'])
    # regex=True makes the replacement act on substrings of string values;
    # non-string values pass through unchanged.
    trimmed[points_col] = (
        trimmed[points_col]
        .replace(to_replace=',', value='.', regex=True)
        .astype(float)
    )
    return trimmed

In [51]:
def extract_sex(str_in: str):
    """Classify a first name by its final letter.

    Args:
        str_in: First name to classify.

    Returns:
        "K" when the name ends with a lowercase "a", otherwise "M".
        This is a heuristic on spelling only; names that break the pattern
        are classified by their last letter all the same.
    """
    ends_with_a = re.search("a$", str_in) is not None
    if ends_with_a:
        return "K"
    return "M"

In [52]:
def extract_paragraph(str_in: str):
    """Split a trailing asterisk marker off a surname.

    Args:
        str_in: Surname as read from the table, optionally ending in "*"
            or "**".

    Returns:
        A ``(surname, marker)`` tuple. ``marker`` is 2 when the input ends
        with "**", 1 when it ends with a single "*", and 0 otherwise;
        ``surname`` is the input with that trailing marker removed (the
        input is returned unchanged when there is no marker).
    """
    # Raw string: "\*" inside a plain literal is an invalid escape sequence.
    marker = re.search(r"\*{1,2}$", str_in)
    if marker is None:
        return str_in, 0
    # Cut at the start of the match so that exactly the matched asterisks are
    # removed; a fixed [:-2] slice would also eat the last letter of a surname
    # that carries only one asterisk.
    return str_in[:marker.start()], len(marker.group())

### Ekstrakcja

In [53]:
# Values written to the 'Stopień' and 'nabór' columns for each input directory.
# Any directory not listed here gets (2, 1).
stage_by_path = {FIRST_1: (1, 1), FIRST_2: (1, 2)}

# Captures "<department>_<course>" from the file name, e.g. "WIiT_Bioinformatyka".
# Raw string so that \d reaches the regex engine as written instead of relying
# on Python passing an unrecognised escape sequence through.
info_pattern = re.compile(r'Z_\d{3}_([A-Za-z]+_[A-Za-z_-]+[^_\d])', re.IGNORECASE)

frames = []
for path in paths:
    degree, recruitment_round = stage_by_path.get(path, (2, 1))
    # glob returns matches in arbitrary order; sort so main.csv is reproducible.
    for filename in sorted(glob.glob(os.path.join(path, '*.pdf'))):
        # Match against the file name only so directory names cannot interfere.
        match = info_pattern.search(os.path.basename(filename))
        if match is None:
            raise ValueError(
                "Cannot extract department/course from file name: {!r}".format(filename)
            )
        info = match.group(1)
        department, course = info.split('_', 1)

        data = read_pdf(filename, pages='all', multiple_tables=False, lattice=True, silent=True)
        df = organize_columns(pd.DataFrame(data[0]))

        df['Wydział'] = department
        df['Kierunek'] = course
        df['Stopień'] = degree
        df['nabór'] = recruitment_round

        df['Płeć'] = df['Imię'].map(extract_sex)
        # extract_paragraph returns (surname, marker) pairs; unzip them into two columns.
        df['Nazwisko'], df['Paragraf'] = zip(*df['Nazwisko'].map(extract_paragraph))

        frames.append(df)

if not frames:
    # pd.concat would fail with a generic message; point at the directories instead.
    raise FileNotFoundError("No PDF files found in: {}".format(paths))

# ignore_index gives main_df a unique 0..n-1 index instead of repeating each
# file's own row numbers; the CSV is unaffected because the index is not written.
main_df = pd.concat(frames, ignore_index=True)
main_df.to_csv(os.path.join(os.getcwd(), 'main.csv'), index=False, encoding="utf-8-sig")

### Przykład

In [54]:
# Read a single list to show the raw table as Tabula returns it, before the
# clean-up applied in the extraction step.
example_pdf = os.path.join(
    SECOND_1, "lista_przyjetych_2020-2021_Z_122_WIiT_Bioinformatyka_20200903.pdf"
)
data = read_pdf(example_pdf, pages='all', multiple_tables=False, lattice=True, silent=True)
df = pd.DataFrame(data[0])
df

Unnamed: 0,lp.,Nazwisko,Imię,Drugie imię,Liczba punktów,Decyzja
0,1,Cieluba**,Jan,,65,Przyjęty
1,2,Poniatowska,Paulina,Ewa,51,Przyjęty
2,3,Rynkiewicz,Adam,Aron,74,Przyjęty
3,4,Samsel,Barbara,,89,Przyjęty
