In [1]:
import pdfplumber
from pathlib import Path
import glob
import re
import pprint
from collections import defaultdict
import logging
from datetime import datetime

In [2]:
from DataObjects.article import Article
from DataObjects.legislation import Legislation
from DataObjects.public_consultation import PublicConsultation
from DataObjects.base import Base
from sqlalchemy import create_engine, select, union_all, func, delete, and_
from sqlalchemy.orm import Session, aliased
from configparser import ConfigParser
config = ConfigParser()
config.read("config.ini")
engine = create_engine(config.get('DEFAULT', 'db_file'))

In [33]:
#Pending Files
txt_files_full = glob.glob("pdf_files/text/*_full.txt")

with Session(engine) as sess:
    stmt = select(Article.legislation_id).where(Article.legislation_id>890).distinct()
    legIds = sess.execute(stmt).scalars().all()
# print(legIds)
for f in txt_files_full:
    if not any((str(legID) in f for legID in legIds)):
        print(str(f))

gibberish = ['1231','1442','995']

pdf_files/text\1089_full.txt
pdf_files/text\1231_full.txt
pdf_files/text\1442_full.txt
pdf_files/text\898_full.txt
pdf_files/text\995_full.txt


In [59]:
bills_with_full_text_article_numbers = set(
    ('1279',
    '1247',
    
    '1296',
    '1307',
    '1376',
    '1418',
    '1423')
)

bills_with_inproper_spacing = set(
    ('1255',
    '1442')
)
extempted_from_analysis = [('898','Page 25,30 of pdf.pages have invisible characters in the middle of the screen and misclasify two-column pages as one column.'),
                           ('1089','Page 7 keeps extracting "/" character in the middle  even though bounding box nears an area of zero ((295.5,366,301.8,759.5))'),
                           ('1231','Gibberish extracted')]


In [3]:
logging.basicConfig(filename="pdf_extract.log",
                    filemode='a',
                    format='%(asctime)s - %(levelname)s - %(message)s',level=logging.INFO,
                    encoding='utf-8')

In [19]:
pdf_files_txt = glob.glob("pdf_files/*.pdf")
pdf_files = [Path(pdf_file) for pdf_file in pdf_files_txt]
print(all(pdf_file.exists() for pdf_file in pdf_files))  # check if PathObjs are correctly set up

True


Extract text from PDF

In [20]:
to_do_again = ['1005',
'1023',
'1081',
'1176',
'1235',
'1237',
'1239',
'1240',
'1243',
'1246',
'1247',
'1248',
'1249',
'1250',
'1251',
'1252',
'1253',
'1254',
'1255',
'1259',
'1260',
'1262',
'1263',
'1265',
'1267',
'1272',
'1273',
'1282',
'1284',
'1285',
'1322',
'1323',
'1325',
'1330',
'1339',
'1344',
'1358',
'1415',
'1432',
'1433',
'1434',
'1438',
'1439',
'1440',
'1441',
'1442',
'1443',
'1444',
'1445',
'1448',
'1457',
'894',
'897',
'901',
'902',
'907',
'942',
'957',
'959',
'961',
'963',
'977',
'982',
'984',
'990',
'992',
'994'
]

one_column_list =[ "898",
"1089","1296","1279","1376","1418","1423","1247","1391","1407",
]

In [21]:
# Page Width: 595
# Page Height: 842
txt_files_dir = "pdf_files/text"
for pdf in pdf_files:  #Second run from 53  third run from 70 forth run frm 217
    # print(f"Current File {pdf.stem}")
    with pdfplumber.open(pdf) as f:
        # if pdf.stem not in to_do_again:
        #     continue

        if pdf.stem not in one_column_list:
            continue
        print(f"Current File {pdf.stem}")

        #scrap article numbers and titles
        txt_articles = ""
        txt_full = ""
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        for idx, page in enumerate(f.pages):
            # print(f"Page {idx}")
            p = page.filter(lambda obj: obj["object_type"] == "char" and "Bold" in obj["fontname"])
            if p.width - 595 >1 or p.height - 842 > 1:
                logging.info(msg=f"Page {idx+1}(of {len(f.pages)}) has differnt width or height from usual page. File={pdf.name}, pWidth={p.width}, pHeight={p.height}")
                continue
            box_check_for_one_column = (300,65,301.8,759.5)
            box_check_for_one_column_first_page = (300,400,301.8,759.5)
            one_column_bbox =(0,64,p.width,799)
            one_column_bbox_first_page =(0,400,p.width,799)
            # bbox=(x0,top,x1,bottom) [(0,0) is bottom left]
            # coordinates found with GIMP
            one_column = False
            if idx==0:  #First Page
                
                if page.crop(box_check_for_one_column_first_page).extract_text(): # if text exists in the middle of the page
                    one_column = True
            else:
                if page.crop(box_check_for_one_column).extract_text(): # if text exists in the middle of the page
                    one_column = True
            if one_column == False:
                if idx==0:  # if First Page
                    left_articles = p.crop((0, 366, 0.5 * float(p.width), 779))
                    right_articles = p.crop((0.5 * float(p.width), 366, p.width, 779))

                    left_full = page.crop((0, 366, 0.5 * float(p.width), 779))
                    right_full = page.crop((0.5 * float(p.width), 366, p.width, 779))
                else:
                    left_articles = p.crop((0, 64, 0.5 * float(p.width), 779))
                    right_articles = p.crop((0.5 * float(p.width), 64, p.width, 779))

                    left_full = page.crop((0, 64, 0.5 * float(p.width), 779))
                    right_full = page.crop((0.5 * float(p.width), 64, p.width, 779))
                
                txt_articles +="\n"+ left_articles.extract_text()
                txt_articles +="\n"+ right_articles.extract_text()

                txt_full += "\n" + left_full.extract_text()
                txt_full += "\n" + right_full.extract_text()
            else:
                if idx ==0:
                    page_articles = p.crop(one_column_bbox_first_page)
                    page_full = page.crop(one_column_bbox_first_page)
                else:
                    page_articles = p.crop(one_column_bbox)
                    page_full = page.crop(one_column_bbox)
                
                txt_articles +="\n"+ page_articles.extract_text()
                txt_full += "\n" + page_full.extract_text()
            
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print(f"Done With Text Extraction at {current_time}")


        txt_articles += "\nΆρθρ"  # Add this so we have a match on the last article
        # txt_full += "Ά"  # Add this so we have a match on the last article
        # matches_articles = re.findall(article_nums_digits_pattern,txt_articles,re.DOTALL)
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        # print(f"Done With ReGEx at {current_time}")

        #Write txt to files
        txt_articles_file = Path("pdf_files") / "text" / f"{pdf.stem}_articles.txt"
        # if not txt_articles_file.exists():
        txt_articles_file.touch()
        txt_articles_file.write_text(txt_articles,encoding="utf-8")
        
        txt_full_file = Path("pdf_files") / "text" / f"{pdf.stem}_full.txt"
        # if not txt_full_file.exists():
        txt_full_file.touch()
        txt_full_file.write_text(txt_full,encoding="utf-8")

    

Current File 1089
Done With Text Extraction at 20:31:53
Current File 1247
Done With Text Extraction at 20:32:07
Current File 1279
Done With Text Extraction at 20:32:35
Current File 1296
Done With Text Extraction at 20:32:46
Current File 1376
Done With Text Extraction at 20:32:50
Current File 1391
Done With Text Extraction at 20:34:06
Current File 1407
Done With Text Extraction at 20:34:45
Current File 1418
Done With Text Extraction at 20:34:46
Current File 1423
Done With Text Extraction at 20:35:02
Current File 898
Done With Text Extraction at 20:35:30


Read text, extract articles and Save to DB

In [7]:
txt_files_articles = glob.glob("pdf_files/text/*_articles.txt")
txt_files_full = glob.glob("pdf_files/text/*_full.txt")

end_of_bill1 = "Ο ΠΡΟΕΔΡΟ" # Ο ΠΡΟΕΔΡΟΣ ΤΗΣ ΒΟΥΛΗΣ
end_of_bill2= "ΠΑΡΑΡ" # ΠΑΡΑΡΤΗΜΑ

for txt_articles_file, txt_full_file in zip(txt_files_articles,txt_files_full):  # (100)1255: Άρθρο1
    # Read Text Files
    # tested on 
    print(f"Text File {txt_full_file}")
    txt_articles = Path(txt_articles_file).read_text(encoding='utf-8')
    txt_articles += "ρθρ"  #Necessary in order to match last article due to regex changes on the positive lookahead (?=Ά) = (?=Άρθρ)
    txt_full = Path(txt_full_file).read_text(encoding='utf-8')

    # article_nums_digits_pattern= r"(?<!«)Άρθρο\s?(\d+)(.*?)(:?(?=Ά)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"
    # article_nums_digits_pattern = r"^Άρθρο\s?(\d+)(.*?)(:?(?=Ά)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"
    # article_nums_digits_pattern = r"^Άρθρο\s?(\d+)(.*?)(:?(?=Άρθ)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"

    # change positve lookahead because (?=Ά) falsely disregarded article titles that started with Ά
    article_nums_digits_pattern = r"^Άρθρ[οo]\s?(\d+)(.*?)(:?(?=Άρθρ)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"  #id=892 Άρθρο 95 has a Latin "o" 

    matches_articles = re.findall(article_nums_digits_pattern,txt_articles,re.DOTALL| re.MULTILINE)

    # Create Article Dict
    articles_dict = {}
    for match in matches_articles:
        articles_dict[match[0]] = match[1]

    data = defaultdict(dict)
    try:
        max_article_num = max((int(x) for x in articles_dict.keys()))
        min_article_num = min((int(x) for x in articles_dict.keys()))
    except ValueError as e:
        logging.log(logging.INFO, f"File={txt_articles_file} - Empty Sequence. Msg={str(e)} ")
        continue

    if sorted((int(x) for x in articles_dict.keys())) != list(range(min_article_num,max_article_num+1)):
        # raise ValueError(f"Missing Article Numbers {set(range(min_article_num,max_article_num+1)).difference(set(sorted((int(x) for x in articles_dict.keys()))))}")
        logging.log(logging.INFO, f"File={txt_articles_file} Missing Article Numbers {set(range(min_article_num,max_article_num+1)).difference(set(sorted((int(x) for x in articles_dict.keys()))))}")
        continue

    break_to_outer_loop = False
    for k in articles_dict.keys():

        current_article_num = int(k)
        next_article_num = current_article_num+1

        
        # article_text_start_idx = txt_full.rfind(article_text_start)  # We need the last occurance of Άρθρο \d+\n
        # article_text_start = articles_dict[k].strip()

        # Try to match by Article No. And Title
        article_text_start = f"Άρθρο {current_article_num}{articles_dict[str(current_article_num)]}".strip()
        article_text_start_idx = txt_full.find(article_text_start)
        flag_use_full_title_in_search=True

        # If no result, match by Article No. Only
        if article_text_start_idx==-1:
            # if matching  by title fails, match by article
            # 1310 => Article 2 has hidden text
            article_text_start = f"Άρθρο {current_article_num}\n"
            article_text_start_idx = txt_full.find(article_text_start)
            flag_use_full_title_in_search = False
        
        if article_text_start_idx==-1:
            # id=892 => Match by Latin "o"
            article_text_start = f"Άρθρo {current_article_num}\n"
            article_text_start_idx = txt_full.find(article_text_start)
            flag_use_full_title_in_search = False

        # If still no result, try with no whitespace between Article and No (caused by improper text extraction)
        if article_text_start_idx==-1:
            # 1362 => Άρθρο 10 is outputed as 'Άρθρο10'
            article_text_start = f"Άρθρο{current_article_num}\n"
            article_text_start_idx = txt_full.find(article_text_start)

        if article_text_start_idx==-1:
            logging.log(logging.INFO,f"File={txt_articles_file}Cannot find text {article_text_start} in full_text")
            break_to_outer_loop=True

        if current_article_num!=max_article_num:
            # article_text_end = f"Άρθρο {next_article_num}\n"
            article_text_end = f"Άρθρο {next_article_num}{articles_dict[str(next_article_num)]}".strip()
            # article_text_end_idx = txt_full.rfind(article_text_end)
            article_text_end_idx = txt_full.find(article_text_end)

            if article_text_end_idx==-1:
                # if None, try to match only the article number
                article_text_end = f"Άρθρο {next_article_num}".strip()
                article_text_end_idx = txt_full.find(article_text_end)
            
            if article_text_end_idx==-1:
                # id=892 => Match by Latin "o"
                article_text_end = f"Άρθρo {next_article_num}".strip()
                article_text_end_idx = txt_full.find(article_text_end)

            if article_text_end_idx==-1:
                # if None, try to match only the article number without whitespace
                article_text_end = f"Άρθρο{next_article_num}".strip()
                article_text_end_idx = txt_full.find(article_text_end)

            if article_text_end_idx==-1:
                logging.log(logging.INFO,f"File={txt_articles_file} Cannot find text {article_text_end} in full_text")
                # raise ValueError(f"Cannot find text {article_text_end} in full_text")
                break_to_outer_loop=True


        else:
            if txt_full.rfind(end_of_bill2)>0 :
                article_text_end_idx = txt_full.rfind(end_of_bill2) # ΠΑΡΑΡΤΗΜΑ
            elif txt_full.rfind(end_of_bill1)>0:
                article_text_end_idx = txt_full.rfind(end_of_bill1) # Ο ΠΡΟΕΔΡΟΣ
            else:
                article_text_end_idx = len(txt_full)
                
        article_text = txt_full[article_text_start_idx+len(article_text_start):article_text_end_idx]
        if flag_use_full_title_in_search==False:
            article_text = article_text.replace(articles_dict[k].strip(),"",1)  #remove title from text

        data[k].update({'title': articles_dict[k]})
        data[k].update({'text': article_text})
    
    if break_to_outer_loop:
        logging.log(logging.INFO, f"File={txt_articles_file} did not continue to DB Save due to previous error")
        continue

    with Session(engine) as sess:
        try:
            legislation_id = re.search(r"\d+",txt_articles_file)[0]
            stmt = select(Legislation).where(Legislation.id==legislation_id)
            legObj = sess.execute(stmt).scalar_one()

            for article,textAndTitle in data.items():
                articleObj = Article(number=article,title=textAndTitle['title'], text=textAndTitle['text'])
                if articleObj.number not in (legiglationArticlesObj.number for legiglationArticlesObj in legObj.articles):
                    legObj.articles.append(articleObj)
            sess.commit()
        except:
            logging.log(logging.INFO, f"File={txt_articles_file} Failed to save to DB.")




Text File pdf_files/text\1000_full.txt
Text File pdf_files/text\1002_full.txt
Text File pdf_files/text\1003_full.txt
Text File pdf_files/text\1005_full.txt
Text File pdf_files/text\1013_full.txt
Text File pdf_files/text\1014_full.txt
Text File pdf_files/text\1021_full.txt
Text File pdf_files/text\1023_full.txt
Text File pdf_files/text\1045_full.txt
Text File pdf_files/text\1049_full.txt
Text File pdf_files/text\1050_full.txt
Text File pdf_files/text\1053_full.txt
Text File pdf_files/text\1054_full.txt
Text File pdf_files/text\1055_full.txt
Text File pdf_files/text\1057_full.txt
Text File pdf_files/text\1061_full.txt
Text File pdf_files/text\1062_full.txt
Text File pdf_files/text\1064_full.txt
Text File pdf_files/text\1065_full.txt
Text File pdf_files/text\1073_full.txt
Text File pdf_files/text\1076_full.txt
Text File pdf_files/text\1080_full.txt
Text File pdf_files/text\1081_full.txt
Text File pdf_files/text\1083_full.txt
Text File pdf_files/text\1084_full.txt
Text File pdf_files/text\

In [79]:
txt_files_to_do_again = ["897","1296","1279",
"898",
"939",
"948",
"949",
"974",
"999",
"1089",
"1091",
"1170",
"1191",
"1223",
"1271",
"1350",
"1352",
"1366",
"1379",
"1383",
"1388",
"1389",
"1402",
"1419",
"1426",
"1427",
"1442",
"897",
"898",
"939",
"948",
"949","1247","1307","1376","1418","1423"
"974",
"999"]

Extract Articles and DB Save

In [31]:
bills_with_full_text_article_numbers = set(
    ('1279',
     '1176',
     '1391',
    '1247',
    '1423',
    '1235',
    '1247',
    '1296',
    '1307',
    '1407',
    '1376',
    '1418',
    '1423')
)


In [23]:
from textUtils.textDecorators import text_to_numeric

Extract Articles and DB Save Final with fully written articles

In [32]:
txt_files_articles = glob.glob("pdf_files/text/*_articles.txt")
txt_files_full = glob.glob("pdf_files/text/*_full.txt")

end_of_bill1 = "Ο ΠΡΟΕΔΡΟ" # Ο ΠΡΟΕΔΡΟΣ ΤΗΣ ΒΟΥΛΗΣ
end_of_bill2= "ΠΑΡΑΡ" # ΠΑΡΑΡΤΗΜΑ

for txt_articles_file, txt_full_file in zip(txt_files_articles,txt_files_full):  # (100)1255: Άρθρο1
    # if not any((to_do_txt_file in txt_articles_file for to_do_txt_file in txt_files_to_do_again)):
    #     continue
    if not any((to_do_txt_file in txt_articles_file for to_do_txt_file in bills_with_full_text_article_numbers)):
        continue
    
    full_text_article_flag = False
    print(f"Text File {txt_full_file}")    
    txt_articles = Path(txt_articles_file).read_text(encoding='utf-8')
    txt_articles += "ρθρ"  #Necessary in order to match last article due to regex changes on the positive lookahead (?=Ά) = (?=Άρθρ)
    txt_full = Path(txt_full_file).read_text(encoding='utf-8')

    if "1307" in txt_articles_file:
        print()
    # change positve lookahead because (?=Ά) falsely disregarded article titles that started with Ά
    # article_nums_digits_pattern = r"^Άρθρ[οo]\s?(\d+)(.*?)(:?(?=Άρθρ)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"  #id=892 Άρθρο 95 has a Latin "o" 
    if any((to_do_txt_file in txt_articles_file for to_do_txt_file in bills_with_full_text_article_numbers)):
        articles_full_text_pattern = r"^[ΆΑA]ρθρ[οo]\s?([\w\s]*?)\n(.*?)(:?(?=[ΆΑA]ρθρ)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"
        full_text_article_flag = True
 
    article_nums_digits_pattern = r"^[ΆΑA]ρθρ[οo]\s?(\d+)(.*?)(:?(?=[ΆΑA]ρθρ)|(?=ΚΕΦΑ)|(?=ΜΕΡΟΣ))"  #id=892 Άρθρο 95 has a Latin "o" 
        
    matches_articles = re.findall(article_nums_digits_pattern,txt_articles,re.DOTALL| re.MULTILINE)
    articles_dict = {}

    #Text To Numeric Mapping for Full Text Articles
    if full_text_article_flag:
        txt_to_numeric = {}
        matches_full_text = re.findall(articles_full_text_pattern, txt_articles,re.DOTALL|re.MULTILINE)
        for m in matches_full_text:
            article_num_integer = text_to_numeric(m[0])
            original_text = m[0]
            article_title = m[1]
            if article_num_integer== -1:
                raise ValueError(f"Text To Numeric Failed for PDF={txt_articles_file}, txt={m[0]}")
            txt_to_numeric[original_text] = article_num_integer
            articles_dict[original_text] = m[1] 
    else:
        # Create Article Dict
        for match in matches_articles:
            articles_dict[match[0]] = match[1]

    if "1383" in txt_articles_file:
        articles_dict[70] = "Article 70 Does not Exist in 1383!!!!!!"
    

    data = defaultdict(dict)
    if full_text_article_flag:
        try:
            max_article_num = max((int(x) for x in txt_to_numeric.values()))
            min_article_num = min((int(x) for x in txt_to_numeric.values()))
        except ValueError as e:
            logging.log(logging.INFO, f"File={txt_articles_file} - Empty Sequence. Msg={str(e)} ")
            continue
    else:
        try:
            max_article_num = max((int(x) for x in articles_dict.keys()))
            min_article_num = min((int(x) for x in articles_dict.keys()))
        except ValueError as e:
            logging.log(logging.INFO, f"File={txt_articles_file} - Empty Sequence. Msg={str(e)} ")
            continue
    
    if full_text_article_flag:
        sorted_articles = sorted((int(x) for x in txt_to_numeric.values()))
    else:
        sorted_articles = sorted((int(x) for x in articles_dict.keys()))

    if sorted_articles != list(range(min_article_num,max_article_num+1)):
        # raise ValueError(f"Missing Article Numbers {set(range(min_article_num,max_article_num+1)).difference(set(sorted((int(x) for x in articles_dict.keys()))))}")
        logging.log(logging.INFO, f"File={txt_articles_file} Missing Article Numbers {set(range(min_article_num,max_article_num+1)).difference(set(sorted_articles))}")
        continue

    break_to_outer_loop = False
    if full_text_article_flag:
        articles_array = list(sorted(articles_dict.keys(),key=lambda x: text_to_numeric(x)))
        max_article_num = articles_array[len(articles_array)-1]
    else:
        articles_array = list(sorted(articles_dict.keys(),key=lambda x: int(x)))
        articles_array = list((int(x) for x in articles_array))


    for idx,k in enumerate(articles_array):

        current_article_num = k
        if current_article_num==70 and "1383" in txt_articles_file:
            continue # 1383=> Article 70 does not Exist!
        if idx < len(articles_array)-1:
            next_article_num = articles_array[idx+1]
        
        if next_article_num==70 and "1383" in txt_articles_file:  #if next article is 70, go to 71
            next_article_num =articles_array[idx+1+1]  # 1383=> Article 70 does not Exist!

        
        # article_text_start_idx = txt_full.rfind(article_text_start)  # We need the last occurance of Άρθρο \d+\n
        # article_text_start = articles_dict[k].strip()

        # Try to match by Article No. And Title
        article_text_start = f"Άρθρο {current_article_num}{articles_dict[str(current_article_num)]}".strip()
        article_text_start_idx = txt_full.find(article_text_start)
        flag_use_full_title_in_search=True

        # If no result, match by Article No. Only
        if article_text_start_idx==-1:
            # if matching  by title fails, match by article
            # 1310 => Article 2 has hidden text
            # id=892 => Match by Latin "o"
        # If still no result, try with no whitespace between Article and No (caused by improper text extraction)
            # 1362 => Άρθρο 10 is outputed as 'Άρθρο10'
            flag_use_full_title_in_search = False
            m = re.search(r"[ΆΑA]ρθρ[οo]\s?"+str(current_article_num)+"\n",txt_full)
            if m is not None:
                article_text_start_idx = m.start()    

        if article_text_start_idx==-1:
            logging.log(logging.INFO,f"File={txt_articles_file}Cannot find text {article_text_start} in full_text")
            break_to_outer_loop=True

        if current_article_num!=max_article_num:
            # article_text_end = f"Άρθρο {next_article_num}\n"
            article_text_end = f"Άρθρο {next_article_num}{articles_dict[str(next_article_num)]}".strip()
            # article_text_end_idx = txt_full.rfind(article_text_end)
            article_text_end_idx = txt_full.find(article_text_end)

            if article_text_end_idx==-1:
                # if None, try to match only the article number
                m = re.search(r"[ΆΑA]ρθρ[οo]\s?"+str(next_article_num)+"\n",txt_full)
                if m is not None:
                    article_text_end_idx = m.start()    
                    article_text_end = m.group()

            if article_text_end_idx==-1:
                logging.log(logging.INFO,f"File={txt_articles_file} Cannot find text {article_text_end} in full_text")
                # raise ValueError(f"Cannot find text {article_text_end} in full_text")
                break_to_outer_loop=True

        else:
            if txt_full.rfind(end_of_bill2)>0 :
                article_text_end_idx = txt_full.rfind(end_of_bill2) # ΠΑΡΑΡΤΗΜΑ
            elif txt_full.rfind(end_of_bill1)>0:
                article_text_end_idx = txt_full.rfind(end_of_bill1) # Ο ΠΡΟΕΔΡΟΣ
            else:
                article_text_end_idx = len(txt_full)
                
        article_text = txt_full[article_text_start_idx+len(article_text_start):article_text_end_idx]

        if flag_use_full_title_in_search==False:  # if seach did not use full article title, try to remove on this step
            article_text = article_text.replace(articles_dict[str(current_article_num)].strip(),"",1)  #remove title from text

        data[k].update({'title': articles_dict[str(current_article_num)]})
        data[k].update({'text': article_text})
    
    if break_to_outer_loop:
        logging.log(logging.INFO, f"File={txt_articles_file} did not continue to DB Save due to previous error")
        continue

    with Session(engine) as sess:
        try:
            legislation_id = re.search(r"\d+",txt_articles_file)[0]
            stmt = select(Legislation).where(Legislation.id==legislation_id)
            legObj = sess.execute(stmt).scalar_one()

            for article,textAndTitle in data.items():
                article_num: int
                if full_text_article_flag:
                    article_num = text_to_numeric(article)
                else:
                    article_num = article
                articleObj = Article(number=article_num,title=textAndTitle['title'], text=textAndTitle['text'])
                if full_text_article_flag:
                    # print(articleObj)
                if articleObj.number not in (legiglationArticlesObj.number for legiglationArticlesObj in legObj.articles):
                    legObj.articles.append(articleObj)
            sess.commit()
        except:
            logging.log(logging.INFO, f"File={txt_articles_file} Failed to save to DB.")




Text File pdf_files/text\1176_full.txt
Article(id=None, number=1, title=Κύρωση Σύµβασης και του
συνοδευτικού αυτής Πρωτοκόλλου


























































































































































, text=υ
Κυρώνονται και έχουν την ισχύ, που ορίζει η παρ., legislation_id=None)
Article(id=None, number=2, title=Κατάργηση ν.δ. 4386/1964
, text=4
Από τη θέση σε ισχύ της Σύµβασης που κυρώνεται,
, legislation_id=None)
Article(id=None, number=3, title=Έναρξη ισχύος

, text=ς
Η ισχύς του παρόντος νόµου αρχίζει από τη δηµοσί, legislation_id=None)
Text File pdf_files/text\1235_full.txt
Article(id=None, number=1, title=

, text=
Κυρώνεται και αποκτά ισχύ νόμου από τη δημοσίευσή, legislation_id=None)
Article(id=None, number=2, title=, text=
Στην εξαίρεση από την αναστολή της παραγράφου 2
τ, legislation_id=None)
Article(id=None, number=3, title=, text=
Οι διατάξεις της παραγράφου 4 του άρθρου 28 του
ν, legislation_id

In [19]:
with pdfplumber.open("pdf_files/898.pdf") as f:
    p_oneColumn = f.pages[12]
    p_twoColums = f.pages[5]
    # bbox=(x0,top,x1,bottom) [(0,0) is bottom left]
    # coordinates found with GIMP
    box_check_for_one_column = (291.5,60,303.8,759.5)
    box_check_for_one_column_first_page = (291.5,366,303.8,759.5)
    one_column_bbox =(0,64,p_oneColumn.width,799)
    p_oneColumn_crop = p_oneColumn.crop(box)
    p_twoColums_crop = p_twoColums.crop(box)
    print("One Column Text:")
    print(p_oneColumn_crop.extract_text())
    p_oneColumn.crop(one_column_bbox).to_image().show()
    print("Two Column Text:")
    print(p_twoColums_crop.extract_text())
    if not p_twoColums_crop.extract_text():
        print("IS EMPTY")
    # p_twoColums.to_image().show()
    

One Column Text:
ρο
εω
δηµ
αι
ν ε
ωπ
Two Column Text:

IS EMPTY


In [28]:
f = "pdf_files/898.pdf"


with pdfplumber.open(f) as pdf:
    p = pdf.pages[25]
    box_check_for_one_column = (295.5,65,301.8,759.5)
    box_check_for_one_column_first_page = (295.5,366,301.8,759.5)
    one_column_bbox =(0,64,p.width,799)
    one_column_bbox_first_page =(0,365,p.width,799)
    p_crop = p.crop(box_check_for_one_column)
    p_crop.to_image().show()
    # p_crop.to_image().show()
    print(p_crop.extract_text())
    




In [58]:
f = "pdf_files/1089.pdf"


with pdfplumber.open(f) as pdf:
    p = pdf.pages[7]
    box_check_for_one_column = (300,65,300.8,759.5)
    box_check_for_one_column_first_page = (295.5,366,301.8,759.5)
    one_column_bbox =(0,64,p.width,799)
    one_column_bbox_first_page =(0,365,p.width,799)
    p_crop = p.crop(box_check_for_one_column)
    # p_crop.to_image().show()
    # p_crop.to_image().show()
    print(p_crop.extract_text())
    

/


In [97]:
f = "pdf_files/1279.pdf"


with pdfplumber.open(f) as pdf:
    p = pdf.pages[42]
    box_check_for_one_column = (300,65,301.8,759.5)
    box_check_for_one_column_first_page = (295.5,400,301.8,759.5)
    one_column_bbox =(0,64,p.width,799)
    one_column_bbox_first_page =(0,400,p.width,799)
    p_crop = p.crop(box_check_for_one_column)
    # p_crop.to_image().show()
    # p_crop.to_image().show()
    print(p_crop.extract_text())
    




In [6]:
f = "pdf_files/1231.pdf"


with pdfplumber.open(f) as pdf:
    p = pdf.pages[2]
    box_check_for_one_column = (300,65,301.8,759.5)
    box_check_for_one_column_first_page = (295.5,400,301.8,759.5)
    one_column_bbox =(0,64,p.width,799)
    one_column_bbox_first_page =(0,400,p.width,799)
    p_crop = p.crop(box_check_for_one_column)
    # p_crop.to_image().show()
    # p_crop.to_image().show()
    print(p.extract_text())
    

3
ÂÁÎ·ÙÂÛÙËÌ¤ÓÔÈ ÛÙË ¯ÒÚ·, ÛÙË ¡.∞. ∂˘ÚÒË Î·È, ÁÂÓÈÎfi- (1.000) ÔÓÔÌ·ÛÙÈÎ¤˜ ÌÂÙÔ¯¤˜, ·Í›·˜ ÂÍ‹ÓÙ· (60) Â˘ÚÒ
ÙÂÚ·, ÛÙÔ ÂÍˆÙÂÚÈÎfi. ÂÎ¿ÛÙË˜. √È ÌÂÙÔ¯¤˜ ·˘Ù¤˜ ·Ó·Ï·Ì‚¿ÓÔÓÙ·È, ÛÙÔ Û‡ÓÔÏfi
ÛÙ) ∏ ÂÎfiÓËÛË ÌÂÏÂÙÒÓ, Ë ÂÎÙ¤ÏÂÛË, Ë ‰ÈÔ›ÎËÛË Î·È Ë ÙÔ˘˜, ·fi ÙÔ ¢ËÌfiÛÈÔ. °È· ÙËÓ ¿ÛÎËÛË ÙˆÓ ‰ÈÎ·ÈˆÌ¿ÙˆÓ
ÏÂÈÙÔ˘ÚÁ›· ¤ÚÁˆÓ ˘Ô‰ÔÌ‹˜, fiˆ˜ ¤ÚÁˆÓ Ô‰ÔÔÈ›·˜, ÙÔ˘ ˆ˜ ÌÂÙfi¯Ô˘, ÙÔ ¢ËÌfiÛÈÔ ÂÎÚÔÛˆÂ›Ù·È ·fi ÙÔ˘˜
‡‰ÚÂ˘ÛË˜, ·Ô¯¤ÙÂ˘ÛË˜, ÔÌ‚Ú›ˆÓ ˘‰¿ÙˆÓ, ËÏÂÎÙÚÔÊˆ- ÀÔ˘ÚÁÔ‡˜ √ÈÎÔÓÔÌ›·˜ Î·È √ÈÎÔÓÔÌÈÎÒÓ, ∞Ó¿Ù˘ÍË˜ Î·È
ÙÈÛÌÔ‡, ÎÙÈÚÈ·ÎÒÓ ˘Ô‰ÔÌÒÓ, ‰ÈÎÙ‡ˆÛË˜ ÌÂ ÙÂ¯ÓÔÏÔÁÈÎ‹ ª·ÎÂ‰ÔÓ›·˜ - £Ú¿ÎË˜. ªÂ ÎÔÈÓ‹ ·fiÊ·ÛË ÙˆÓ ›‰ÈˆÓ
„ËÊÈ·Î‹ ‰È¿ÛÙ·ÛË Î·È ¤ÚÁˆÓ ·Ó·‚¿ıÌÈÛË˜ ÙÔ˘ ÂÚÈ‚¿Ï- ÀÔ˘ÚÁÒÓ, ÌÂÙ¿ ·fi ÚfiÙ·ÛË ÙÔ˘ ¢ÈÔÈÎËÙÈÎÔ‡ ™˘Ì‚Ô˘Ï›-
ÏÔÓÙÔ˜, Î·ıÒ˜ Î·È Ë ·ÚÔ¯‹ ˘ËÚÂÛÈÒÓ ÛÙÔ˘˜ ÊÔÚÂ›˜ Ô˘ ÌÔÚÂ› Ó· ·˘Í¿ÓÂÙ·È ÙÔ ÌÂÙÔ¯ÈÎfi ÙË˜ ÎÂÊ¿Ï·ÈÔ, ÌÂ
Î·È ÛÙÈ˜ ÂÈ¯ÂÈÚ‹ÛÂÈ˜ Ô˘ ÂÁÎ·ı›ÛÙ·ÓÙ·È ÂÓÙfi˜ ÙË˜ ¤Î‰ÔÛË Ó¤ˆÓ ÌÂÙÔ¯ÒÓ. √È ·˘Í‹ÛÂÈ˜ ÙÔ˘ ÌÂÙÔ¯ÈÎÔ‡ ÎÂÊ·-
∑.∫∞π.£. Î·È ÙˆÓ £.À.∫.¢. Î·È ÛÂ fiÛÔ˘˜ ÂÚÁ¿˙ÔÓÙ·È Î·È Ï·›Ô˘ ÌÔÚÂ› Ó· Ú·ÁÌ·ÙÔÔÈÔ‡Ó