In [None]:
%%capture
!sudo apt-get update
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install pdftotext

In [None]:
import numpy as np
import os
import pandas as pd
import pdftotext
import re

from google.colab import drive
from os import listdir
from os.path import isfile, join
from tqdm.notebook import tqdm

In [None]:
# Mount Google Drive under /content/drive; every path used below lives there.
# force_remount=True requests a fresh mount even if Drive is already mounted.
# (`drive` is also imported in the imports cell; the repeat lets this cell run alone.)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Ground-truth tables read from Drive. get_pages_ablation uses the second table
# for feature 'WALS 49A' and the first one for every other feature. In both it
# looks rows up by the 'Filename' column and reads the '<feature>: PDF Pages' column.
df_ground_truth = pd.read_csv('/content/drive/MyDrive/ground_truth_rag.csv')
df_ground_truth_number_of_cases = pd.read_csv('/content/drive/MyDrive/ground_truth_rag_number_of_cases.csv')

In [None]:
def get_pages_by_indices(page_numbers, grammar):
    """Collect page texts for a comma-separated list of 1-based page references.

    Args:
        page_numbers: String such as '12, 45-51, 101'. Each comma-separated item
            is either a single page number or an inclusive 'start-end' range.
            Whitespace around items is ignored and empty items (e.g. from a
            trailing comma) are skipped.
        grammar: Sequence of pages indexable from 0 (page N is grammar[N - 1]).

    Returns:
        A pair (labels, texts) of equally long lists. labels[k] is the k-th
        non-empty item with surrounding whitespace removed. texts[k] is
        grammar[N - 1] for a single page N, or the str() of every page in the
        range joined with newlines.

    Raises:
        ValueError: If an item is not an integer or 'start-end' pair, or if a
            page number is smaller than 1.
        Whatever `grammar` raises for an index past its end (IndexError for a list).
    """
    labels = []
    texts = []
    for raw_item in page_numbers.split(','):
        item = raw_item.strip()
        if not item:
            # Tolerate '12,' or '12,,15' instead of failing on int('').
            continue

        if '-' in item:
            # Inclusive range, e.g. '45-51'.
            start, end = map(int, item.split('-'))
            first_page = start
        else:
            # Single page, e.g. '12'.
            first_page = int(item)

        if first_page < 1:
            # Page 0 would become index -1 and silently return the last page.
            raise ValueError(f'Page numbers are 1-based, got {item!r}')

        if '-' in item:
            texts.append('\n'.join(str(grammar[i - 1]) for i in range(start, end + 1)))
        else:
            texts.append(grammar[first_page - 1])
        # Store the stripped label so it lines up with its text and carries no stray spaces.
        labels.append(item)
    return labels, texts

In [None]:
def get_pages_ablation(feature, file_path, layout=False, grammars_dir=None):
    """Write the ground-truth pages of one grammar to a per-feature CSV on Drive.

    The page list comes from the '<feature>: PDF Pages' column of the ground-truth
    table (df_ground_truth_number_of_cases for 'WALS 49A', df_ground_truth
    otherwise), looked up by the grammar's file name without extension. The CSV
    has the columns 'Paragraph' (page text) and 'Page number' (page label).

    Nothing is written when the output CSV already exists, when the grammar has
    no entry in the ground-truth table (a warning is issued), or when its page
    list is empty.

    Args:
        feature: Feature key, e.g. 'WALS 81A'; used for the column name and the
            output folder.
        file_path: Grammar file name or path relative to the grammars folder.
        layout: Passed to pdftotext.PDF as `physical`.
        grammars_dir: Folder containing the grammar file. Defaults to the
            notebook-level `path_to_grammars`.
    """
    import warnings

    if grammars_dir is None:
        grammars_dir = path_to_grammars

    df = df_ground_truth_number_of_cases if feature == 'WALS 49A' else df_ground_truth
    ground_truth_pages = df.set_index('Filename')[f'{feature}: PDF Pages'].fillna('')

    # splitext copes with any extension length, unlike slicing off four characters.
    filename = os.path.splitext(os.path.basename(file_path))[0]
    out_dir = f'/content/drive/MyDrive/Grammars Paragraphs/{feature}/Ablation'
    out_file_path = f'{out_dir}/{filename}.csv'

    if isfile(out_file_path):
        # Already extracted on a previous run.
        return

    if filename not in ground_truth_pages.index:
        # Skip stray files instead of aborting the whole batch with a KeyError.
        warnings.warn(f'No ground-truth row for {filename!r}; skipping.')
        return

    page_numbers = ground_truth_pages[filename]
    if len(page_numbers) == 0:
        # No pages annotated for this feature: avoid parsing the PDF at all.
        return

    with open(join(grammars_dir, file_path), 'rb') as f:
        pdf = pdftotext.PDF(f, physical=layout)

    page_number_list, pages = get_pages_by_indices(page_numbers, pdf)
    result = pd.DataFrame({'Paragraph': pages, 'Page number': page_number_list})

    # to_csv does not create missing folders.
    os.makedirs(out_dir, exist_ok=True)
    result.to_csv(out_file_path, index=False)

In [None]:
# Folder with the benchmark grammars; get_pages_ablation opens files relative to it.
path_to_grammars = '/content/drive/MyDrive/Grammars Benchmark'
# Keep regular files only (sub-folders are ignored).
grammar_files = [
    entry
    for entry in listdir(path_to_grammars)
    if isfile(join(path_to_grammars, entry))
]

## WALS 81A: Order of Subject, Object and Verb

In [None]:
# Write the 'WALS 81A' ablation CSV for every file in grammar_files.
# get_pages_ablation leaves files alone whose output CSV already exists.
for file_path in tqdm(grammar_files):
    get_pages_ablation('WALS 81A', file_path)

  0%|          | 0/148 [00:00<?, ?it/s]

## GB107: Can standard negation be marked by an affix, clitic or modification of the verb?

In [None]:
# Write the 'GB 107' ablation CSV for every file in grammar_files.
# NOTE(review): the section heading spells the feature 'GB107' (no space), while
# the key used here is 'GB 107'. Confirm it matches the 'GB 107: PDF Pages'
# column that get_pages_ablation reads from the ground-truth CSV.
for file_path in tqdm(grammar_files):
    get_pages_ablation('GB 107', file_path)

  0%|          | 0/148 [00:00<?, ?it/s]

## WALS 116A: Polar Questions

In [None]:
# Write the 'WALS 116A' ablation CSV for every file in grammar_files.
# get_pages_ablation leaves files alone whose output CSV already exists.
for file_path in tqdm(grammar_files):
    get_pages_ablation('WALS 116A', file_path)

  0%|          | 0/148 [00:00<?, ?it/s]

## WALS 49A: Number of Cases

In [None]:
# Switch to the grammars used for the Number of Cases feature.
# This rebinds both notebook-level names: get_pages_ablation reads path_to_grammars
# when it opens a file, so cells above would use this folder if re-run afterwards.
path_to_grammars = '/content/drive/MyDrive/Grammars Benchmark: Number of Cases'
# Keep regular files only (sub-folders are ignored).
grammar_files = [
    entry
    for entry in listdir(path_to_grammars)
    if isfile(join(path_to_grammars, entry))
]

In [None]:
# Write the 'WALS 49A' ablation CSV for every file in grammar_files, which the
# previous cell rebuilt from the 'Grammars Benchmark: Number of Cases' folder.
# For this feature get_pages_ablation reads df_ground_truth_number_of_cases.
for file_path in tqdm(grammar_files):
    get_pages_ablation('WALS 49A', file_path)

  0%|          | 0/148 [00:00<?, ?it/s]