In [None]:
%%capture
!sudo apt-get update
!sudo apt install build-essential libpoppler-cpp-dev pkg-config python3-dev
!pip install pdftotext rank_bm25

In [None]:
import numpy as np
import os
import pandas as pd
import pdftotext
import re
import requests
import random
import spacy

from bs4 import BeautifulSoup
from google.colab import drive
from os import listdir
from os.path import isfile, join
from rank_bm25 import BM25Okapi
from spacy.lang.en import stop_words
from tqdm.notebook import tqdm

In [None]:
# Mount Google Drive under /content/drive; every data path used below lives there.
# `drive` is already imported in the imports cell; the import is repeated here.
# NOTE(review): force_remount=True presumably lets this cell be re-run while the
# drive is already mounted — confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Create the output folder tree: one folder per feature ID (the same IDs appear
# in the section headings further down), each with three sub-folders. Of these,
# only 'Wikipedia 50' is written to by the cells in this notebook.
# 'MyDrive' (no space) is the spelling used by every other path in the notebook,
# so the folders created here are the ones the later cells write into.
base_dir = '/content/drive/MyDrive/Grammars Paragraphs'
subdirs = ['GB 107', 'WALS 49A', 'WALS 81A', 'WALS 116A']
subsubdirs = ['Ablation', 'Reranker 20', 'Wikipedia 50']

for subdir in subdirs:
    for subsubdir in subsubdirs:
        # exist_ok=True keeps the cell safe to re-run on an existing tree.
        os.makedirs(os.path.join(base_dir, subdir, subsubdir), exist_ok=True)

print('Directories created successfully!')

Directories created successfully!


In [None]:
# spaCy's English stop-word collection; lemmas found in it are discarded by the
# lemmatization code further down.
stopwords = stop_words.STOP_WORDS
# Small English pipeline, loaded with the 'parser' and 'ner' components disabled.
model = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Raise the pipeline's max_length setting to 5,000,000.
# NOTE(review): presumably needed so that very long merged paragraphs are
# accepted by the pipeline — confirm.
model.max_length = 5000000

In [None]:
def first_letter(s):
    """Return the first character of ``s`` that matches ``[a-z]``, ignoring case.

    Digits, punctuation and whitespace in front of that character are skipped.
    When ``s`` contains no matching character at all, the upper-case placeholder
    ``'A'`` is returned instead.
    """
    match = re.compile(r'[a-z]', re.I).search(s)
    return 'A' if match is None else match.group()

In [None]:
def end_of_sentence(text):
    """Tell whether ``text`` ends like a finished sentence.

    Trailing (and leading) newlines and spaces are ignored. The text counts as
    finished when it ends with one of ``.``, ``?``, ``!`` or ``…``, optionally
    followed by a single closing ``”``, ``"`` or ``'``.

    Returns:
        bool: True if such an ending is present, False otherwise.
    """
    terminators = ('.', '?', '!', '…')
    closers = ('', '”', '"', '\'')
    # Every accepted ending: a terminator alone or followed by one closing quote.
    endings = tuple(mark + closer for mark in terminators for closer in closers)
    return text.strip('\n ').endswith(endings)

In [None]:
def preprocess(text):
    """Remove Wikipedia citation markers from ``text``.

    Leading and trailing newlines are stripped first; then every numeric
    footnote marker such as ``[12]`` and every literal ``[citation needed]``
    tag is deleted.

    Args:
        text: Text of an HTML element.

    Returns:
        str: The cleaned text.
    """
    # Raw string literal: in a normal literal `\[` and `\d` are invalid escape
    # sequences, which Python flags with a warning when the cell is compiled.
    return re.sub(r'\[\d+\]|\[citation needed\]', '', text.strip('\n'))

def get_query(revision_id):
    """Return the opening text of a Wikipedia article revision.

    The rendered HTML of the revision is fetched from the English Wikipedia
    ``action=parse`` API. Text is taken from the first ``<p>`` element and from
    the elements that follow it as siblings, up to (not including) the sibling
    that equals the first ``<meta>`` element of the page. Every piece is cleaned
    with ``preprocess`` and the pieces are joined with newlines; list (``<ul>``)
    text is preceded by an extra newline.

    Args:
        revision_id: Revision ID ("oldid") of the article version to fetch.

    Returns:
        str: The collected text.

    Raises:
        requests.RequestException: On a network failure, a timeout or an HTTP
            error status.
        RuntimeError: If the API response reports an error (for example an
            unknown revision ID).
        ValueError: If the returned HTML contains no ``<p>`` element.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'oldid': revision_id,
        'prop': 'text',
        'format': 'json'
    }

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    if 'error' in data:
        # Surface the API's own message instead of a bare KeyError on 'parse'.
        error = data['error']
        detail = error.get('info', error) if isinstance(error, dict) else error
        raise RuntimeError(f'Wikipedia API error for revision {revision_id}: {detail}')

    html_content = data['parse']['text']['*']
    soup = BeautifulSoup(html_content, 'html.parser')

    start_summary = soup.find('p')
    if start_summary is None:
        raise ValueError(f'No <p> element found in revision {revision_id}')
    # NOTE(review): the first <meta> element is presumably the placeholder that
    # separates the lead from the rest of the article — confirm. If the page has
    # no <meta>, this is None and all following siblings are collected.
    start_next_section = soup.find('meta')
    summary = [preprocess(start_summary.text)]

    for elem in start_summary.next_siblings:
        if elem == start_next_section:
            break
        # Bare text nodes between elements have no tag name and are skipped.
        if elem.name is not None:
            text = preprocess(elem.text)
            if elem.name == 'ul':
                text = '\n' + text
            summary.append(text)

    return '\n'.join(summary)

In [None]:
def get_lemmatized_query(query):
    """Convert ``query`` into a list of lemmas.

    The lower-cased text is run through the notebook-level spaCy pipeline
    ``model``. A token contributes its lemma when the token is purely
    alphabetic and the lemma is not in ``stopwords``. Token order is kept and
    repeated lemmas are not removed.

    Args:
        query: Text to lemmatize.

    Returns:
        list[str]: The retained lemmas.
    """
    doc = model(query.lower())
    return [
        token.lemma_
        for token in doc
        if token.is_alpha and token.lemma_ not in stopwords
    ]

In [None]:
def _record_paragraph(pages_by_paragraph, text, pages):
    """Store ``pages`` under ``text``, adding to pages already recorded for the same text."""
    if text in pages_by_paragraph:
        known_pages = pages_by_paragraph[text]
        known_pages.extend(page for page in pages if page not in known_pages)
    else:
        pages_by_paragraph[text] = pages


def get_new_paragraphs(paragraphs, page_numbers):
    """Merge text fragments that belong to the same paragraph.

    A fragment is appended to the paragraph being built when that paragraph
    does not yet end a sentence (see ``end_of_sentence``) or when the fragment's
    first letter (see ``first_letter``) is lower-case; otherwise the fragment
    starts a new paragraph. Fragments consisting only of spaces are skipped,
    except for the very first fragment, which always opens the first paragraph.

    Args:
        paragraphs: Text fragments in reading order.
        page_numbers: Page number of each fragment, parallel to ``paragraphs``.

    Returns:
        dict: Maps each merged paragraph text to the list of page numbers it
        was taken from, in order of appearance. When the same paragraph text
        occurs more than once, its page lists are combined rather than the
        later occurrence replacing the earlier one. An empty input gives an
        empty dict.
    """
    pages_by_paragraph = {}
    if not paragraphs:
        return pages_by_paragraph

    # The paragraph under construction is kept in local variables and written
    # to the dict only once it is complete, so unfinished text never acts as a
    # dict key that could collide with another paragraph.
    current_text = paragraphs[0]
    current_pages = [page_numbers[0]]
    for i in range(1, len(paragraphs)):
        fragment = paragraphs[i]
        if not fragment.strip('  '):
            continue
        if not end_of_sentence(current_text) or first_letter(fragment).islower():
            current_text += fragment
            # Record a page only when the paragraph moves on to another page.
            if current_pages[-1] != page_numbers[i]:
                current_pages.append(page_numbers[i])
        else:
            _record_paragraph(pages_by_paragraph, current_text, current_pages)
            current_text = fragment
            current_pages = [page_numbers[i]]
    _record_paragraph(pages_by_paragraph, current_text, current_pages)
    return pages_by_paragraph

In [None]:
def get_paragraphs_bm25(lemmatized_query, path_to_grammars, filename, out_path, layout=False, n_top=50):
    """Rank the paragraphs of one PDF against a query with BM25 and save the best ones.

    The PDF text is split into fragments at blank lines, fragments are merged
    into paragraphs with ``get_new_paragraphs``, each paragraph is lemmatized
    with ``get_lemmatized_query``, and the ``n_top`` highest-scoring paragraphs
    are written to ``<out_path>/<file name without extension>.csv`` with the
    columns ``Paragraph`` and ``Page number``. Nothing is done when that CSV
    already exists, so an interrupted run can be resumed.

    Args:
        lemmatized_query: Query as a list of lemmas.
        path_to_grammars: Folder containing the PDF.
        filename: Name of the PDF inside ``path_to_grammars``.
        out_path: Folder for the resulting CSV.
        layout: Passed to ``pdftotext.PDF`` as its ``physical`` option.
        n_top: Number of paragraphs to keep (default 50).

    Returns:
        None. If the PDF yields no tokens at all, a warning is issued and no
        CSV is written.
    """
    # splitext/join instead of `[:-4]` and string concatenation: correct for any
    # extension length and whether or not out_path ends with a separator.
    csv_name = os.path.splitext(os.path.basename(filename))[0] + '.csv'
    csv_path = join(out_path, csv_name)
    if isfile(csv_path):
        return

    with open(join(path_to_grammars, filename), 'rb') as f:
        pdf = pdftotext.PDF(f, physical=layout)

    paragraphs = []
    page_numbers = []
    for page_number, page_text in enumerate(pdf, start=1):
        fragments = re.split('\n\n', page_text)
        paragraphs.extend(fragments)
        page_numbers.extend([page_number] * len(fragments))

    new_dict = get_new_paragraphs(paragraphs, page_numbers) if paragraphs else {}
    new_paragraphs = list(new_dict)

    lemmatized_paragraphs = [get_lemmatized_query(paragraph) for paragraph in new_paragraphs]

    if not any(lemmatized_paragraphs):
        # rank_bm25 divides by the corpus and vocabulary size while building its
        # index, so a PDF without extractable words (e.g. a scan without a text
        # layer) would abort the whole batch with a ZeroDivisionError.
        import warnings
        warnings.warn(f'No extractable text in {filename!r}; no CSV written.')
        return

    bm25 = BM25Okapi(lemmatized_paragraphs)

    best_paragraphs = bm25.get_top_n(lemmatized_query, new_paragraphs, n=n_top)

    df = pd.DataFrame({
        'Paragraph': best_paragraphs,
        'Page number': [new_dict[key] for key in best_paragraphs],
    })
    df.to_csv(csv_path, index=False)

In [None]:
path_to_grammars = '/content/drive/MyDrive/Grammars Benchmark'
# Keep only PDF files: every listed file is opened with pdftotext, which cannot
# parse anything else. Sorting makes the processing order reproducible, since
# the order returned by listdir is arbitrary.
grammar_files = sorted(
    f for f in listdir(path_to_grammars)
    if isfile(join(path_to_grammars, f)) and f.lower().endswith('.pdf')
)

## WALS 81A: Order of Subject, Object and Verb

In [None]:
# Folder that receives one CSV of retrieved paragraphs per grammar for WALS 81A.
paragraphs_path = '/content/drive/MyDrive/Grammars Paragraphs/WALS 81A/Wikipedia 50/'

# Query text: summary of the Wikipedia article titled "Word order",
# fetched by revision ID so that it stays fixed at the article version
# of September 3rd, 2024.
query = get_query(1240489972)
# Show the query text as the cell output.
query

'In linguistics, word order (also known as linear order) is the order of the syntactic constituents of a language. Word order typology studies it from a cross-linguistic perspective, and examines how languages employ different orders. Correlations between orders found in different syntactic sub-domains are also of interest. The primary word orders that are of interest are\n\nthe constituent order of a clause, namely the relative order of subject, object, and verb;\nthe order of modifiers (adjectives, numerals, demonstratives, possessives, and adjuncts) in a noun phrase;\nthe order of adverbials.\nSome languages use relatively fixed word order, often relying on the order of constituents to convey grammatical information. Other languages—often those that convey grammatical information through inflection—allow more flexible word order, which can be used to encode pragmatic information, such as topicalisation or focus. However, even languages with flexible word order have a preferred or ba

In [None]:
# Lemmatize the query once, then retrieve the best-matching paragraphs of every
# grammar; grammars whose CSV already exists are skipped by get_paragraphs_bm25.
lemmatized_query = get_lemmatized_query(query)
for filename in tqdm(grammar_files):
    get_paragraphs_bm25(
        lemmatized_query=lemmatized_query,
        path_to_grammars=path_to_grammars,
        filename=filename,
        out_path=paragraphs_path,
    )

  0%|          | 0/148 [00:00<?, ?it/s]

## GB107: Can standard negation be marked by an affix, clitic or modification of the verb?

In [None]:
# Folder that receives one CSV of retrieved paragraphs per grammar for GB 107.
paragraphs_path = '/content/drive/MyDrive/Grammars Paragraphs/GB 107/Wikipedia 50/'

# Query text: summary of the Wikipedia article titled "Affirmation and negation",
# fetched by revision ID so that it stays fixed at the article version
# of September 3rd, 2024.
query = get_query(1202783032)
# Show the query text as the cell output.
query

'In linguistics and grammar, affirmation (abbreviated AFF) and negation (NEG) are ways in which grammar encodes positive and negative polarity into verb phrases, clauses, or other utterances. An affirmative (positive) form is used to express the validity or truth of a basic assertion, while a negative form expresses its falsity. For example, the affirmative  sentence "Joe is here" asserts that it is true that Joe is currently located near the speaker. Conversely, the negative sentence "Joe is not here" asserts that it is not true that Joe is currently located near the speaker.\nThe grammatical category associated with affirmatives and negatives is called polarity. This means that a clause, sentence, verb phrase, etc. may be said to have either affirmative or negative polarity (its polarity may be either affirmative or negative). Affirmative is typically the unmarked polarity, whereas a negative statement is marked in some way. Negative polarity can be indicated by negating words or par

In [None]:
# Lemmatize the query once, then retrieve the best-matching paragraphs of every
# grammar; grammars whose CSV already exists are skipped by get_paragraphs_bm25.
lemmatized_query = get_lemmatized_query(query)
for filename in tqdm(grammar_files):
    get_paragraphs_bm25(
        lemmatized_query=lemmatized_query,
        path_to_grammars=path_to_grammars,
        filename=filename,
        out_path=paragraphs_path,
    )

  0%|          | 0/148 [00:00<?, ?it/s]

## WALS 116A: Polar Questions

In [None]:
# Folder that receives one CSV of retrieved paragraphs per grammar for WALS 116A.
paragraphs_path = '/content/drive/MyDrive/Grammars Paragraphs/WALS 116A/Wikipedia 50/'

# Query text: summary of the Wikipedia article titled "Yes–no question",
# fetched by revision ID so that it stays fixed at the article version
# of September 3rd, 2024.
query = get_query(1236424936)
# Show the query text as the cell output.
query

'In linguistics, a yes–no question, also known as a binary question, a polar question, or a general question, is a question whose expected answer is one of two choices, one that provides an affirmative answer to the question versus one that provides a negative answer to the question. Typically, in English, the choices are either "yes" or "no". Yes–no questions present an exclusive disjunction, namely a pair of alternatives of which only one is a felicitous answer. In English, such questions can be formed in both positive and negative forms:\n\npositive yes/no question:  "Will you be here tomorrow?"\nnegative yes/no question:  "Won\'t you be here tomorrow?"\nYes–no questions are in contrast with non-polar wh-questions. The latter are also called content questions, and are formed with the five Ws plus an H ( "who", "what", "where", "when", "why", "how"). Rather than restricting the range of possible answers to two alternatives, content questions are compatible with a broad range of alter

In [None]:
# Lemmatize the query once, then retrieve the best-matching paragraphs of every
# grammar; grammars whose CSV already exists are skipped by get_paragraphs_bm25.
lemmatized_query = get_lemmatized_query(query)
for filename in tqdm(grammar_files):
    get_paragraphs_bm25(
        lemmatized_query=lemmatized_query,
        path_to_grammars=path_to_grammars,
        filename=filename,
        out_path=paragraphs_path,
    )

  0%|          | 0/148 [00:00<?, ?it/s]

## WALS 49A: Number of Cases

In [None]:
# This feature uses its own set of grammars, so the folder and the file list
# defined earlier in the notebook are replaced from here on.
path_to_grammars = '/content/drive/MyDrive/Grammars Benchmark: Number of Cases'
# Keep only PDF files: every listed file is opened with pdftotext, which cannot
# parse anything else. Sorting makes the processing order reproducible, since
# the order returned by listdir is arbitrary.
grammar_files = sorted(
    f for f in listdir(path_to_grammars)
    if isfile(join(path_to_grammars, f)) and f.lower().endswith('.pdf')
)

In [None]:
# Folder that receives one CSV of retrieved paragraphs per grammar for WALS 49A.
paragraphs_path = '/content/drive/MyDrive/Grammars Paragraphs/WALS 49A/Wikipedia 50/'

# Query text: summary of the Wikipedia article titled "Grammatical case",
# fetched by revision ID so that it stays fixed at the article version
# of September 3rd, 2024.
query = get_query(1238822129)
# Show the query text as the cell output.
query

'A grammatical case is a category of nouns and noun modifiers (determiners, adjectives, participles, and numerals) that corresponds to one or more potential grammatical functions for a nominal group in a wording. In various languages, nominal groups consisting of a noun and its modifiers belong to one of a few such categories. For instance, in English, one says I see them and they see me: the nominative pronouns I/they represent the perceiver and the accusative pronouns me/them represent the phenomenon perceived. Here, nominative and accusative are cases, that is, categories of pronouns corresponding to the functions they have in representation.\nEnglish has largely lost its inflected case system but personal pronouns still have three cases, which are simplified forms of the nominative, accusative (including functions formerly handled by the dative) and genitive cases. They are used with personal pronouns: subjective case (I, you, he, she, it, we, they, who, whoever), objective case (m

In [None]:
# Lemmatize the query once, then retrieve the best-matching paragraphs of every
# grammar; grammars whose CSV already exists are skipped by get_paragraphs_bm25.
lemmatized_query = get_lemmatized_query(query)
for filename in tqdm(grammar_files):
    get_paragraphs_bm25(
        lemmatized_query=lemmatized_query,
        path_to_grammars=path_to_grammars,
        filename=filename,
        out_path=paragraphs_path,
    )

  0%|          | 0/148 [00:00<?, ?it/s]