In [1]:
import re
import pandas as pd
import spacy

from transformers import pipeline
from tqdm.auto import tqdm

In [2]:
# Mount Google Drive at /content/drive so the CSV files used below are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the paragraph table from the mounted Drive. No index_col is passed, so the
# frame gets pandas' default 0..n-1 index.
data = pd.read_csv("/content/drive/MyDrive/paragraphs.csv")

In [4]:
# Preview the loaded frame (rendered as the cell's last expression).
data

Unnamed: 0,text
0,Every effort has been made to secure necessary...
1,The system of transliteration from Cyrillic us...
2,Introduction\n rona l d g r i g o r s u n y
3,The history of Russia in the twentieth century...
4,great divide between Soviet East and capitalis...
...,...
1696,"As has been noted, it was not accidental that ..."
1697,"29 Katherine Hodgson, “Soviet Women’s Poetry o..."
1698,P1: GDZ\n 0521834325c12.xml CY465-Chickering 0...
1699,"Women in the Soviet War Effort, 1941–1945 243"


In [5]:
candidate_labels = ["author information", "body text", "bibliography", "toc"]

# Position of each label inside `candidate_labels`. Kept for backward
# compatibility only: it must not be used to index classifier scores, because
# the zero-shot pipeline returns labels/scores re-ordered by descending score.
label2idx = {label: idx for idx, label in enumerate(candidate_labels)}

def is_author_zero_shot(para: str, *, threshold: float = 0.6, target="author information", classifier=None) -> bool:
    """Decide via zero-shot classification whether ``para`` belongs to ``target``.

    The paragraph is classified against ``candidate_labels`` (single-label
    mode). It counts as a hit only when ``target`` is the top-ranked label
    and that label's score reaches ``threshold``.

    Args:
        para: Paragraph text to classify.
        threshold: Minimum score the top label must reach.
        target: Label from ``candidate_labels`` to test for.
        classifier: Optional callable with the zero-shot pipeline call
            signature ``(text, labels, multi_label=...)`` returning a dict
            with ``"labels"`` and ``"scores"``. Defaults to the module-level
            ``zsc`` pipeline, looked up at call time.

    Returns:
        ``True`` if ``target`` is the best label with score >= ``threshold``,
        otherwise ``False``. Non-string or blank input yields ``False``.
    """
    # Non-string values (e.g. NaN coming out of a DataFrame column) and blank
    # strings carry no text to classify.
    if not isinstance(para, str) or not para.strip():
        return False

    clf = zsc if classifier is None else classifier
    res = clf(para, candidate_labels, multi_label=False)

    # labels[0] / scores[0] are the best label and its score. The previous
    # lookup scores[label2idx[target]] read the wrong score for every target
    # other than the one at position 0 of `candidate_labels`.
    top_label, top_score = res["labels"][0], res["scores"][0]
    return bool(top_label == target and top_score >= threshold)

In [6]:
# spaCy pipeline used by is_author_heuristic() to find PERSON entities and count tokens.
nlp = spacy.load("en_core_web_sm")

def is_author_heuristic(para: str) -> bool:
    """Cheap rule-based check for paragraphs that look like an author line.

    Two rules are applied in order:

    1. The paragraph consists solely of at least five capital letters, each
       optionally followed by whitespace (e.g. a letter-spaced name).
    2. The spaCy pipeline ``nlp`` finds one or two PERSON entities in a
       paragraph of at most ten tokens.

    Args:
        para: Paragraph text to inspect.

    Returns:
        ``True`` if either rule fires, otherwise ``False``. Non-string input
        yields ``False``.
    """
    # Non-string values (e.g. NaN coming out of a DataFrame column) cannot match.
    if not isinstance(para, str):
        return False

    # Collapse every whitespace run to one space and trim the ends. A single
    # replace("  ", " ") pass leaves runs of three or more spaces (and any
    # leading whitespace) in place, which made the pattern below miss them.
    collapsed = re.sub(r"\s+", " ", para).strip()
    if re.fullmatch(r"(?:[A-Z] ?){5,}", collapsed):
        return True

    # NER runs on the untouched paragraph text.
    doc = nlp(para)
    person_count = sum(1 for ent in doc.ents if ent.label_ == "PERSON")
    return 0 < person_count <= 2 and len(doc) <= 10


In [7]:

# Zero-shot classifier; is_author_zero_shot() reads it through the global `zsc`.
zsc = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# `nlp` is not reloaded here: it is already bound by the cell that defines
# is_author_heuristic(), and loading the same spaCy model twice is redundant.

# Positions (0-based, in iteration order over data.text) of every paragraph that
# either the heuristic or the zero-shot classifier flags. The heuristic is tried
# first, so the classifier only runs when the heuristic says no.
author_indexes = [
    pos
    for pos, para in enumerate(tqdm(data.text))
    if is_author_heuristic(para) or is_author_zero_shot(para, threshold=0.7)
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


  0%|          | 0/1701 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [8]:
# Inspect the flagged rows; author_indexes holds enumerate() positions, hence .iloc.
data.iloc[author_indexes]

Unnamed: 0,text
13,"1 Christopher Lasch, The American Liberals and..."
37,"38 Langston Hughes, I Wonder as I Wander: An A..."
52,58 The point about the shift from national cha...
96,"115 Meyer, ‘Coming to Terms with the Past’, p...."
114,"134 Richard Pipes, The Russian Revolution (New..."
...,...
1599,"1 Vernon Lidtke, The Alternative Culture: Soci..."
1602,3 The Workingman’s Programme (Arbeiter-Program...
1613,"6 A. G. Shliapnikov, Kanun semnadtsatogo goda...."
1619,"8 Neil Harding, Leninism (Durham, N.C.: Duke U..."


In [9]:
# Keep only the rows whose index label is not among the flagged entries.
# NOTE(review): author_indexes comes from enumerate(), i.e. positions; comparing
# them with index labels is equivalent only while `data` still has the default
# 0..n-1 index it received from read_csv.
mask = ~data.index.isin(author_indexes)
data = data.loc[mask]

In [10]:
# Persist the filtered paragraphs. The absolute path matches the
# drive.mount('/content/drive') mount point and the path used for reading, so
# the write no longer depends on the notebook's current working directory.
data.to_csv("/content/drive/MyDrive/paragraphs_without_authors.csv", index=False)