In [1]:
import re
import spacy
from spacy.lang.en import English

# Load the spaCy English tokenizer
nlp = English()

In [2]:
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sjects': 'jects',
                  '\sness': 'ness',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ',
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual',
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [3]:
# Define a function to remove unwanted characters and normalize text
def clean_text(text, capitals=True, bracketed_fn=False, odd_words_dict={}):
    # Remove utf8 encoding characters and some punctuations
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\£\Â*_<>""⎫•{}Γ~]', ' ', text)
    text = re.sub(r'[\u2014\u2013\u2012-]', ' ', text)

    # Replace whitespace characters with actual whitespace
    text = re.sub(r'\s', ' ', text)

    # Replace odd quotation marks with a standard
    text = re.sub(r'[‘’“”]', "'", text)

    # Replace the ligatures ﬀ, ﬃ and ﬁ with their appropriate counterparts
    text = re.sub(r'ﬀ', 'ff', text)
    text = re.sub(r'ﬁ', 'fi', text)
    text = re.sub(r'ﬃ', 'ffi', text)

    # Remove or standardize some recurring common and meaningless words/phrases
    text = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', text)
    text = re.sub(r'(?i)Aufgabe\s+', ' ', text)
    text = re.sub(r',*\s+cf\.', ' ', text)

    # Some texts have footnotes conveniently in brackets - this removes them all,
    # with a safety measure for unpaired brackets, and deletes all brackets afterwards
    if bracketed_fn:
        text = re.sub(r'\[.{0,300}\]|\[.{0,300}\]|\[.{0,300}\}', ' ', text)
    text = re.sub(r'[{}\[\]]', ' ', text)

    # Unify some abbreviations
    text = re.sub(r'&', 'and', text)
    text = re.sub(r'\se\.g\.\s', ' eg ', text)
    text = re.sub(r'\si\.e\.\s', ' ie ', text)
    text = re.sub('coroll\.', 'coroll', text)
    text = re.sub('pt\.', 'pt', text)

    # Remove roman numerals, first capitalized ones
    text = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', text)
    # then lowercase
    text = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', text)

    # Remove periods and commas flanked by numbers
    text = re.sub(r'\d\.\d', ' ', text)
    text = re.sub(r'\d,\d', ' ', text)

    # Remove the number-letter-number pattern used for many citations
    text = re.sub(r'\d*\w{,2}\d', ' ', text)

    # Remove numerical characters
    text = re.sub(r'\d+', ' ', text)

    # Remove words of 2+ characters that are entirely capitalized
    # (these are almost always titles, headings, or speakers in a dialogue)
    # remove capital I's that follow capital words - these almost always roman numerals
    # some texts do use these capitalizations meaningfully, so we make this optional
    if capitals:
        text = re.sub(r'[A-Z]{2,}\s+I', ' ', text)
        text = re.sub(r'[A-Z]{2,}', ' ', text)

    # Remove isolated colons and semicolons that result from removal of titles
    text = re.sub(r'\s+:\s*', ' ', text)
    text = re.sub(r'\s+;\s*', ' ', text)

    # Remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
    text = re.sub(r'\s[^aAI\.]\s', ' ', text)
    text = re.sub(r'\s[^aAI\.]\s', ' ', text)
    text = re.sub(r'\s[^aAI\.]\s', ' ', text)
    text = re.sub(r'\s[^aAI\.]\s', ' ', text)
    text = re.sub(r'\s[^aAI\.]\s', ' ', text)
    text = re.sub(r'\s[^aAI\.]\s', ' ', text)

    # Remove isolated letters at the end of sentences or before commas
    text = re.sub(r'\s[^aI]\.', '.', text)
    text = re.sub(r'\s[^aI],', ',', text)

    # Deal with spaces around periods and commas
    text = re.sub(r'\s+,\s+', ', ', text)
    text = re.sub(r'\s+\.\s+', '. ', text)

    # Reduce multiple periods, commas, or whitespaces into a single one
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r',+', ',', text)
    text = re.sub(r'\s+', ' ', text)

    # Deal with isolated problem cases discovered in the data:
    for key in odd_words_dict.keys():
        text = re.sub(r''+key+'', odd_words_dict[key], text)

    return text.strip()


In [4]:
import requests

In [5]:
from google.colab import drive
import os
import requests

drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
drive_path = '/content/drive/MyDrive'


In [None]:
def get_text(path, encoding='utf-8'):
    f = open(path, 'r', encoding=encoding)
    text = f.read()
    f.close()
    return text
def get_guten(url):
    # retrieve the source text
    r = requests.get(url)
    r.encoding = 'utf-8'
    text = r.text
    return text

In [8]:
import pandas as pd

# Dictionary of texts by category
texts = {
    'plato': {
        'complete': get_text(drive_path + '/pj/Копия anselm_de_veritate.txt')
    },
    'aristotle': {
        'volume_1': get_text(drive_path + '/pj/Копия aristotle_complete_works_v1.txt'),
        'volume_2': get_text(drive_path + '/pj/Копия aristotle_complete_works_v2.txt')
    },
    'rationalists': {
        'spinoza_ethics': get_guten('http://www.gutenberg.org/cache/epub/3800/pg3800.txt'),
        'spinoza_improve_understanding': get_guten('http://www.gutenberg.org/cache/epub/1016/pg1016.txt'),
        'leibniz_theodicy': get_guten('http://www.gutenberg.org/cache/epub/17147/pg17147.txt'),
        'descartes_discourse_method': get_guten('http://www.gutenberg.org/cache/epub/59/pg59.txt'),
        'descartes_meditations': get_text(drive_path + '/pj/Копия descartes_meditations.txt'),
        'malebranche_search_truth': get_text(drive_path + '/pj/Копия malebranche_search_truth.txt')
    },
    'empiricists': {
        'locke_understanding_1': get_guten('http://www.gutenberg.org/cache/epub/10615/pg10615.txt'),
        'locke_understanding_2': get_guten('http://www.gutenberg.org/cache/epub/10616/pg10616.txt'),
        'locke_treatise_gov': get_guten('http://www.gutenberg.org/cache/epub/7370/pg7370.txt'),
        'hume_treatise': get_guten('http://www.gutenberg.org/cache/epub/4705/pg4705.txt'),
        'hume_natural_religion': get_guten('http://www.gutenberg.org/cache/epub/4583/pg4583.txt'),
        'berkeley_treatise': get_guten('http://www.gutenberg.org/cache/epub/4723/pg4723.txt'),
        'berkeley_three_dialogues': get_guten('http://www.gutenberg.org/cache/epub/4724/pg4724.txt')
    },
    'german_idealism': {
        'kant_practical_reason': get_text(drive_path + '/pj/Копия kant_critique_practical_reason.txt'),
        'kant_judgement': get_text(drive_path + '/pj/Копия kant_critique_judgement.txt'),
        'kant_pure_reason': get_text(drive_path + '/pj/Копия kant_pure_reason.txt'),
        'fichte_ethics': get_text(drive_path + '/pj/Копия fichte_system_of_ethics.txt'),
        'hegel_logic': get_text(drive_path + '/pj/Копия hegel_science_of_logic.txt'),
        'hegel_phenomenology': get_text(drive_path + '/pj/Копия hegel_phenomenology_of_spirit.txt'),
        'hegel_right': get_text(drive_path + '/pj/Копия hegel_elements_of_right.txt')
    },
    'analytic': {
        'russell_problems_of_phil': get_guten('http://www.gutenberg.org/cache/epub/5827/pg5827.txt'),
        'russell_analylsis_of_mind': get_guten('http://www.gutenberg.org/cache/epub/2529/pg2529.txt'),
        'moore_studies': get_guten('http://www.gutenberg.org/files/50141/50141-0.txt'),
        'wittgenstein_tractatus': get_text(drive_path + '/pj/Копия wittgenstein_tractatus.txt'),
        'wittgenstein_investigations': get_text(drive_path + '/pj/Копия wittgenstien_philosophical_investigations.txt'),
        'lewis_papers1': get_text(drive_path + '/pj/Копия lewis_papers_1.txt'),
        'lewis_papers2': get_text(drive_path + '/pj/Копия lewis_papers_2.txt'),
        'quine_quintessence': get_text(drive_path + '/pj/Копия quine_quintessence.txt'),
        'popper_science': get_text(drive_path + '/pj/Копия popper_logic_of_science.txt'),
        'kripke_troubles': get_text(drive_path + '/pj/Копия kripke_philosophical_troubles.txt'),
        'kripke_naming': get_text(drive_path + '/pj/Копия kripke_naming_necessity.txt')
    },
    'phenomenology': {
        'ponty_perception': get_text(drive_path + '/pj/Копия merleau-ponty_phenomenology_of_perception.txt'),
        'husserl_idea_of': get_text(drive_path + '/pj/Копия husserl_idea_of_phenomenology.txt'),
        'husserl_crisis': get_text(drive_path + '/pj/Копия husserl_crisis_of_euro_sciences.txt'),
        'heidegger_being_time': get_text(drive_path + '/pj/Копия heidegger_being_and_time.txt'),
        'heidegger_track': get_text(drive_path + '/pj/Копия heidegger_off_the_beaten_track.txt')
    },
    'continental': {
        'foucault_order': get_text(drive_path + '/pj/Копия foucault_order_of_things.txt'),
        'foucault_madness': get_text(drive_path + '/pj/Копия foucault_history_of_madness.txt'),
        'foucault_clinic': get_text(drive_path + '/pj/Копия foucault_birth_of_clinic.txt'),
        'derrida_writing': get_text(drive_path + '/pj/Копия derrida_writing_difference.txt'),
        'deleuze_oedipus': get_text(drive_path + '/pj/Копия deleuze_guattari_anti-oedipus.txt'),
        'deleuze_difference': get_text(drive_path + '/pj/Копия deleuze_difference_repetition.txt')
    },
    'marxism': {
        'marx_kapital': get_text(drive_path + '/pj/Копия marx_kapital.txt'),
        'marx_manifesto': get_text(drive_path + '/pj/Копия marx_manifesto.txt'),
        'lenin_essential': get_text(drive_path + '/pj/Копия lenin_essential_works.txt')
    },
    'capitalist_economics': {
        'smith_wealth': get_guten('http://www.gutenberg.org/files/3300/3300-0.txt'),
        'ricardo_political_economy': get_guten('http://www.gutenberg.org/cache/epub/33310/pg33310.txt'),
        'keynes_employment': get_text(drive_path + '/pj/Копия keynes_theory_of_employment.txt')
    },
    'abai 45 words' : {
        'abai1': get_text(drive_path + '/pj/abai/word_1.txt'),
        'abai2': get_text(drive_path + '/pj/abai/word_2.txt'),
        'abai3': get_text(drive_path + '/pj/abai/word_3.txt'),
        'abai4': get_text(drive_path + '/pj/abai/word_4.txt'),
        'abai5': get_text(drive_path + '/pj/abai/word_5.txt'),
        'abai6': get_text(drive_path + '/pj/abai/word_6.txt'),
        'abai7': get_text(drive_path + '/pj/abai/word_7.txt'),
        'abai8': get_text(drive_path + '/pj/abai/word_8.txt'),
        'abai9': get_text(drive_path + '/pj/abai/word_9.txt'),
        'abai10': get_text(drive_path + '/pj/abai/word_10.txt'),
        'abai11': get_text(drive_path + '/pj/abai/word_11.txt'),
        'abai12': get_text(drive_path + '/pj/abai/word_12.txt'),
        'abai13': get_text(drive_path + '/pj/abai/word_13.txt'),
        'abai14': get_text(drive_path + '/pj/abai/word_14.txt'),
        'abai15': get_text(drive_path + '/pj/abai/word_15.txt'),
        'abai16': get_text(drive_path + '/pj/abai/word_16.txt'),
        'abai17': get_text(drive_path + '/pj/abai/word_17.txt'),
        'abai18': get_text(drive_path + '/pj/abai/word_18.txt'),
        'abai19': get_text(drive_path + '/pj/abai/word_19.txt'),
        'abai20': get_text(drive_path + '/pj/abai/word_20.txt'),
        'abai21': get_text(drive_path + '/pj/abai/word_21.txt'),
        'abai22': get_text(drive_path + '/pj/abai/word_22.txt'),
        'abai23': get_text(drive_path + '/pj/abai/word_23.txt'),
        'abai24': get_text(drive_path + '/pj/abai/word_24.txt'),
        'abai25': get_text(drive_path + '/pj/abai/word_25.txt'),
        'abai26': get_text(drive_path + '/pj/abai/word_26.txt'),
        'abai27': get_text(drive_path + '/pj/abai/word_27.txt'),
        'abai28': get_text(drive_path + '/pj/abai/word_28.txt'),
        'abai29': get_text(drive_path + '/pj/abai/word_29.txt'),
        'abai30': get_text(drive_path + '/pj/abai/word_30.txt'),
        'abai31': get_text(drive_path + '/pj/abai/word_31.txt'),
        'abai32': get_text(drive_path + '/pj/abai/word_32.txt'),
        'abai33': get_text(drive_path + '/pj/abai/word_33.txt'),
        'abai34': get_text(drive_path + '/pj/abai/word_34.txt'),
        'abai35': get_text(drive_path + '/pj/abai/word_35.txt'),
        'abai36': get_text(drive_path + '/pj/abai/word_36.txt'),
        'abai37': get_text(drive_path + '/pj/abai/word_37.txt'),
        'abai38': get_text(drive_path + '/pj/abai/word_38.txt'),
        'abai39': get_text(drive_path + '/pj/abai/word_39.txt'),
        'abai40': get_text(drive_path + '/pj/abai/word_40.txt'),
        'abai41': get_text(drive_path + '/pj/abai/word_41.txt'),
        'abai42': get_text(drive_path + '/pj/abai/word_42.txt'),
        'abai43': get_text(drive_path + '/pj/abai/word_43.txt'),
        'abai44': get_text(drive_path + '/pj/abai/word_44.txt'),
        'abai45': get_text(drive_path + '/pj/abai/word_45.txt'),

    }
}




In [9]:
nlp.add_pipe('sentencizer')


<spacy.pipeline.sentencizer.Sentencizer at 0x7e75db4b52c0>

In [10]:
nlp.max_length = 5000000

# Function to tokenize text into sentences
def extract_sentences(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

# Convert texts dictionary to a DataFrame
data = []
for school, texts_dict in texts.items():
    for title, text in texts_dict.items():
        # Tokenize each text into sentences and flatten them into the data list
        sentences = extract_sentences(text)
        for sentence in sentences:
            data.append({'school': school, 'title': title, 'sentence_str': sentence})

df = pd.DataFrame(data)

In [11]:
df['cleaned_text'] = df['sentence_str'].apply(clean_text)


In [12]:
df.head(-5)

Unnamed: 0,school,title,sentence_str,cleaned_text
0,plato,complete,of of Translated by and The Arthur.,of of Translated by and The Arthur.
1,plato,complete,Banning Press Minneapolis In the notes to the ...,Banning Press Minneapolis In the notes to the ...
2,plato,complete,A reference such as 'S indicates 'F. Schmitt's...,A reference such as 'S indicates 'F. Schmitt's...
3,plato,complete,line.',line.'
4,plato,complete,Library of Congress Control Number: Printed in...,Library of Congress Control Number: Printed in...
...,...,...,...,...
340278,abai 45 words,abai45,"The proof of the existence of one God, unique ...","The proof of the existence of one God, unique ..."
340279,abai 45 words,abai45,"We are not demiurges, but mortals who know thi...","We are not demiurges, but mortals who know thi..."
340280,abai 45 words,abai45,We are the servants of love and justice.,We are the servants of love and justice.
340281,abai 45 words,abai45,And we differ from one another in how well we ...,And we differ from one another in how well we ...


In [13]:
#@title
# ## some code for checking oddities

# # unfortunately it has to be done ad hoc and as they are discovered,
# # so we will not go too deep into it
# #
# # the code is commented out to make running the notebook smoother


word_list = [
 'tent',
 'per',
 'cent'
 'imma']

word_checker = []
for word in word_list:
   word_check_slice = df[(df['sentence_str'].str.contains('\s'+word+'\s'.lower()))].copy()
   word_check_slice['word'] = word
   word_checker.append(word_check_slice)

print(len(word_checker))
print(len(word_list))



3
3


In [14]:
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,school,title,sentence_str,cleaned_text
5043,aristotle,volume_1,"In regard to subjects which must have one and one only of two predicates, as (e.g.) a man must have either illness or health, supposing we are well supplied as regards the one for arguing its pres...","In regard to subjects which must have one and one only of two predicates, as (e.g.) a man must have either illness or health, supposing we are well supplied as regards the one for arguing its pres..."
240040,phenomenology,heidegger_being_time,What is at hand is not thereby observed and stared at simply as something objectively present.,What is at hand is not thereby observed and stared at simply as something objectively present.
114081,german_idealism,kant_judgement,"It is so presupposed a priori, and without regard to the practical, by judgement.","It is so presupposed a priori, and without regard to the practical, by judgement."
45147,aristotle,volume_2,"The law about the Spartan admirals has often been censured, and with justice; it is a source of dissension, for the kings are perpetual generals, and this office of admiral is but the setting up o...","The law about the Spartan admirals has often been censured, and with justice; it is a source of dissension, for the kings are perpetual generals, and this office of admiral is but the setting up o..."
107201,german_idealism,kant_practical_reason,"Gram, (Iowa City, la.:","Gram, (Iowa City, la.:"
223820,analytic,kripke_naming,"Everest, Existential statements, Fermat's Last Theorem, Feynman, Richard, Fictional entities, see also Legendary characters, Santa Claus, Unicorns Fixing the reference (referent) of a term, distin...","Everest, Existential statements, Fermat's Last Theorem, Feynman, Richard, Fictional entities, see also Legendary characters, Santa Claus, Unicorns Fixing the reference (referent) of a term, distin..."
101212,empiricists,hume_treatise,"When any object is presented, the idea of its usual attendant immediately strikes us, as something real and solid.","When any object is presented, the idea of its usual attendant immediately strikes us, as something real and solid."
260492,continental,foucault_madness,"When he mentions an experience, he never does anything other than to point to a difference in historical configurations of practices, beliefs and institutions.","When he mentions an experience, he never does anything other than to point to a difference in historical configurations of practices, beliefs and institutions."
314272,marxism,lenin_essential,"One of the manifestations of this change is the separation of industry from agriculture, the release of the social relationships in industry from the traditions of serfdom and the patriarchal syst...","One of the manifestations of this change is the separation of industry from agriculture, the release of the social relationships in industry from the traditions of serfdom and the patriarchal syst..."
182135,analytic,lewis_papers1,"that is, we replace Va by Va(/a/ and a by a(la/ and throughout.","that is, we replace Va by Va(/a/ and a by a(la/ and throughout."


In [15]:
df['school'].value_counts(normalize=True)


school
analytic                0.174105
german_idealism         0.172862
aristotle               0.157176
continental             0.121841
rationalists            0.093080
phenomenology           0.092801
marxism                 0.064992
empiricists             0.059053
capitalist_economics    0.058547
abai 45 words           0.003958
plato                   0.001584
Name: proportion, dtype: float64

In [16]:
#drop short sentences
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
df = df[df['sentence_length'] >= 20]

print(len(df))
print(df.sample(1))

315945
                 school                title  \
148059  german_idealism  hegel_phenomenology   

                                          sentence_str  \
148059  which are all irremediably general in meaning.   

                                          cleaned_text  sentence_length  
148059  which are all irremediably general in meaning.               46  


In [17]:
df['sentence_lowered'] = df['sentence_str'].map(lambda x: x.lower())


In [18]:
fn_words = ['ch\.', 'bk', 'sect\.', 'div\.', 'cf', 'ibid', 'prop\.', 'Q\.E\.D\.',
            'pt\.', 'coroll\.', 'cf\.']

# Initialize a list to collect DataFrames of sentences with footnote words
found_words_list = []

# Search for sentences containing the footnote words and collect them
for word in fn_words:
    found_word = df[df['sentence_lowered'].str.contains('\s' + re.escape(word.lower()))].copy()
    found_word['word'] = word
    found_words_list.append(found_word)

fn_df = pd.concat(found_words_list, ignore_index=True)



In [19]:
len(fn_df)
fn_df.sample(5)

Unnamed: 0,school,title,sentence_str,cleaned_text,sentence_length,sentence_lowered,word
220,german_idealism,hegel_right,"Motiv, Trieb, Triebfider consciousness BewujJtsein Beziehung reference; relation(ship) cf.","Motiv, Trieb, Triebfider consciousness BewujJtsein Beziehung reference; relation(ship)",90,"motiv, trieb, triebfider consciousness bewujjtsein beziehung reference; relation(ship) cf.",cf
504,phenomenology,ponty_perception,"Stumpf, quoted by Koehler, ibid.,.","Stumpf, quoted by Koehler, ibid.,.",34,"stumpf, quoted by koehler, ibid.,.",ibid
399,german_idealism,hegel_right,"Erkennen, Kenntnis science; learning welfare volition; willing dignity cf.","Erkennen, Kenntnis science; learning welfare volition; willing dignity",74,"erkennen, kenntnis science; learning welfare volition; willing dignity cf.",cf
394,german_idealism,hegel_right,electoral contract cf.,electoral contract,22,electoral contract cf.,cf
126,german_idealism,hegel_logic,"In its true presentation, this exposition is the preceding whole of the logical Cf.","In its true presentation, this exposition is the preceding whole of the logical Cf.",83,"in its true presentation, this exposition is the preceding whole of the logical cf.",cf


In [20]:
df = df.drop(df[df['sentence_lowered'].str.contains('\s+bk'.lower())].index)
len(df)

315921

In [21]:
#selfmention
authors = [
    'Plato', 'Aristotle', 'Spinoza', 'Leibniz', 'Descartes', 'Malebranche',
    'Locke', 'Hume', 'Berkeley', 'Kant', 'Fichte', 'Hegel',
    'Russell', 'Moore', 'Wittgenstein', 'Lewis', 'Quine', 'Popper', 'Kripke',
    'Ponty', 'Husserl', 'Heidegger',
    'Foucault', 'Derrida', 'Deleuze',
    'Marx', 'Lenin',
    'Smith', 'Ricardo', 'Keynes',
    'Abai'
]
df = df[~df['sentence_lowered'].apply(lambda sentence: any(author.lower() in sentence for author in authors))]
len(df)

298405

In [22]:
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())


5606

In [23]:
for school in df['school'].unique():
  print(school)
  print(len(df.loc[df['school'] == school]['sentence_str']) -
        len(df.loc[df['school'] == school]['sentence_str'].drop_duplicates()))

plato
10
aristotle
2511
rationalists
372
empiricists
639
german_idealism
653
analytic
384
phenomenology
149
continental
215
marxism
131
capitalist_economics
165
abai 45 words
5


In [24]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df.sample(5)

Unnamed: 0,school,title,sentence_str,cleaned_text,sentence_length,sentence_lowered
217027,analytic,kripke_troubles,The Ways of Paradox and Other Essays.,The Ways of Paradox and Other Essays.,37,the ways of paradox and other essays.
43032,aristotle,volume_2,"The three exceptions are the Categories and de Interpretatione, where the translations of.","The three exceptions are the Categories and de Interpretatione, where the translations of.",90,"the three exceptions are the categories and de interpretatione, where the translations of."
18116,aristotle,volume_1,The general editors of the original Translation did not require from their translators any uniformity in the rendering of technical and semitechnical terms.,The general editors of the original Translation did not require from their translators any uniformity in the rendering of technical and semitechnical terms.,156,the general editors of the original translation did not require from their translators any uniformity in the rendering of technical and semitechnical terms.
66822,rationalists,descartes_discourse_method,Contact the Foundation as set forth in Section below.,Contact the Foundation as set forth in Section below.,53,contact the foundation as set forth in section below.
4690,aristotle,volume_1,These asterisks appear both in the Table of Contents and on the title pages of the individual works concerned.,These asterisks appear both in the Table of Contents and on the title pages of the individual works concerned.,110,these asterisks appear both in the table of contents and on the title pages of the individual works concerned.


In [25]:
doubles_df[doubles_df['school'] != 'abai'].sample(5)


Unnamed: 0,school,title,sentence_str,cleaned_text,sentence_length,sentence_lowered
15068,aristotle,volume_1,Spriggs InteLex Corporation.,Spriggs InteLex Corporation.,28,spriggs intelex corporation.
110935,german_idealism,kant_practical_reason,"Stuttgart Bad Cannstatt: Frommann Holzboog,.","Stuttgart Bad Cannstatt: Frommann Holzboog,.",44,"stuttgart bad cannstatt: frommann holzboog,."
16746,aristotle,volume_1,"References consist of a page number, a column letter, and a line number.","References consist of a page number, a column letter, and a line number.",72,"references consist of a page number, a column letter, and a line number."
108800,german_idealism,kant_practical_reason,"the Critique of Pure Reason, A. Cf.","the Critique of Pure Reason,. Cf.",35,"the critique of pure reason, a. cf."
66358,rationalists,leibniz_theodicy,"If you do not charge anything for copies of this eBook, complying with the trademark license is very easy.","If you do not charge anything for copies of this eBook, complying with the trademark license is very easy.",106,"if you do not charge anything for copies of this ebook, complying with the trademark license is very easy."


In [26]:
#kant kas short sentences still why
non_kant_indexes = df[(df['title'] != 'critique of pure reason') &
                       (df['sentence_str'].duplicated(keep=False))].index
kant_short_indexes = df[(df['title'] == 'critique of pure reason') &
                        (df['sentence_str'].duplicated(keep=False)) &
                        (df['sentence_length'] < 40)].index
kant_long_indexes = df[(df['title'] == 'critique of pure reason') &
                        (df['sentence_str'].duplicated(keep='first')) &
                        (df['sentence_length'] >= 40)].index

indexes_to_drop = [non_kant_indexes, kant_short_indexes, kant_long_indexes]
for index in indexes_to_drop:
  df = df.drop(index)

len(df)

291413

In [27]:
#foreign words - german
(df[df['sentence_str'].str.contains('\sder\s')]).sample(5)


Unnamed: 0,school,title,sentence_str,cleaned_text,sentence_length,sentence_lowered
231688,phenomenology,ponty_perception,"Goldstein, ber die Abh ngigkeit der Bewegungen von optischen Vorg ngen, Monatschrift Psychiatrie und Neurologie, Festschrift Liepmann,.","Goldstein, ber die Abh ngigkeit der Bewegungen von optischen Vorg ngen, Monatschrift Psychiatrie und Neurologie, Festschrift Liepmann,.",135,"goldstein, ber die abh ngigkeit der bewegungen von optischen vorg ngen, monatschrift psychiatrie und neurologie, festschrift liepmann,."
110042,german_idealism,kant_practical_reason,"Frankfurt and Leipzig: Gebr der Pf hler,.","Frankfurt and Leipzig: Gebr der Pf hler,.",41,"frankfurt and leipzig: gebr der pf hler,."
134760,german_idealism,hegel_logic,"anonymous nesidemus, oder über der vom Herrn Prof. Reinhold in Jena gelieferten Elementarphilosophie, nebst einer Verteidigung gegen die Anmassungen der Venunftkritik,.","anonymous nesidemus, oder über der vom Herrn Prof. Reinhold in Jena gelieferten Elementarphilosophie, nebst einer Verteidigung gegen die Anmassungen der Venunftkritik,.",169,"anonymous nesidemus, oder über der vom herrn prof. reinhold in jena gelieferten elementarphilosophie, nebst einer verteidigung gegen die anmassungen der venunftkritik,."
227529,phenomenology,ponty_perception,"The colour of the Goldstein and Rosenthal, Zum Problem der Wirkung der Farben auf den Organismus, pp.","The colour of the Goldstein and Rosenthal, Zum Problem der Wirkung der Farben auf den Organismus, pp.",101,"the colour of the goldstein and rosenthal, zum problem der wirkung der farben auf den organismus, pp."
203808,analytic,popper_science,"Reininger, Metaphysik der Wirklichkeit, .","Reininger, Metaphysik der Wirklichkeit, .",41,"reininger, metaphysik der wirklichkeit, ."


In [28]:
df = df.drop(df[df['sentence_str'].str.contains('\sder\s')].index)

len(df)

290801

In [29]:
#french
df[df['sentence_str'].str.contains('\sil\s')].sample(5)


Unnamed: 0,school,title,sentence_str,cleaned_text,sentence_length,sentence_lowered
158035,german_idealism,hegel_right,"The buds have the tree within Ciand il coiitain its entire strength, although they are not yet the tree itself.","The buds have the tree within Ciand il coiitain its entire strength, although they are not yet the tree itself.",111,"the buds have the tree within ciand il coiitain its entire strength, although they are not yet the tree itself."
120607,german_idealism,kant_pure_reason,"The second part of the transcendental logic must therefore be a critique of this dialectical illusion, and is called transcendental dialectic, not as an art of dogmatically arousing such il lusion...","The second part of the transcendental logic must therefore be a critique of this dialectical illusion, and is called transcendental dialectic, not as an art of dogmatically arousing such il lusion...",679,"the second part of the transcendental logic must therefore be a critique of this dialectical illusion, and is called transcendental dialectic, not as an art of dogmatically arousing such il lusion..."
274773,continental,foucault_clinic,a il de la difference dans les systernes de classification dont on se Bert avec avantage dans ('etude de I'histoire naturelle et ceux qui peuvent gitre profitables a la connaissance des maladies? (,a il de la difference dans les systernes de classification dont on se Bert avec avantage dans ('etude de I'histoire naturelle et ceux qui peuvent gitre profitables a la connaissance des maladies? (,197,a il de la difference dans les systernes de classification dont on se bert avec avantage dans ('etude de i'histoire naturelle et ceux qui peuvent gitre profitables a la connaissance des maladies? (
310990,marxism,marx_kapital,"De jour en jour il deviant donc plus clair que les rapports de production dans lesquels se meut la bourgeoisie n'ont pas un caract re un, un catact re simple, mais un caract re de duplicit que dan...","De jour en jour il deviant donc plus clair que les rapports de production dans lesquels se meut la bourgeoisie n'ont pas un caract re un, un catact re simple, mais un caract re de duplicit que dan...",643,"de jour en jour il deviant donc plus clair que les rapports de production dans lesquels se meut la bourgeoisie n'ont pas un caract re un, un catact re simple, mais un caract re de duplicit que dan..."
311013,marxism,marx_kapital,"Les nations pauvres, c'est le peuple est son aise; et les nations riches, c'est il est ordinairement pauvre.","Les nations pauvres, c'est le peuple est son aise; et les nations riches, c'est il est ordinairement pauvre.",108,"les nations pauvres, c'est le peuple est son aise; et les nations riches, c'est il est ordinairement pauvre."


In [30]:
df = df.drop(df[df['sentence_str'].str.contains('\sil\s')].index)

len(df)

290750

In [31]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\smodis\s')].index)

len(df)

290739

In [32]:

# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

290727

In [33]:

# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)

len(df)

290707

In [34]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

290470

In [35]:
df['school'].value_counts(normalize=True)


school
analytic                0.180797
aristotle               0.160041
german_idealism         0.157476
continental             0.120367
phenomenology           0.096371
rationalists            0.090519
capitalist_economics    0.063728
empiricists             0.062919
marxism                 0.061707
abai 45 words           0.004455
plato                   0.001622
Name: proportion, dtype: float64

In [36]:
from gensim.utils import simple_preprocess


In [37]:
non_content_patterns = [
    r'\btranslated by\b', r'\bprinted in\b', r'\blibrary of congress control number:\b',
    r'\bpress\b', r'\bminneapolis\b', r'\breference such as\b',
    r'\bpage intentionally left blank\b', r'\bpublisher\b', r'\bauthor\b',
    r'\bedition\b', r'\bvolume\b', r'\bintroduction\b', r'\bforeword\b',
    r'\babstract\b'
]

# Compile the patterns into a single regex pattern for efficiency
combined_pattern = re.compile('|'.join(non_content_patterns), re.IGNORECASE)

# Filter out sentences matching the non-content patterns
df = df[~df['sentence_str'].apply(lambda x: bool(combined_pattern.search(x)))]

In [47]:
def tokenize_text(text):
    return simple_preprocess(text.lower(), deacc=True, max_len=200)


def lemmatize_sentence(sentence):
    if not sentence or pd.isna(sentence):
        print("Skipping empty or NaN sentence.")
        return ""
    try:
        doc = nlp(sentence)
        lemmatized_txt = ' '.join([token.lemma_ for token in doc])
        return lemmatized_txt
    except Exception as e:
        print(f"Error processing sentence: {sentence}. Error: {e}")
        return ""



In [45]:
#check lemmatization
doc = nlp("The cats are running")
print([token.lemma_ for token in doc])


['the', 'cat', 'be', 'run']


In [48]:
df['tokenized_txt'] = df['sentence_str'].map(tokenize_text)

df['lemmatized_str'] = df['sentence_str'].apply(lemmatize_sentence)

# Print the DataFrame to ensure updates are visible
print(df[['sentence_str', 'lemmatized_str']])

                                                                                                                                                                                                   sentence_str  \
6                                                                                                                                                                                          All rights reserved.   
7                                                Preface to the Three Dialogues on Truth, Freedom, and Evil At different times in the past I wrote three treatises pertaining to the study of Sacred Scripture.   
8                                                        They are similar in having been writ ten in dialogue form; the person inquiring is designated 'the Stu dent,' and the person answering, 'the Teacher.'   
9       Because a fourth treatise which begins with the words 'De Grammatico,' and which I also published in dialogue form and regard as not with out use to

In [49]:
df.sample(5)


Unnamed: 0,school,title,sentence_str,cleaned_text,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
32056,aristotle,volume_2,"This can be illustrated from musical instruments; for, all other conditions being the same, it is the thinner strings that give shriller sounds.","This can be illustrated from musical instruments; for, all other conditions being the same, it is the thinner strings that give shriller sounds.",144,"this can be illustrated from musical instruments; for, all other conditions being the same, it is the thinner strings that give shriller sounds.","[this, can, be, illustrated, from, musical, instruments, for, all, other, conditions, being, the, same, it, is, the, thinner, strings, that, give, shriller, sounds]","this can be illustrate from musical instrument ; for , all other condition be the same , it be the thin string that give shriller sound ."
45194,aristotle,volume_2,"The Carthaginians are also considered to have an excellent form of government, which differs from that of any other state in several respects, though it is in some very like the Lacedaemonian.","The Carthaginians are also considered to have an excellent form of government, which differs from that of any other state in several respects, though it is in some very like the Lacedaemonian.",192,"the carthaginians are also considered to have an excellent form of government, which differs from that of any other state in several respects, though it is in some very like the lacedaemonian.","[the, carthaginians, are, also, considered, to, have, an, excellent, form, of, government, which, differs, from, that, of, any, other, state, in, several, respects, though, it, is, in, some, very,...","the Carthaginians be also consider to have an excellent form of government , which differ from that of any other state in several respect , though it be in some very like the Lacedaemonian ."
231276,phenomenology,ponty_perception,"phenomenology of perception which is given as part of the human lot, is not one for me as pure consciousness: it is still I who makes another to be for me and makes each of us be as human beings.","phenomenology of perception which is given as part of the human lot, is not one for me as pure consciousness: it is still I who makes another to be for me and makes each of us be as human beings.",195,"phenomenology of perception which is given as part of the human lot, is not one for me as pure consciousness: it is still i who makes another to be for me and makes each of us be as human beings.","[phenomenology, of, perception, which, is, given, as, part, of, the, human, lot, is, not, one, for, me, as, pure, consciousness, it, is, still, who, makes, another, to, be, for, me, and, makes, ea...","phenomenology of perception which be give as part of the human lot , be not one for I as pure consciousness : it be still I who make another to be for I and make each of we be as human being ."
143625,german_idealism,hegel_logic,"It is therefore fitting and unavoidable to have these names, 'subject' and 'predicate,' for the determinations of the judgment; as names, they are something indeterminate, still in need of determi...","It is therefore fitting and unavoidable to have these names, 'subject' and 'predicate,' for the determinations of the judgment; as names, they are something indeterminate, still in need of determi...",236,"it is therefore fitting and unavoidable to have these names, 'subject' and 'predicate,' for the determinations of the judgment; as names, they are something indeterminate, still in need of determi...","[it, is, therefore, fitting, and, unavoidable, to, have, these, names, subject, and, predicate, for, the, determinations, of, the, judgment, as, names, they, are, something, indeterminate, still, ...","it be therefore fitting and unavoidable to have these name , ' subject ' and ' predicate , ' for the determination of the judgment ; as name , they be something indeterminate , still in need of de..."
9185,aristotle,volume_1,"For since it has been shown that the quicker will pass over an equal magnitude in less time than the slower, suppose that A is quicker and slower, and that the slower has traversed the magnitude i...","For since it has been shown that the quicker will pass over an equal magnitude in less time than the slower, suppose that A is quicker and slower, and that the slower has traversed the magnitude i...",207,"for since it has been shown that the quicker will pass over an equal magnitude in less time than the slower, suppose that a is quicker and slower, and that the slower has traversed the magnitude i...","[for, since, it, has, been, shown, that, the, quicker, will, pass, over, an, equal, magnitude, in, less, time, than, the, slower, suppose, that, is, quicker, and, slower, and, that, the, slower, h...","for since it have be show that the quicker will pass over an equal magnitude in less time than the slow , suppose that a be quick and slow , and that the slower have traverse the magnitude in the ..."


In [51]:
from google.colab import files
df.to_csv('nlp_project_final.csv', index=False)
files.download('nlp_project_final.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [54]:
len(df)

285350