In [1]:
import re
import pandas as pd

def clean_attribution(attribution):
    """Return *attribution* with its markdown attribution markers removed.

    Every ``--_`` sequence in the string is deleted, and then any
    underscores remaining at either end of the result are trimmed.
    """
    without_dash_marker = attribution.replace("--_", "")
    trimmed = without_dash_marker.strip("_")
    return trimmed

def clean_quote(quote):
    """Return *quote* with markdown blockquote markers (``"> "``) removed.

    Two passes are applied in order:

    1. Delete every ``"> "`` that is not immediately followed by digits
       and a dot (so markers in front of numbered-list items survive
       this pass).
    2. Delete any ``"> "`` that still sits at the start of a line, which
       also removes the markers pass 1 left in front of numbered items.
    """
    inline_marker = re.compile(r'> (?!((\d+)\.))')
    line_start_marker = re.compile(r'^> ', re.MULTILINE)

    without_inline_markers = inline_marker.sub('', quote)
    return line_start_marker.sub('', without_inline_markers)

def extract_quote_and_attribution(text):
    """Extract ``(quote, attribution)`` pairs from a block of markdown.

    A pair is recognised as a ``>`` followed (lazily, across newlines) by
    the quote body, terminated by a bold attribution of the form
    ``**--_Name_**``.

    Each quote is passed through ``clean_quote`` and has its newlines
    replaced with spaces; each attribution is passed through
    ``clean_attribution``.

    Returns a list of ``(cleaned_quote, cleaned_attribution)`` tuples in
    the order they appear in *text*; the list is empty when nothing
    matches.
    """
    # DOTALL lets the lazy quote group span multiple lines.
    quote_block = re.compile(r'>(.*?)\*\*(--_.+?_)\*\*', re.DOTALL)

    return [
        (
            clean_quote(raw_quote).replace('\n', ' '),
            clean_attribution(raw_attribution),
        )
        for raw_quote, raw_attribution in quote_block.findall(text)
    ]

# Read the markdown file
with open('quotes.md', 'r', encoding='utf-8') as file:
    markdown = file.read()

# Split the document at every level-2 or level-3 heading. The split consumes
# the "\n## " / "\n### " marker itself, so every element after the first
# begins with the heading's title text on its first line. The first element
# is whatever precedes the first such heading and is skipped below.
sections = re.split(r'\n#{2,3} ', markdown)


# ************************************************
# Regular expression to find the section headings
# This will match any line that starts with '### ' followed by any characters until the end of the line
pattern = re.compile(r'^### (.+)$', re.MULTILINE)

# Find all matches in the file contents
sectionHeadings = pattern.findall(markdown)

# Print the list of section headings
print(sectionHeadings)
# ************************************************


# Build one row per extracted quote.
#
# The section label is taken from the section's own first line rather than
# from sectionHeadings[index]: 'sections' is split on both '##' and '###'
# headings while 'sectionHeadings' only lists '###' headings, so pairing the
# two lists by position mislabels rows (or raises IndexError) as soon as the
# document contains a '##' heading or starts directly with a '###' heading.
data = []

# Iterate through each section, skipping the text before the first heading
for section in sections[1:]:
    # First line of the section is the heading title left behind by the split
    heading = section.partition('\n')[0]

    # Extract (quote, attribution) tuples from the section text
    extracted_quotes = extract_quote_and_attribution(section)

    # Append a dictionary with the section heading, quote, and attribution
    for quote, attribution in extracted_quotes:
        data.append({
            "Section": heading,
            "Quote": quote,
            "Attribution": attribution
        })

# Create a DataFrame from the data list. The columns are named explicitly so
# that an empty result still produces a frame (and CSV header) of the expected shape.
df = pd.DataFrame(data, columns=["Section", "Quote", "Attribution"])

# Now 'df' is a DataFrame with each quote, its attribution, and its section heading
print(df)

# Save the DataFrame to a CSV file (without the index column)
df.to_csv('quotes_and_attributions.csv', index=False)


['Academics', 'Computation', 'Creativity', 'Culture', 'Faith', 'Meaning', 'Beauty', 'Happiness', 'Absurdity', 'Nihilism', 'Good and Evil', 'Suffering', 'Love', 'Anger', 'Passion', 'Planning', 'Psychology', 'Science', 'Teaching', 'Technology', 'Logic', 'Moderation', 'Math', 'Math Jokes', 'Philosophy', 'Philosophical jokes', 'Uncertainty', 'Probability', 'Probability and Statistics jokes']
                              Section  \
0                           Academics   
1                           Academics   
2                           Academics   
3                           Academics   
4                           Academics   
..                                ...   
157                       Probability   
158                       Probability   
159  Probability and Statistics jokes   
160  Probability and Statistics jokes   
161  Probability and Statistics jokes   

                                                 Quote  \
0     "Hold your peace, senor," said Sancho; "faith...   
