In [2]:
import pandas as pd
import re

def parse_quotes_md(file_path):
    # Read the markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Find section headers
    sections = re.findall(r'###\s+(.+)', content)
    
    # Create a pattern to match blockquotes with their attributions
    quote_pattern = re.compile(r'(> "[\s\S]+?")\s*\n>\s*\n> \*\*--_(.+?)_\*\*')
    
    # Find all quotes and their indices
    quotes_attributions_indices = [(match.group(1), match.group(2), match.start())
                                   for match in quote_pattern.finditer(content)]
    
    # Data to create DataFrame
    data = []
    
    # Track the current section index
    current_section = ''
    current_section_index = 0

    # Iterate through each quote and attribution pair with its index
    for quote, attribution, index in quotes_attributions_indices:
        # Find current section based on index
        while current_section_index + 1 < len(sections) and \
              content.find('### ' + sections[current_section_index + 1]) < index:
            current_section_index += 1
        current_section = sections[current_section_index]
        
        # Clean the quote string
        quote = quote.strip('> " ').replace('\n', ' ')
        
        # Append the quote, its attribution, and the current section to the data list
        data.append({'Section': current_section, 'Quote': quote, 'Attribution': attribution})
    
    # Create a DataFrame
    df = pd.DataFrame(data)
    
    # Save to CSV
    df.to_csv('quotes.csv', index=False)
    
    return df

# Parse the markdown and get the DataFrame
file_path = 'quotes.md'
df = parse_quotes_md(file_path)

# Display the DataFrame
print(df)


                             Section  \
0                          Academics   
1                              Faith   
2                          Suffering   
3                            Passion   
4                         Psychology   
5                            Science   
6                            Science   
7                            Science   
8                            Science   
9                            Science   
10                           Science   
11                           Science   
12                           Science   
13                           Science   
14                           Science   
15                           Science   
16                           Science   
17                          Teaching   
18                          Teaching   
19                          Teaching   
20                          Teaching   
21                          Teaching   
22                          Teaching   
23                        Technology   


In [3]:
import pandas as pd
import re

def parse_quotes_md(file_path):
    # Read the markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Find section headers
    sections = re.findall(r'###\s+(.+)', content)
    
    # Create a pattern to match blockquotes with their attributions
    quote_pattern = re.compile(r'(> "[\s\S]+?(?="))\s*>[\s\S]+?\*\*--_(.+?)_\*\*', re.MULTILINE)
    
    # Find all quotes and their indices
    quotes_attributions_indices = [(match.group(1), match.group(2), match.start())
                                   for match in quote_pattern.finditer(content)]
    
    # Data to create DataFrame
    data = []
    
    # Track the current section index
    current_section = ''
    current_section_index = 0

    # Iterate through each quote and attribution pair with its index
    for quote, attribution, index in quotes_attributions_indices:
        # Find current section based on index
        while current_section_index + 1 < len(sections) and \
              content.find('### ' + sections[current_section_index + 1]) < index:
            current_section_index += 1
        current_section = sections[current_section_index]
        
        # Clean the quote string
        quote = quote.strip('> " ').replace('\n', ' ').replace('\n> ', ' ')
        
        # Append the quote, its attribution, and the current section to the data list
        data.append({'Section': current_section, 'Quote': quote, 'Attribution': attribution})
    
    # Create a DataFrame
    df = pd.DataFrame(data)
    
    # Save to CSV
    df.to_csv('quotes.csv', index=False)
    
    return df

# Parse the markdown and get the DataFrame
file_path = 'quotes.md'
df = parse_quotes_md(file_path)

# Display the DataFrame
print(df)


Empty DataFrame
Columns: []
Index: []


In [4]:
import pandas as pd
import re

def parse_quotes_md(file_path):
    # Read the markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Remove the Table of Contents for easier processing
    content = re.sub(r'# Table of Contents[\s\S]*?Back to Brandon\'s page\)', '', content)

    # Find section headers
    sections = re.findall(r'###\s+(.+?)\n', content)
    
    # Create a pattern to match blockquotes with their attributions
    # The regex below does the following:
    # - Match a greater-than sign followed by optional whitespace and a quote mark
    # - Match the quote content lazily, allowing for newlines
    # - Match the attribution pattern which starts and ends with asterisks
    quote_pattern = re.compile(
        r'>\s*"([\s\S]+?)\s*"\s*\n>\s*\n>\s*\*\*--_([\s\S]+?)_\*\*', 
        re.MULTILINE)
    
    # Initialize list to hold each quote and its details
    quotes = []
    
    # Initialize variables to keep track of the current section and its starting index in content
    current_section = ''
    section_start_index = 0
    
    # Iterate over each section to find the quotes within it
    for section in sections:
        # Find the index where the current section header is located in the content
        section_start_index = content.index(section, section_start_index) + len(section)
        
        # Determine the end index of the current section (start of next section or EOF)
        next_section_start_index = content.find('###', section_start_index)
        if next_section_start_index == -1:
            next_section_start_index = len(content)
        
        # Extract the content of the current section
        section_content = content[section_start_index:next_section_start_index]
        
        # Find all quotes within the current section
        for quote_match in quote_pattern.finditer(section_content):
            # Clean and normalize the quote and attribution
            quote = quote_match.group(1).strip().replace('\n>', '').replace('\n', ' ')
            attribution = quote_match.group(2).strip()
            
            # Append to the list of quotes
            quotes.append({'Section': section, 'Quote': quote, 'Attribution': attribution})

    # Create a DataFrame from the list of quotes
    df = pd.DataFrame(quotes)
    
    # Save to CSV
    df.to_csv('quotes.csv', index=False)
    
    return df

# Define the path to the markdown file
file_path = 'quotes.md'

# Parse the markdown file and get the DataFrame
df = parse_quotes_md(file_path)

# Display the DataFrame
print(df)


                             Section  \
0                           Nihilism   
1                          Suffering   
2                            Passion   
3                         Psychology   
4                            Science   
5                            Science   
6                            Science   
7                            Science   
8                            Science   
9                            Science   
10                           Science   
11                           Science   
12                           Science   
13                           Science   
14                           Science   
15                           Science   
16                           Science   
17                          Teaching   
18                          Teaching   
19                          Teaching   
20                          Teaching   
21                          Teaching   
22                          Teaching   
23                        Technology   


In [5]:
import pandas as pd
import re

def parse_quotes_md(file_path):
    # Read the markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Remove the Table of Contents for easier processing
    content = re.sub(r'# Table of Contents[\s\S]*?Back to Brandon\'s page\)', '', content)

    # Find section headers
    sections = re.findall(r'###\s+(.+?)\n', content)
    
    # Create a pattern to match blockquotes with their attributions
    # The regex below matches lines starting with > and ending with _**
    quote_pattern = re.compile(
        r'^>\s*(.*?)\s*—\s*[_*]{2}(.*?)_*\*{2}$', 
        re.MULTILINE)
    
    # Initialize list to hold each quote and its details
    quotes = []
    
    # Initialize variables to keep track of the current section and its starting index in content
    current_section = ''
    section_start_index = 0
    
    # Iterate over each section to find the quotes within it
    for section in sections:
        # Find the index where the current section header is located in the content
        section_start_index = content.index(section, section_start_index) + len(section)
        
        # Determine the end index of the current section (start of next section or EOF)
        next_section_start_index = content.find('###', section_start_index)
        if next_section_start_index == -1:
            next_section_start_index = len(content)
        
        # Extract the content of the current section
        section_content = content[section_start_index:next_section_start_index]
        
        # Find all quotes within the current section
        for quote_match in quote_pattern.finditer(section_content):
            # Clean and normalize the quote and attribution
            quote = quote_match.group(1).strip().replace('\n>', '').replace('\n', ' ')
            attribution = quote_match.group(2).strip()
            
            # Append to the list of quotes
            quotes.append({'Section': section, 'Quote': quote, 'Attribution': attribution})

    # Create a DataFrame from the list of quotes
    df = pd.DataFrame(quotes)
    
    # Save to CSV
    df.to_csv('quotes.csv', index=False)
    
    return df

# Define the path to the markdown file
file_path = 'quotes.md'

# Parse the markdown file and get the DataFrame
df = parse_quotes_md(file_path)

# Display the DataFrame
print(df)


Empty DataFrame
Columns: []
Index: []


In [6]:
import pandas as pd
import re

def parse_quotes_md(file_path):
    # Read the markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Define the regex pattern to match sections
    section_pattern = re.compile(r'### (.+)')
    
    # Define the regex pattern to match quotes. This pattern captures text starting with ">"
    # possibly spanning multiple lines, followed by an attribution which is in bold.
    quote_pattern = re.compile(
        r'^>\s*"?(.*?)"?\s*\n?>?\s*(\*\*--_[\s\S]+?_\*\*)', 
        re.MULTILINE)
    
    quotes = []  # Initialize list to hold quotes
    
    # Split the document into sections by titles
    parts = section_pattern.split(content)
    
    # Using enumerate to skip the first split which is the header part, and then pair the title with the content
    for section_title, section_content in zip(parts[1::2], parts[2::2]):
        # Find all quotes in the section
        for quote in quote_pattern.findall(section_content):
            # Clean up the quote and the attribution
            text, attribution = quote
            text = text.replace('\n>', '').replace('\n', ' ').strip()
            attribution = attribution.strip().replace('\n', ' ')
            # Add to the list of quotes
            quotes.append({'Section': section_title.strip(), 'Quote': text, 'Attribution': attribution})
    
    # Create DataFrame from the quotes
    df = pd.DataFrame(quotes)
    
    # Save to CSV
    df.to_csv('quotes.csv', index=False)
    
    return df

# Define the path to the markdown file
file_path = 'quotes.md'

# Parse the markdown file and get the DataFrame
df = parse_quotes_md(file_path)

# Display the DataFrame
print(df)


                              Section Quote  \
0                           Academics         
1                           Academics         
2                           Academics         
3                           Academics         
4                           Academics         
..                                ...   ...   
149                       Uncertainty         
150                       Probability         
151                       Probability         
152  Probability and Statistics jokes         
153  Probability and Statistics jokes         

                                           Attribution  
0    **--_Miguel de Cervantes, The Ingenious Gentle...  
1           **--_Herbert Spencer, Apr 27 1820-1903._**  
2    **--_Noam Chomsky, A Review of B. F. Skinner's...  
3                        **--_Mark Twain, 1835-1910_**  
4                     **--_H. L. Mencken, 1880-1956_**  
..                                                 ...  
149              **--_Richard P. Fey

In [7]:
import pandas as pd
import re

def parse_quotes_md(file_path):
    # Read the markdown file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Define the regex pattern to match sections
    section_pattern = re.compile(r'### (.+)')
    
    # Define the regex pattern to match quotes. This pattern captures text that may span multiple lines, 
    # and is followed by an attribution in bold.
    quote_pattern = re.compile(
        r'>(?:\s*>\s*)*"?(.*?)"?\s*\n(?=\s*> \*\*--_)', 
        re.MULTILINE | re.DOTALL)
    
    # Define the regex pattern to match attributions
    attribution_pattern = re.compile(
        r'> \*\*--_(.+?)_\*\*',
        re.MULTILINE)
    
    quotes = []  # Initialize list to hold quotes
    
    # Split the document into sections by titles
    parts = section_pattern.split(content)
    
    # Iterate over the sections
    for section_title, section_content in zip(parts[1::2], parts[2::2]):
        # Find all quotes in the section
        for quote_text in quote_pattern.findall(section_content):
            # Clean up the quote text
            quote_text = quote_text.strip().replace('\n>', '').replace('\n', ' ').strip()
            # Find the corresponding attribution
            attribution_match = attribution_pattern.search(section_content)
            attribution = attribution_match.group(1).strip().replace('\n', ' ') if attribution_match else 'Unknown'
            # Add to the list of quotes
            quotes.append({'Section': section_title.strip(), 'Quote': quote_text, 'Attribution': attribution})
            # Remove the matched text to avoid capturing the same attribution multiple times
            section_content = section_content.replace(attribution_match.group(0), '', 1)
    
    # Create DataFrame from the quotes
    df = pd.DataFrame(quotes)
    
    # Save to CSV
    df.to_csv('quotes.csv', index=False)
    
    return df

# Define the path to the markdown file
file_path = 'quotes.md'

# Parse the markdown file and get the DataFrame
df = parse_quotes_md(file_path)

# Display the DataFrame
print(df)

                              Section  \
0                           Academics   
1                           Academics   
2                           Academics   
3                           Academics   
4                           Academics   
..                                ...   
143                       Uncertainty   
144                       Probability   
145                       Probability   
146  Probability and Statistics jokes   
147  Probability and Statistics jokes   

                                                 Quote  \
0    "Hold your peace, senor," said Sancho; "faith,...   
1    **--_Miguel de Cervantes, The Ingenious Gentle...   
2    **--_Herbert Spencer, Apr 27 1820-1903._**  Ev...   
3    **--_Noam Chomsky, A Review of B. F. Skinner's...   
4    **--_Mark Twain, 1835-1910_**  For every probl...   
..                                                 ...   
143  **--_Richard P. Feynman, 1918-1988_**  “I can ...   
144  ...no test based upon a theory of prob