In [2]:
import pandas as pd

df = pd.read_csv("data_dif_div/aveni-tables-20250520.csv")

In [7]:
# Filter out contexts in top 5%

# Calculate the 95th percentile threshold
threshold = df['text'].str.len().quantile(0.95)

# Filter directly
df = df[df['text'].str.len() <= threshold]

In [9]:
import re

def count_markdown_tables(text):
    """Count the number of markdown tables in a text string"""
    # Pattern to match table separator lines like |---|---|---|
    # This looks for lines with at least one pipe and dashes
    table_pattern = r'^\s*\|[\s\-:|]+\|\s*$'
    
    # Split text into lines and find matches
    lines = text.split('\n')
    table_separators = [line for line in lines if re.match(table_pattern, line)]
    
    # Each table has one separator line (usually after the header)
    return len(table_separators)

# Apply to your dataframe
df['table_count'] = df['text'].apply(count_markdown_tables)

# View the results
print(df[['text', 'table_count']].head())

                                                text  table_count
0  AJ Bell Dodl order execution policy\nWe are re...            0
1  If you’re not sure about transferring your pen...            1
2  Key features of the Dodl pension\nContents\n- ...            0
3  Cookie policy\nLike most websites, Dodl uses c...            0
4  The charges\nCompare your Dodl account to othe...            1


In [10]:
# Summary statistics
print(f"Total entries: {len(df)}")
print(f"Entries with tables: {(df['table_count'] > 0).sum()}")
print(f"Average tables per entry: {df['table_count'].mean():.2f}")
print(f"Max tables in an entry: {df['table_count'].max()}")

# Distribution of table counts
print("\nTable count distribution:")
print(df['table_count'].value_counts().sort_index())

Total entries: 93869
Entries with tables: 73016
Average tables per entry: 2.28
Max tables in an entry: 273

Table count distribution:
table_count
0      20853
1      38400
2      12751
3       5255
4       6873
       ...  
214        1
226        1
248        1
264        1
273        1
Name: count, Length: 126, dtype: int64


In [21]:
# Filter to keep only rows with 1-6 tables
df_filtered = df[(df['table_count'] > 0) & (df['table_count'] <= 6)]

# Check the results
print(f"Original DataFrame: {len(df)} rows")
print(f"Filtered DataFrame: {len(df_filtered)} rows")
print(f"Rows removed: {len(df) - len(df_filtered)}")

# See the distribution of table counts in the filtered data
print("\nTable count distribution in filtered data:")
print(df_filtered['table_count'].value_counts().sort_index())

Original DataFrame: 93869 rows
Filtered DataFrame: 66700 rows
Rows removed: 27169

Table count distribution in filtered data:
table_count
1    38400
2    12751
3     5255
4     6873
5     2192
6     1229
Name: count, dtype: int64


In [35]:
# Split the dataset
df_single_table = df_filtered[df_filtered['table_count'] == 1]
df_multiple_tables = df_filtered[df_filtered['table_count'] > 1]

# Check the split
print(f"Single table entries: {len(df_single_table)}")
print(f"Multiple table entries: {len(df_multiple_tables)}")
print(f"Total: {len(df_single_table) + len(df_multiple_tables)}")

# Save to CSV files
df_single_table.to_csv('data_dif_div/single_table_dataset.csv', index=False)
df_multiple_tables.to_csv('data_dif_div/multiple_tables_dataset.csv', index=False)

print("\nFiles saved:")
print("- single_table_dataset.csv")
print("- multiple_tables_dataset.csv")

Single table entries: 38400
Multiple table entries: 28300
Total: 66700

Files saved:
- single_table_dataset.csv
- multiple_tables_dataset.csv


In [33]:
import random

# Get a random index
random_idx = random.choice(df_filtered.index)

# Get the text and table count for that index
random_text = df_filtered.loc[random_idx, 'text']
table_count = df_filtered.loc[random_idx, 'table_count']

# Print the results
print(f"Random entry (index {random_idx}):")
print(f"Number of tables: {table_count}")
print("-" * 50)
print(random_text)

Random entry (index 65262):
Number of tables: 1
--------------------------------------------------
Robinhood Markets (HOOD -1.90%), the online brokerage that popularized commission-free trading, went public in July 2021 at $38 a share. Its stock hit an all-time high of $70.29 less than a week later, but it dropped below $7 by the following June.
Robinhood's stock plummeted as rising interest rates curbed the market's appetite for the higher-risk stocks, options, and cryptocurrencies that have driven most of its growth during the pandemic. However, its stock recovered over the following two years as interest rates peaked and investors poured more cash back into its platform.
Robinhood's stock trades at about $36 as of this writing, which marks a five-bagger gain from its all-time low but still falls shy of its IPO price. Let's take a fresh look at its business and see where its stock might head over the next three years.
What happened to Robinhood over the past few years?
Robinhood's gr