## Only Books

In [2]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from google.colab import files # For file upload in Colab

# Make sure NLTK's 'punkt_tab' tokenizer data is installed before any
# word_tokenize call is made later in this cell; download it if it is missing.
try:
    # nltk.data.find raises LookupError when the resource cannot be located
    nltk.data.find('tokenizers/punkt_tab/english/')
except LookupError:
    print("NLTK 'punkt_tab' resource not found. Downloading...")
    # nltk.download reports failure through its boolean return value rather
    # than by raising, so only announce success when the download worked.
    if nltk.download('punkt_tab'):
        print("'punkt_tab' downloaded successfully.")
    else:
        print("Failed to download 'punkt_tab'. NLTK token counts will not be available until it is installed.")

# Prompt for the Readwise export through the Colab upload widget
print("Please upload your Readwise CSV export file:")
uploaded = files.upload()

# Remember the name of the first uploaded entry, or None when nothing came back
if uploaded:
    file_name = next(iter(uploaded))
    print(f"\nSuccessfully uploaded '{file_name}'.")
else:
    print("\nNo file was uploaded. Please run the cell again and select your CSV file.")
    file_name = None

# Load the uploaded CSV into `df`; fall back to an empty DataFrame whenever
# there is no file or the file cannot be parsed, so later steps can test df.empty.
if not file_name:
    df = pd.DataFrame()
else:
    try:
        df = pd.read_csv(file_name)
        print("\nCSV file loaded successfully into a DataFrame.")
    except Exception as load_error:
        # Best-effort: report the problem and continue with an empty frame
        print(f"\nError loading CSV file: {load_error}")
        print("Please ensure the uploaded file is a valid CSV and the file name is correct.")
        df = pd.DataFrame()

# FILTER: keep only rows that belong to a book, i.e. rows whose
# 'Amazon Book ID' is present and not blank.
if df.empty:
    # Nothing was loaded, so a "column not found" warning would be misleading here
    print("\nNo data loaded. Skipping the 'Amazon Book ID' filter.")
elif 'Amazon Book ID' not in df.columns:
    print("\nWarning: 'Amazon Book ID' column not found. No filter applied.")
else:
    book_ids = df['Amazon Book ID']
    # Treat NaN, empty strings and whitespace-only strings as "no book ID"
    has_book_id = book_ids.notna() & (book_ids.astype(str).str.strip() != '')
    # .copy() detaches the result from the original frame so that columns
    # assigned to `df` later do not trigger pandas' SettingWithCopyWarning
    df = df[has_book_id].copy()
    print(f"\nAfter filtering, {len(df)} rows remain with a non-null Amazon Book ID.")

# Inspect the data and validate the configured column names.
# Nothing in this section runs when there is no data.
if not df.empty:
    # Quick look at what was loaded
    print("\n--- DataFrame Inspection ---")
    print("last 5 rows of your data:")
    print(df.tail())
    print("\nDataFrame Info (columns, data types, non-null counts):")
    df.info()

    # Column names used by the analysis below
    highlight_column_name = 'Highlight'  # CHANGE THIS IF YOUR COLUMN NAME IS DIFFERENT
    book_title_column_name = 'Book Title'  # CHANGE THIS IF YOUR COLUMN NAME IS DIFFERENT !!!

    # The highlight column is mandatory: without it the frame is replaced by an
    # empty one, which makes every later `if not df.empty` check skip its work.
    if highlight_column_name in df.columns:
        print(f"\nUsing column '{highlight_column_name}' for highlight analysis.")
    else:
        print(f"\nError: The column '{highlight_column_name}' was not found in your CSV.")
        print(f"Available columns are: {df.columns.tolist()}")
        print("Please update the 'highlight_column_name' variable in the script and re-run.")
        df = pd.DataFrame()

    # The book title column is optional: overall statistics still work without
    # it, only the per-book breakdown is skipped.
    can_do_book_stats = False
    if not df.empty:
        if book_title_column_name in df.columns:
            print(f"Using column '{book_title_column_name}' for per-book analysis.")
            can_do_book_stats = True
        else:
            print(f"\nError: The column '{book_title_column_name}' was not found in your CSV.")
            print(f"Available columns are: {df.columns.tolist()}")
            print(f"Please update the 'book_title_column_name' variable in the script and re-run if you want per-book stats.")


# Compute word/token statistics per highlight, overall, and per book title.
# Runs only if the DataFrame survived loading, filtering and column validation.
if not df.empty:
    # --- Basic text analytics (per highlight) ---

    # Fill missing highlights BEFORE casting to str: astype(str) turns NaN into
    # the literal text 'nan', after which fillna has nothing left to fill and
    # every missing highlight would be counted as one word / one token.
    df[highlight_column_name] = df[highlight_column_name].fillna('').astype(str)

    # Word count (simple method: split on whitespace)
    df['word_count_simple'] = df[highlight_column_name].apply(lambda x: len(x.split()))

    # Token count (NLTK's word_tokenize also separates punctuation)
    df['token_count_nltk'] = df[highlight_column_name].apply(lambda x: len(word_tokenize(x)))

    # --- Per-highlight results (sample) ---
    print("\n--- Per-Highlight Analytics Results (Sample) ---")
    print("First 5 rows with new analytics columns ('word_count_simple', 'token_count_nltk'):")
    print(df[[highlight_column_name, 'word_count_simple', 'token_count_nltk']].head())

    # --- Aggregate statistics (overall) ---
    print("\n--- Aggregate Statistics (Overall) ---")
    total_highlights = len(df)
    total_words_simple = df['word_count_simple'].sum()
    total_tokens_nltk = df['token_count_nltk'].sum()

    print(f"Total number of highlights processed: {total_highlights}")
    print(f"Total estimated words (space-separated): {total_words_simple}")
    print(f"Total tokens (NLTK): {total_tokens_nltk}")

    # df is non-empty in this branch, so there is always at least one row to average
    average_words_per_highlight = df['word_count_simple'].mean()
    average_tokens_per_highlight = df['token_count_nltk'].mean()
    print(f"Average words per highlight (simple): {average_words_per_highlight:.2f}")
    print(f"Average NLTK tokens per highlight: {average_tokens_per_highlight:.2f}")

    # --- Per-book-title statistics ---
    if can_do_book_stats:
        print("\n--- Aggregate Statistics (Per Book Title) ---")
        # Same ordering concern as above: fill first, then cast, so rows without
        # a title are grouped under 'Unknown Title' instead of the text 'nan'.
        df[book_title_column_name] = df[book_title_column_name].fillna('Unknown Title').astype(str)

        # One named aggregation yields the word total, the token total and the
        # number of highlights per title; reset_index turns the title back into
        # a regular column.
        book_stats = df.groupby(book_title_column_name).agg(
            total_words_in_book=('word_count_simple', 'sum'),
            total_tokens_in_book=('token_count_nltk', 'sum'),
            total_highlights_in_book=(highlight_column_name, 'count'),
        ).reset_index()

        print("Word and Token Counts Per Book Title:")
        # To display every row of a long table, wrap the print in
        # pd.option_context('display.max_rows', None)
        print(book_stats.sort_values(by='total_tokens_in_book', ascending=False))
    else:
        print("\nSkipping per-book statistics because the book title column was not found or properly specified.")

elif file_name:
    # A file was uploaded but could not be used (load error, empty after
    # filtering, or missing highlight column)
    print("\nProcessing halted due to issues with the DataFrame or column selection.")

print("\n--- Script Finished ---")


Please upload your Readwise CSV export file:


Saving readwise-data-Nov 5-2024.csv to readwise-data-Nov 5-2024 (1).csv

Successfully uploaded 'readwise-data-Nov 5-2024 (1).csv'.

CSV file loaded successfully into a DataFrame.

After filtering, 24658 rows remain with a non-null Amazon Book ID.

--- DataFrame Inspection ---
last 5 rows of your data:
                                               Highlight  \
28725  Make Your Own Mentors: A PhD from the Universi...   
28726  At Oakland, Al Davis introduced me (and anyone...   
28727  THE WALSH WAY The Fog Cutter Randy Cross, San ...   
28728  Good luck meeting that one every year. Neverth...   
28729  PART V Thin Skin, Baloney, and “The Star-Spang...   

                                              Book Title  \
28725  The Score Takes Care of Itself: My Philosophy ...   
28726  The Score Takes Care of Itself: My Philosophy ...   
28727  The Score Takes Care of Itself: My Philosophy ...   
28728  The Score Takes Care of Itself: My Philosophy ...   
28729  The Score Takes Care of Itsel

### Save Output

In [None]:
# Write the per-book statistics table to 'book_stats.csv' without the row index
book_stats.to_csv(path_or_buf='book_stats.csv', index=False)

## Summary of Token and Word Counts

In [3]:
# Grand totals over every book in book_stats: NLTK tokens first, then simple words
total_tokens_sum, total_words_sum = (
    book_stats[column].sum()
    for column in ('total_tokens_in_book', 'total_words_in_book')
)

# Render both totals with thousands separators
formatted_total_tokens = format(total_tokens_sum, ',')
formatted_total_words = format(total_words_sum, ',')

# Report the totals
print("\n--- Totals for Books ---")
print(f"Total tokens across all books: {formatted_total_tokens}")
print(f"Total words across all books (simple count): {formatted_total_words}")



--- Totals for Books ---
Total tokens across all books: 1,984,734
Total words across all books (simple count): 1,700,234
