# Reviewing db stuff

In [1]:
import os
import sys
import sqlite3
from sqlite3 import Error

In [2]:
db_name = 'pdf_texts.db'

In [3]:
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
cursor.execute('SELECT * FROM pdf_texts')
rows = cursor.fetchall()
conn.close()

In [4]:
# get columns in database
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
cursor.execute('PRAGMA table_info(pdf_texts)')
columns = cursor.fetchall()
conn.close()

In [7]:
print(f"Database: {db_name}")
print(f"Number of rows: {len(rows)}")
print("Columns:")
for column in columns:
    print(f"  {column[1]} (Type: {column[2]})")
# sample the url column
if len(rows) > 0:
    print("\PDF URLs:")
    for row in rows[:-5]:  # Display first 5 rows
        print(f"  {row[2]}")  # Assuming the URL is in the second column (index 1)
if len(rows) == 0:
    print("No rows found in the database.")


Database: pdf_texts.db
Number of rows: 1091
Columns:
  id (Type: INTEGER)
  text (Type: TEXT)
  url (Type: TEXT)
  timestamp (Type: TEXT)
  html_metadata (Type: TEXT)
\PDF URLs:
  https://supreme.justia.com/cases/federal/us/502/244/case.pdf
  https://supreme.justia.com/cases/federal/us/502/279/case.pdf
  https://supreme.justia.com/cases/federal/us/502/410/case.pdf
  https://supreme.justia.com/cases/federal/us/502/21/case.pdf
  https://supreme.justia.com/cases/federal/us/502/346/case.pdf
  https://supreme.justia.com/cases/federal/us/502/215/case.pdf
  https://supreme.justia.com/cases/federal/us/502/93/case.pdf
  https://supreme.justia.com/cases/federal/us/502/105/case.pdf
  https://supreme.justia.com/cases/federal/us/502/81/case.pdf
  https://supreme.justia.com/cases/federal/us/502/301/case.pdf
  https://supreme.justia.com/cases/federal/us/502/164/case.pdf
  https://supreme.justia.com/cases/federal/us/502/251/case.pdf
  https://supreme.justia.com/cases/federal/us/502/62/case.pdf
  https

# REVIEW

In [65]:
## I want random snippets somewhere in the middle of each row of text
print("\nRandom snippets from the text column:")
for row in rows[:5]:  # Display first 5 rows
    text = row[4]
    if len(text) > 100:  # Ensure there's enough text to sample
        start = len(text) // 4  # Start sampling from the middle
        end = start + 100  # Get a snippet of 100 characters
        snippet = text[start:end]
        print(f"  Snippet: {snippet}...")  # Display the snippet
    else:
        print("  Text too short for snippet.")
        


Random snippets from the text column:
  Snippet: h-below-tablet item">
<strong>Argued:</strong>
<span>December 2, 1991</span>
</div>
<div class="flex...
  Snippet: h-below-tablet item">
<strong>Argued:</strong>
<span>October 7, 1991</span>
</div>
<div class="flex-...
  Snippet: h-below-tablet item">
<strong>Argued:</strong>
<span>October 15, 1991</span>
</div>
<div class="flex...
  Snippet: h-below-tablet item">
<strong>Argued:</strong>
<span>October 15, 1991</span>
</div>
<div class="flex...
  Snippet: h-below-tablet item">
<strong>Argued:</strong>
<span>November 5, 1991</span>
</div>
<div class="flex...


## Read as a pandas dataframe

In [108]:
import pandas as pd
conn = sqlite3.connect(db_name)
df = pd.read_sql_query("SELECT * FROM pdf_texts", conn)
conn.close()
print("\nDataFrame loaded successfully.")
print(df.head())



DataFrame loaded successfully.
   id                                               text  \
0   1  502us2$20Z 01-22-99 14:22:41 PAGES OPINPGT\n24...   
1   2  502us2$22M 08-19-96 17:40:23 PAGES OPINPGT\n27...   
2   3  502us2$27Z 01-22-99 08:37:00 PAGES OPINPGT\n41...   
3   4  502us1$$4Z 08-21-96 15:22:03 PAGES OPINPGT\n21...   
4   5  502us2$25Z 01-22-99 08:28:07 PAGES OPINPGT\n34...   

                                                 url  \
0  https://supreme.justia.com/cases/federal/us/50...   
1  https://supreme.justia.com/cases/federal/us/50...   
2  https://supreme.justia.com/cases/federal/us/50...   
3  https://supreme.justia.com/cases/federal/us/50...   
4  https://supreme.justia.com/cases/federal/us/50...   

                    timestamp  \
0  2025-06-25T23:40:05.507904   
1  2025-06-25T23:40:10.965825   
2  2025-06-25T23:40:16.437248   
3  2025-06-25T23:40:21.753877   
4  2025-06-25T23:40:27.145292   

                                       html_metadata  
0  <div class="f

In [109]:
for txt in df['text'][100].split('\n'):
    print(f"{txt}")  # Display the text from the first row

505us2109K 07-09-96 20:05:24 PAGES OPINPGT
557 OCTOBER TERM, 1991
Syllabus
CITY OF BURLINGTON v.DAGUE et al.
certiorari to the united states court of appeals for
the second circuit
No. 91±810. Argued April 21, 1992ÐDecided June 24, 1992
After ruling on the merits for respondents, the District Court determined
that they were ªsubstantially prevailingº parties entitled to ªreason-ableº attorney's fees under the attorney's fee provisions of the SolidWaste Disposal Act and the Clean Water Act. The District Court cal-culated the fee award by, inter alia, enhancing the ªlodestarº amount
by 25% on the grounds that respondents' attorneys were retained on acontingent-fee basis and that without such enhancement respondentswould have faced substantial dif®culties in obtaining suitable counsel.The Court of Appeals af®rmed the fee award.
Held: The fee-shifting statutes at issue do not permit enhancement of a
fee award beyond the lodestar amount to re¯ect the fact that a party'sattorneys were retain

In [118]:
search_word = 'dissenting'
matches = df[df['text'].str.lower().str.contains(search_word.lower(), na=False)]

if not matches.empty:
    print(f"\nFound {len(matches)} matches for '{search_word}':")
    for index, row in matches.iterrows():
        text_lower = row['text'].lower()
        idx = text_lower.find(search_word.lower())
        if idx != -1:
            start = max(idx - 50, 0)
            end = min(idx + len(search_word) + 50, len(row['text']))
            snippet = row['text'][start:end].replace('\n', ' ')
            print(f"  Row {index}: ...{snippet}... [link: {row['url']}]")
        else:
            print(f"  Row {index}: {row['text'][:100]}... [link: {row['url']}]")
else:
    print(f"\nNo matches found for '{search_word}' in the text column.")



Found 156 matches for 'dissenting':
  Row 1: ...nor, and Kennedy, JJ., joined. Scalia, J., ®led a dissenting opinion, post, p. 296. Thomas, J., took no part i... [link: https://supreme.justia.com/cases/federal/us/502/279/case.pdf]
  Row 2: ...nor, and Kennedy, JJ., joined. Scalia, J., ®led a dissenting opinion, in which Souter, J., joined, post, p. 42... [link: https://supreme.justia.com/cases/federal/us/502/410/case.pdf]
  Row 4: ... Maryland v.Craig, 497 U. S. 836, 864±865 (1990) (dissenting opinion). The dif®culty with the Wigmore-Harlan v... [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
  Row 11: ...ordered. Justice Blackmun, concurring in part and dissenting in part. I have wandered the maze of Indian statu... [link: https://supreme.justia.com/cases/federal/us/502/251/case.pdf]
  Row 12: ...nor, J., ®led an opin- ion concurring in part and dissenting in part, in which Stevens, J., joined, post, p. 7... [link: https://supreme.justia.com/cases/federal/us/502/

In [121]:
import re

# Pattern: e.g., "Scalia, J., dissenting" at the end of a line or followed by a line break
# Only match if '., dissenting' is at the end of a line or followed by a line break
# Handles both Unix (\n) and Windows (\r\n) line endings
dissent_pattern = re.compile(r"([A-Z][a-zA-Z’\.\-]+(?:,? (?:J\.|JJ\.|C\.J\.))?(?:,? and [A-Z][a-zA-Z’\.\-]+,? (?:J\.|JJ\.|C\.J\.))?), dissenting(?=\s*$|\r?\n)", re.IGNORECASE|re.MULTILINE)

print("Concise dissenting section headers found in the text column:")
for idx, row in df.iterrows():
    matches = dissent_pattern.findall(row['text'])
    for match in matches:
        print(f"Row {idx}: {match.strip()}, dissenting [link: {row['url']}]" )

Concise dissenting section headers found in the text column:
Row 1: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/279/case.pdf]
Row 1: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/279/case.pdf]
Row 1: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/279/case.pdf]
Row 1: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/279/case.pdf]
Row 1: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/279/case.pdf]
Row 2: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/410/case.pdf]
Row 2: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/410/case.pdf]
Row 2: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/410/case.pdf]
Row 2: Scalia, J., dissenting [link: https://supreme.justia.com/cases/federal/us/502/410/case.pdf]
Row 2: Scalia, J., dissenting [link: https://sup

In [122]:
# Extract and print 'Opinion of JUSTICE NAME' section headers
opinion_pattern = re.compile(r"Opinion of ([A-Z][a-zA-Z’\.\-]+,? (?:J\.|JJ\.|C\.J\.))", re.IGNORECASE)

print("Opinion section headers found in the text column:")
for idx, row in df.iterrows():
    matches = opinion_pattern.findall(row['text'])
    for match in matches:
        print(f"Row {idx}: Opinion of {match.strip()} [link: {row['url']}]" )

Opinion section headers found in the text column:
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 4: Opinion of Thomas, J. [link: https://supreme.justia.com/cases/federal/us/502/346/case.pdf]
Row 10: Opinion of Scalia, J. [link: https://supreme.justia.com/case