In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to /Users/araj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/araj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import json
import pandas as pd
import os

# Path to the METADATA folder
metadata_path = '../cap_data/METADATA'

# Initialize an empty list to store metadata
metadata_list = []

# Iterate through each JSON file in the METADATA folder
for file in os.listdir(metadata_path):
    if file.endswith('.json'):
        with open(os.path.join(metadata_path, file), 'r') as f:
            data = json.load(f)
            
            # Flatten the jurisdictions field if it's a simple list
            if isinstance(data.get('jurisdictions'), list):
                data['jurisdictions_flat'] = ', '.join(map(str, data['jurisdictions']))
            
            metadata_list.append(data)

# Check the first metadata record
print(metadata_list[0])

# Convert the list of metadata to a DataFrame
df_metadata = pd.json_normalize(metadata_list)

# Check the resulting DataFrame
print(df_metadata.head())


AttributeError: 'list' object has no attribute 'get'

In [None]:
from bs4 import BeautifulSoup

# Path to the HTML folder
html_path = 'path_to_downloaded_data/HTML/'

# Initialize lists to store case names and their corresponding text
case_names = []
case_texts = []

# Iterate through each HTML file in the HTML folder
for file in os.listdir(html_path):
    if file.endswith('.html'):
        with open(os.path.join(html_path, file), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
            # Extract case name from metadata (assuming filename matches)
            case_name = file.replace('.html', '')
            case_names.append(case_name)
            # Extract main text of the opinion
            # This may vary based on HTML structure; adjust selectors as needed
            opinion = soup.find('div', class_='opinion-text')  # Example selector
            if opinion:
                case_texts.append(opinion.get_text(separator=' ', strip=True))
            else:
                case_texts.append('')

# Create a DataFrame with case names and texts
df_text = pd.DataFrame({
    'case_name': case_names,
    'opinion_text': case_texts
})

print(df_text.head())


In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Define stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove citations and special characters
    text = re.sub(r'\[\d+\]', '', text)  # Remove [1], [2], etc.
    text = re.sub(r'\(\d+\)', '', text)  # Remove (1), (2), etc.
    text = re.sub(r'\*\d+', '', text)    # Remove *123
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Join back to string
    return ' '.join(words)

# Apply preprocessing
df_text['clean_opinion'] = df_text['opinion_text'].apply(preprocess_text)
print(df_text[['case_name', 'clean_opinion']].head())


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

def sentiment_score(text):
    return sid.polarity_scores(text)['compound']

df_text['sentiment_score'] = df_text['clean_opinion'].apply(sentiment_score)
print(df_text[['case_name', 'sentiment_score']].head())


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize the text
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df_text['clean_opinion'])

# Define number of topics
num_topics = 5

# Initialize LDA
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(dtm)

# Display topics
for index, topic in enumerate(lda.components_):
    print(f'Topic #{index + 1}:')
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print('\n')
