# Legal Document Classification with BERT - V2 (Full Dataset)

## Part 2: Data Exploration

Explore and visualize the full 45K dataset to better understand its characteristics.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK tokenizer resources.
# NLTK >= 3.8.2 loads the Punkt model from the 'punkt_tab' package rather than
# the pickled 'punkt' one, so word_tokenize() raises LookupError when only
# 'punkt' is installed. Fetch both so the notebook works on old and new NLTK.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the dataset (use the path from Part 1)
dataset_path = '/content/drive/MyDrive/legal_bert_classification_v2/full_bert_dataset.csv'
df = pd.read_csv(dataset_path)

# Quick sanity check: row/column counts and number of distinct classes.
print(f"Dataset shape: {df.shape}")
print(f"Number of unique labels: {df['label'].nunique()}")

In [None]:
# Absolute number of documents per label (most frequent first)
print("Label distribution:")
label_counts = df['label'].value_counts()
print(label_counts)

# Same distribution expressed as a share of the whole dataset, in percent
label_percentages = df['label'].value_counts(normalize=True).mul(100)
print("\nLabel percentages:")
for doc_type, share in zip(label_percentages.index, label_percentages.values):
    print(f"  {doc_type}: {share:.2f}%")

In [None]:
# Bar chart of the number of documents per label
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)

# Annotate every bar with its thousands-separated count, slightly above the top
for bar_pos, n_docs in enumerate(label_counts.values):
    ax.text(bar_pos, n_docs + 100, f"{n_docs:,}", ha='center')

ax.set_title('Document Type Distribution', fontsize=14)
ax.set_xlabel('Document Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Slant the category names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Text length analysis
# read_csv loads empty text cells as NaN (a float), on which len() and
# word_tokenize() raise TypeError. Measure such rows as empty strings instead;
# df['text'] itself is left untouched.
texts = df['text'].fillna('').astype(str)

# Characters per document.
df['text_length'] = texts.str.len()
# Words per document. Only the first 10,000 characters are tokenized, so the
# count is a lower bound for longer documents.
df['word_count'] = texts.apply(lambda x: len(word_tokenize(x[:10000])))

print("\nText length statistics (characters):")
print(df['text_length'].describe())

print("\nWord count statistics:")
print(df['word_count'].describe())

# One groupby pass provides the per-label means for both metrics.
label_means = df.groupby('label')[['text_length', 'word_count']].mean()

print("\nText length by label (mean characters):")
print(label_means['text_length'].sort_values(ascending=False))

print("\nWord count by label (mean words):")
print(label_means['word_count'].sort_values(ascending=False))

In [None]:
# Box plot: spread of character lengths within each document type
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='label', y='text_length', ax=ax)

ax.set_title('Text Length Distribution by Document Type', fontsize=14)
ax.set_xlabel('Document Type', fontsize=12)
ax.set_ylabel('Text Length (characters)', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)

# Cap the y-axis at the 95th percentile so extreme outliers do not flatten the boxes
y_cap = df['text_length'].quantile(0.95)
ax.set_ylim(0, y_cap)

fig.tight_layout()
plt.show()

In [None]:
# Histogram of text lengths, one step outline per document type.
# log_scale=(False, True) keeps the x-axis linear and puts the counts on a log axis.
plt.figure(figsize=(12, 6))
ax = sns.histplot(data=df, x='text_length', hue='label', bins=50, element='step', log_scale=(False, True))
plt.title('Text Length Distribution by Document Type (Log Scale)', fontsize=14)
plt.xlabel('Text Length (characters)', fontsize=12)
plt.ylabel('Count (log scale)', fontsize=12)
plt.xlim(0, df['text_length'].quantile(0.99))  # Limit x-axis to 99th percentile

# seaborn builds the hue legend from its own handles; the plotted artists have
# no labels, so plt.legend(title=...) would replace it with an empty legend.
# Retitle the legend seaborn already created instead.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Document Type')

plt.tight_layout()
plt.show()

In [None]:
# Estimate how many documents would overflow BERT's input window.
# BERT typically has a 512 token limit; the token count is approximated
# from the word count as words / 0.75 (a rough rule of thumb).
df['estimated_tokens'] = df['word_count'] / 0.75
over_limit_mask = df['estimated_tokens'] > 512
exceeding_limit = df[over_limit_mask]

n_over = len(exceeding_limit)
print(f"Number of documents potentially exceeding BERT's 512 token limit: {n_over} ({n_over/len(df)*100:.2f}%)")

print("\nDistribution by label:")
print(exceeding_limit['label'].value_counts())

# Per-label share of over-length documents, in order of first appearance
print("\nPercentage by label:")
for doc_type in df['label'].unique():
    in_class = df['label'] == doc_type
    total = int(in_class.sum())
    exceeding = int((in_class & over_limit_mask).sum())
    print(f"  {doc_type}: {exceeding/total*100:.2f}% ({exceeding}/{total})")

In [None]:
# Show the first document of every class as a quick qualitative check
print("\nSample document from each class:")
for doc_type in df['label'].unique():
    first_row = df.loc[df['label'] == doc_type].iloc[0]
    body = first_row['text']
    # Tokenize only the first 10,000 characters, hence the "~" in the output
    approx_words = len(word_tokenize(body[:10000]))

    print(f"\n--- {doc_type} Example ---")
    print(f"Text length: {len(body)} characters")
    print(f"Word count: ~{approx_words} words")
    print(f"Text (first 300 chars): {body[:300]}...")

In [None]:
# Check for potential data quality issues
# 1. Very short texts (fewer than 100 characters)
short_texts = df[df['text_length'] < 100]
print(f"Number of very short texts (<100 chars): {len(short_texts)}")
if not short_texts.empty:
    print("Sample of short texts:")
    print(short_texts[['text', 'label']].head(3))

# 2. Check text format consistency
print("\nChecking for text format consistency...")
# Missing texts (NaN) are floats, for which `'\n' in x` raises TypeError.
# Count them as "no line break" so this check also runs on data with gaps.
has_linebreaks = df['text'].apply(lambda x: isinstance(x, str) and '\n' in x).mean() * 100
print(f"Percentage of texts with line breaks: {has_linebreaks:.2f}%")

# 3. Check character distribution
def get_char_types(text_sample):
    """Count alphabetic, digit and whitespace characters in one text.

    Args:
        text_sample: The text to inspect, or a missing value (NaN/None).

    Returns:
        A tuple ``(alphabetic, digits, whitespace)`` of character counts.
        A missing value yields ``(0, 0, 0)``.
    """
    letters = numerals = blanks = 0
    if not pd.isna(text_sample):
        # Single pass; each str predicate returns a bool that adds as 0 or 1.
        for ch in text_sample:
            letters += ch.isalpha()
            numerals += ch.isdigit()
            blanks += ch.isspace()
    return letters, numerals, blanks

# Sample up to 1000 random texts for character analysis; the fixed seed keeps
# the reported percentages reproducible between runs.
sample_df = df.sample(min(1000, len(df)), random_state=42)
sample_stats = sample_df['text'].apply(get_char_types)

# Mean number of characters of each class per sampled text.
alphabet_chars = np.mean([x[0] for x in sample_stats])
digit_chars = np.mean([x[1] for x in sample_stats])
space_chars = np.mean([x[2] for x in sample_stats])

# The denominator has to be the mean *full* text length. Dividing by the sum
# of the three counted classes would force them to add up to 100% and pin
# "Other" at 0.00%. Missing texts count as length 0, matching get_char_types.
total = np.mean([0 if pd.isna(t) else len(t) for t in sample_df['text']])
other_chars = total - (alphabet_chars + digit_chars + space_chars)

print("\nAverage character composition in sample texts:")
if total > 0:
    print(f"  Alphabetic: {alphabet_chars/total*100:.2f}%")
    print(f"  Digits: {digit_chars/total*100:.2f}%")
    print(f"  Spaces: {space_chars/total*100:.2f}%")
    print(f"  Other (punctuation, etc.): {other_chars/total*100:.2f}%")
else:
    # Empty sample or only empty texts: percentages are undefined.
    print("  No characters found in the sampled texts.")