In [1]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

Cloning into 'police-records-project'...
remote: Enumerating objects: 116, done.[K
remote: Counting objects: 100% (116/116), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 116 (delta 59), reused 95 (delta 38), pack-reused 0 (from 0)[K
Receiving objects: 100% (116/116), 104.89 KiB | 10.49 MiB/s, done.
Resolving deltas: 100% (59/59), done.
/content/police-records-project
Collecting pymupdf (from -r requirements.txt (line 8))
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdf2image (from -r requirements.txt (line 9))
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract (from -r requirements.txt (line 10))
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting setfit (from -r requirements.txt (line 14))
  Downloading setfit-1.1.3-py3-none-any.whl.metadata (12 kB)
Collecting evaluate>=0.3.0 (from setfit->-r requirements.txt (line 14))
  Downloading evaluat

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

# Single seed value for the notebook; applied here to NumPy's global
# (legacy) random number generator.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [4]:
# Location of the extracted-data CSV on the mounted Drive.
# NOTE: despite its name, `data_dir` holds the path of a file, not a directory.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed/extracted_data.csv"

# Read the CSV into the DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=data_dir)

In [5]:
# Class distribution: how many rows of `df` carry each value of `label`.
class_counts = df.loc[:, 'label'].value_counts()
class_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
emails-memorandum-correspondence,32
reports-supplemental,17
reports-incident,15
discovery-package,8
reports-use-of-force,8
reports-investigation,5
police-commision-agenda,4
press-release,3
reports-coroners,3
reports-criminal,2


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of `word_count`.
df.loc[:, 'word_count'].describe()

Unnamed: 0,word_count
count,98.0
mean,5420.836735
std,10526.293245
min,104.0
25%,501.25
50%,1700.0
75%,5984.75
max,68132.0


In [7]:
# Documents per class, drawn as a horizontal bar chart.
# Build a two-column frame ('label', 'count') from the label frequencies;
# naming the index and the values explicitly yields the same columns as
# assigning them after reset_index().
class_counts_df = (
    df['label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)

fig1 = px.bar(
    data_frame=class_counts_df,
    x='count',
    y='label',
    orientation='h',
    title='Documents per Class',
    text='count',                      # print the count on each bar
    color='count',                     # shade bars by their count
    color_continuous_scale='Bluyl',
)

# Order the bars by their total so the largest class ends up at the top.
fig1.update_layout(
    yaxis=dict(categoryorder='total ascending'),
    height=500,
)
fig1.show()

# Document length distribution: histogram of `word_count` with a box plot
# in the margin.
fig2 = px.histogram(
    data_frame=df,
    x='word_count',
    nbins=30,
    marginal='box',
    title='Document Length Distribution',
    color_discrete_sequence=['coral'],
)

# Dashed red vertical line at the median word count.
fig2.add_vline(
    x=df['word_count'].median(),
    line_dash="dash",
    line_color="red",
    annotation_text="Median",
)

# Thin black outline on the traces' markers so adjacent bars are distinguishable.
fig2.update_traces(
    marker_line_width=1,
    marker_line_color="black",
)
fig2.show()