In [None]:
import os

if not os.path.exists('/content/police-records-project'):
    !git clone https://github.com/c-goenka/police-records-project.git
    %cd /content/police-records-project
    !pip install -r requirements.txt
else:
    %cd /content/police-records-project

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# Seed NumPy's global random generator so that any code drawing from
# `np.random` produces the same values on every Restart & Run All.
# Requires `numpy` to be imported as `np` in this cell's import block.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Read the processed CSV from the mounted Drive into `df`.
# NOTE: despite its name, `data_dir` holds the path of a single file,
# not a directory.
data_dir = "/content/drive/MyDrive/police-records-project-data/processed/extracted_data.csv"
df = pd.read_csv(filepath_or_buffer=data_dir)

In [None]:
# Class distribution: number of rows for each distinct value of `label`.
# The result is kept in `class_counts` and displayed as the cell output.
class_counts = df.loc[:, 'label'].value_counts()
class_counts

In [None]:
# Word count statistics: pandas' descriptive summary of the `word_count`
# column, shown as the cell output.
df.loc[:, 'word_count'].describe()

In [None]:
# Documents per class, drawn as a horizontal bar chart.
# Build a two-column frame (label, count) from the label frequencies; the
# columns are renamed explicitly so the names do not depend on what
# `reset_index()` happens to call them.
class_counts_df = (
    df['label']
    .value_counts()
    .reset_index()
    .set_axis(['label', 'count'], axis=1)
)

# One bar per label; the count is used for bar length, bar text and colour.
fig1 = px.bar(
    data_frame=class_counts_df,
    x='count',
    y='label',
    text='count',
    color='count',
    color_continuous_scale='Bluyl',
    orientation='h',
    title='Documents per Class',
)

# Order the label axis by bar total and fix the figure height.
fig1.update_layout(height=500, yaxis=dict(categoryorder='total ascending'))
fig1.show()

# Document length distribution: histogram of `word_count` (nbins=30) with a
# marginal box plot.
fig2 = px.histogram(
    data_frame=df,
    x='word_count',
    nbins=30,
    marginal='box',
    color_discrete_sequence=['coral'],
    title='Document Length Distribution',
)

# Dashed red vertical reference line at the median word count.
fig2.add_vline(
    x=df['word_count'].median(),
    line_dash="dash",
    line_color="red",
    annotation_text="Median",
)

# Give every trace's markers a 1-unit black outline.
fig2.update_traces(marker=dict(line=dict(width=1, color="black")))
fig2.show()