# Clean for visualizations

In [12]:
import pandas as pd
from io import StringIO
import datetime
import os
import re

# Relax pandas' display limits so wide/long frames and long text cells
# are shown in full instead of being truncated in notebook output.
pd.set_option(
    "display.max_rows", 1000,
    "display.max_columns", 1000,
    "display.max_colwidth", None,  # None = no limit on cell text width
)

In [13]:
# Read the BART crime export (path is relative to the working directory)
# and preview the first five rows.
crimes = pd.read_csv(filepath_or_buffer="bart_crime_updated.csv")
crimes.head(n=5)

Unnamed: 0,description,incident_num,location,agency,date,time
0,DRUG EQUIPMENT VIOLATIONS,2510-0416,2000 BLOCK Mission St,BART Police,10-7-2025,2:08 PM
1,DRUG/NARCOTIC VIOLATIONS,2510-0174,2000 BLOCK Mission St,BART Police,10-3-2025,12:35 PM
2,ALL OTHER LARCENY,2510-0208,2000 BLOCK Mission St,BART Police,10-3-2025,9:40 AM
3,DRUG/NARCOTIC VIOLATIONS,2510-0129,2000 BLOCK Mission St,BART Police,10-2-2025,6:39 PM
4,FALSE PRETENSE/SWINDLE/CONFIDENCE GAME,2510-0032,2000 BLOCK Mission St,BART Police,10-1-2025,1:16 PM


### Clean data

In [14]:
# Free-text columns: lowercase and trim surrounding whitespace.
for text_col in ('description', 'location', 'agency'):
    crimes[text_col] = crimes[text_col].str.lower().str.strip()

# Identifier / timestamp strings: trim only, keep original casing.
for raw_col in ('incident_num', 'date', 'time'):
    crimes[raw_col] = crimes[raw_col].str.strip()

# Parse month-day-year strings into datetimes, then derive a
# "Mon YYYY" label used for monthly grouping.
parsed_dates = pd.to_datetime(crimes['date'], format='%m-%d-%Y')
crimes['date'] = parsed_dates
crimes['month_year'] = parsed_dates.dt.strftime('%b %Y')

# Newest incidents first, with a fresh 0..n-1 index.
crimes = crimes.sort_values(by='date', ascending=False).reset_index(drop=True)

In [15]:
# Preview the first five rows of the cleaned, date-sorted frame.
crimes.head(n=5)

Unnamed: 0,description,incident_num,location,agency,date,time,month_year
0,simple assault,2510-1447,2000 block mission st,bart police,2025-10-23,6:14 PM,Oct 2025
1,intimidation,2510-1179,2000 block mission st,bart police,2025-10-19,10:57 PM,Oct 2025
2,drug equipment violations,2510-1028,2000 block mission st,bart police,2025-10-17,8:03 AM,Oct 2025
3,simple assault,2510-0983,2000 block mission st,bart police,2025-10-16,3:34 PM,Oct 2025
4,drug equipment violations,2510-0988,2000 block mission st,bart police,2025-10-16,4:20 PM,Oct 2025


### Graphs 

1. All crimes per month
2. Small-multiples column chart, broken down by crime description

In [16]:
# Number of incidents per offence description.
crimes.loc[:, "description"].value_counts()

description
drug/narcotic violations                    17
simple assault                              11
drug equipment violations                   11
all other larceny                            5
disorderly conduct                           3
intimidation                                 2
destructive/damage/vandalism of property     2
false pretense/swindle/confidence game       1
arson                                        1
Name: count, dtype: int64

In [23]:
# Incident counts per description as a two-column frame for export.
descriptions_df = (
    crimes["description"]
    .value_counts()
    .reset_index()
    .set_axis(['description', 'count'], axis=1)
)

In [24]:
# Count incidents per month label (e.g. 'Oct 2025').
by_month = crimes["month_year"].value_counts().reset_index()
by_month.columns = ['time', 'count']

# value_counts() orders rows by frequency, which scrambles a per-month
# series. Re-sort chronologically by parsing the labels back with the same
# '%b %Y' format that produced them; the exported labels stay unchanged.
by_month = by_month.sort_values(
    'time',
    key=lambda labels: pd.to_datetime(labels, format='%b %Y'),
).reset_index(drop=True)

In [25]:
# to_csv does not create missing parent folders, so make sure the export
# directory exists before writing (no-op when it is already there).
os.makedirs("viz/bart", exist_ok=True)

# Write the chart inputs without the pandas index column.
descriptions_df.to_csv("viz/bart/crime_desc.csv", index=False)
by_month.to_csv("viz/bart/crime_time.csv", index=False)