# Project 1
- NYPD Arrest Data Analysis
- This project analyzes NYPD arrest data using both `pandas` and the Python standard library.
- It calculates the mean, median, and mode of arrest precincts, and includes text-based
visualizations of arrests by precinct.
- source: https://data.cityofnewyork.us/Public-Safety/NYPD-Arrest-Data-Year-to-Date-/uip8-fykc/about_data

In [32]:
import pandas as pd

df_arrest = pd.read_csv("NYPD_Arrest_Data.csv")

In [33]:
# Explore columns
df_arrest.info()
df_arrest.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212486 entries, 0 to 212485
Data columns (total 19 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ARREST_KEY         212486 non-null  int64  
 1   ARREST_DATE        212486 non-null  object 
 2   PD_CD              212486 non-null  int64  
 3   PD_DESC            212486 non-null  object 
 4   KY_CD              212470 non-null  float64
 5   OFNS_DESC          212486 non-null  object 
 6   LAW_CODE           212486 non-null  object 
 7   LAW_CAT_CD         211482 non-null  object 
 8   ARREST_BORO        212486 non-null  object 
 9   ARREST_PRECINCT    212486 non-null  int64  
 10  JURISDICTION_CODE  212486 non-null  int64  
 11  AGE_GROUP          212486 non-null  object 
 12  PERP_SEX           212486 non-null  object 
 13  PERP_RACE          212486 non-null  object 
 14  X_COORD_CD         212486 non-null  int64  
 15  Y_COORD_CD         212486 non-null  int64  
 16  La

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Location
0,298760433,01/02/2025,782,"WEAPONS, POSSESSION, ETC",236.0,DANGEROUS WEAPONS,PL 2650101,M,Q,115,3,(null),(null),BLACK,0,0,0.0,0.0,POINT (0 0)
1,299030225,01/07/2025,105,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,M,28,0,25-44,M,BLACK,997439,233857,40.808558,-73.952357,POINT (-73.952357 40.808558)
2,299127494,01/08/2025,849,"NY STATE LAWS,UNCLASSIFIED VIO",677.0,OTHER STATE LAWS,LOC00000V0,V,K,81,1,(null),(null),WHITE,0,0,0.0,0.0,POINT (0 0)
3,299188536,01/09/2025,259,"CRIMINAL MISCHIEF,UNCLASSIFIED 4",351.0,CRIMINAL MISCHIEF & RELATED OF,PL 1450001,M,M,7,2,(null),(null),BLACK,0,0,0.0,0.0,POINT (0 0)
4,299533742,01/16/2025,155,RAPE 2,104.0,RAPE,PL 1303001,F,K,81,0,18-24,F,BLACK,1005319,190473,40.689464,-73.924029,POINT (-73.9240290899499 40.6894642952604)


In [34]:
# Using pandas

# Pick one numeric column - arrest precinct
col = "ARREST_PRECINCT"

mean = df_arrest[col].mean()
median = df_arrest[col].median()
mode = df_arrest[col].mode()[0]

print("Mean:", mean)
print("Median:", median)
print("Mode:", mode)

Mean: 63.07742627749593
Median: 62.0
Mode: 14


In [35]:
# The harder way
with open("NYPD_Arrest_Data.csv", "r") as f:
    first_line = f.readline()
    lines = f.readlines()

print(first_line)

"ARREST_KEY","ARREST_DATE","PD_CD","PD_DESC","KY_CD","OFNS_DESC","LAW_CODE","LAW_CAT_CD","ARREST_BORO","ARREST_PRECINCT","JURISDICTION_CODE","AGE_GROUP","PERP_SEX","PERP_RACE","X_COORD_CD","Y_COORD_CD","Latitude","Longitude","Location"



In [37]:
# Split CSV line by commas, but ignore commas inside quotes
def smart_split(line):
    parts = []
    current = ''
    inside_quotes = False

    for ch in line:
        if ch == '"':
            inside_quotes = not inside_quotes
        elif ch == ',' and not inside_quotes:
            parts.append(current)
            current = ''
        else:
            current += ch
    parts.append(current)
    return parts

In [38]:
# The hard way -- Continued
header = [h.replace('"', '').strip() for h in first_line.split(",")]
col_index = header.index("ARREST_PRECINCT")

data = []
for line in lines[1:]:
    parts = smart_split(line.strip())
    if len(parts) > col_index:
        value = parts[col_index].replace('"', '').strip()
        if value.isdigit():
            data.append(int(value))

# Mean
mean_h = sum(data) / len(data)

# Median
sorted_data = sorted(data)
n = len(sorted_data)
if n % 2 == 0:
    median_h = (sorted_data[n//2 - 1] + sorted_data[n//2]) / 2
else:
    median_h = sorted_data[n//2]

# Mode
freq = {}
for val in data:
    if val in freq:
        freq[val] = freq[val] + 1
    else:
        freq[val] = 1

most_common = None
highest_count = 0
for val in freq:
    if freq[val] > highest_count:
        highest_count = freq[val]
        most_common = val

mode_h = most_common

print("Mean:", mean_h)
print("Median:", median_h)
print("Mode:", mode_h)

Mean: 63.077181918723674
Median: 62
Mode: 14


In [None]:
# Data visualization

# 1. All precincts
# Count how many arrests per precinct
precinct_counts = df_arrest[col].value_counts().sort_index()

# Scale bar lengths so it fits nicely on screen
max_count = precinct_counts.max()
scale = 50 / max_count  # each bar up to 50 chars long

print("\nArrests by Precinct (each █ ~ proportional to arrests)")
print("=" * 60)

# Draw bar chart using print() and string multiplication
for precinct, count in precinct_counts.items():
    bar = "█" * int(count * scale)
    print(f"{int(precinct):>3}: {bar} ({count})")

print("=" * 60)




Arrests by Precinct (each █ ~ proportional to arrests)
  1: ██████████████████████ (3510)
  5: ██████████████████ (2880)
  6: ███████████ (1905)
  7: ██████████ (1593)
  9: ██████████ (1613)
 10: █████████ (1456)
 13: ███████████████████ (3029)
 14: ██████████████████████████████████████████████████ (7947)
 17: █████ (899)
 18: ██████████████████████████ (4205)
 19: ████████████ (2005)
 20: ██████ (1045)
 22:  (92)
 23: █████████████ (2201)
 24: █████████ (1461)
 25: ████████████████ (2624)
 26: ███████ (1143)
 28: ███████████████ (2389)
 30: ████████ (1428)
 32: ██████████████ (2367)
 33: ████████████ (1933)
 34: ███████████████████ (3067)
 40: ██████████████████████████████████████████████ (7423)
 41: ███████████████████████ (3748)
 42: ████████████████████ (3279)
 43: ████████████████████████ (3819)
 44: █████████████████████████████████████ (5978)
 45: ██████████████ (2283)
 46: ████████████████████████████████ (5104)
 47: ████████████████████████████ (4560)
 48: █████████████████

In [None]:
# Show the top 10 precincts so that I can match the precinct numbers with their names
df_arrest["ARREST_PRECINCT"].value_counts().head(10)


ARREST_PRECINCT
14     7947
40     7423
75     6953
103    6010
44     5978
46     5104
120    4877
73     4580
47     4560
110    4409
Name: count, dtype: int64

In [42]:
# 2. Top 10 precincts with names

# Top 10 precincts and their counts
top10 = df_arrest["ARREST_PRECINCT"].value_counts().head(10)

# Precinct name mapping (from NYPD)
precinct_names = {
    14: "Midtown South (Manhattan)",
    40: "Bronx – Mott Haven",
    75: "Brooklyn – East New York",
    103: "Queens – Jamaica",
    44: "Bronx – Concourse",
    46: "Bronx – University Heights",
    120: "Staten Island – St. George",
    73: "Brooklyn – Brownsville",
    47: "Bronx – Edenwald/Wakefield",
    110: "Queens – Elmhurst"
}

print("\nTop 10 Precincts by Arrests (each █ ~ proportional to arrests)")
print("=" * 70)
max_top = top10.max()
scale_top = 50 / max_top

for precinct, count in top10.items():
    bar = "█" * int(count * scale_top)
    name = precinct_names.get(int(precinct), "")
    print(f"{int(precinct):>3}: {bar} ({count}) {name}")
print("=" * 70)


Top 10 Precincts by Arrests (each █ ~ proportional to arrests)
 14: ██████████████████████████████████████████████████ (7947) Midtown South (Manhattan)
 40: ██████████████████████████████████████████████ (7423) Bronx – Mott Haven
 75: ███████████████████████████████████████████ (6953) Brooklyn – East New York
103: █████████████████████████████████████ (6010) Queens – Jamaica
 44: █████████████████████████████████████ (5978) Bronx – Concourse
 46: ████████████████████████████████ (5104) Bronx – University Heights
120: ██████████████████████████████ (4877) Staten Island – St. George
 73: ████████████████████████████ (4580) Brooklyn – Brownsville
 47: ████████████████████████████ (4560) Bronx – Edenwald/Wakefield
110: ███████████████████████████ (4409) Queens – Elmhurst
