In [1]:
## imports
import pandas as pd
import numpy as np
import random

## print multiple things from same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from datetime import datetime, timedelta

# Load data

In [2]:
## load data on 2020 crimes in DC
dc_crim_2020 = pd.read_csv("https://opendata.arcgis.com/datasets/f516e0dd7b614b088ad781b0c4002331_2.csv")

## create date of crime
dc_crim_2020['report_day'] = pd.to_datetime(dc_crim_2020.REPORT_DAT).dt.date

# 1. Example task and function

**Task**: write a function that takes in two arguments--a dataframe and an integer of a Ward number
    
- The function should subset to:
    - That ward
    - The ward immediately "below" that ward (if focal ward is Ward 2, Ward 1)
    - The ward immediately "above" that ward (if focal ward is Ward 2, Ward 3)
- Find the number of unique crime reports (unique CCN) in each of those 3 wards
- Should print the name + # of crimes in the ward with the most unique crime reports of that comparison set 

For now, don't worry about cases where you input a number outside of 2-7 (so all wards inputted will have a ward above and below)

## 1.1 Step one: before writing the function, write the code that goes inside the "meat" of the function 

To do so, focus on making the code work for ward 3

In [3]:
## first, get neighboring wards
## for ward 3
neighbor_wards = [3 - 1, 3 + 1]
wards_touse = [3] + neighbor_wards 
wards_touse

## then, use isin command to subset the data
## to those words
df_focal = dc_crim_2020[dc_crim_2020.WARD.isin(wards_touse)].copy()
df_focal.WARD.value_counts()

## then, use groupby to find unique
ward_ccn = df_focal.groupby('WARD')['CCN'].nunique().reset_index()
ward_ccn

## finally, get the top one (multiple ways)
top_ward = ward_ccn.sort_values(by = 'CCN', ascending = False).head(1)
print("Ward with most reports of neighbors is WARD " + str(top_ward['WARD'].values[0]) +
     " with N reports: " + str(top_ward.CCN.values[0]))

[3, 2, 4]

2    4685
4    2702
3    1708
Name: WARD, dtype: int64

Unnamed: 0,WARD,CCN
0,2,4685
1,3,1708
2,4,2700


Ward with most reports of neighbors is WARD 2 with N reports: 4685


## 1.2 Step two: generalize OUTSIDE the function by creating objects that contain what you'll eventually pass in as parameters

In [4]:
## creating one object with focal ward
## and using it in first part of code
focal_ward = 3
neighbor_wards = [focal_ward - 1, focal_ward + 1]
wards_touse = [focal_ward] + neighbor_wards 

## creating another object with the dataframe
## and then replacing the specific dataframe name (dc_crim_2020)
df = dc_crim_2020.copy()
df_focal = df[df.WARD.isin(wards_touse)].copy()

## then, use groupby to find unique
ward_ccn = df_focal.groupby('WARD')['CCN'].nunique().reset_index()

## finally, get the top one (multiple ways)
top_ward = ward_ccn.sort_values(by = 'CCN', ascending = False).head(1)
print("Ward with most reports of neighbors is WARD " + str(top_ward['WARD'].values[0]) +
     " with N reports: " + str(top_ward.CCN.values[0]))

Ward with most reports of neighbors is WARD 2 with N reports: 4685


## 1.3 Step three: copy and paste that code into a function, add parameters, and add documentation 


In [5]:
def compare_wards(focal_ward: int, df: pd.DataFrame):
    """
    Prints the ward with the most crime reports of its two neighbors
    and the number of reports in that ward
        Parameters:
            focal_ward (int): An integer between 2 and 7 for wards
            df (pd.DataFrame): Dataframe to find the wards in

        Returns:
            Nothing
    """
    ## get list of wards to use
    neighbor_wards = [focal_ward - 1, focal_ward + 1]
    wards_touse = [focal_ward] + neighbor_wards 
    ## subset to those
    df_focal = df[df.WARD.isin(wards_touse)].copy()
    ## find crimes per ward
    ward_ccn = df_focal.groupby('WARD')['CCN'].nunique().reset_index()
    ## finally, get the top one 
    top_ward = ward_ccn.sort_values(by = 'CCN', ascending = False).head(1)
    ## print
    print("Ward with most reports of neighbors is WARD " + \
          str(top_ward['WARD'].values[0]) +
     " with N reports: " + str(top_ward.CCN.values[0]) + \
    " out of following wards " + "; ".join([str(x) for x in sorted(wards_touse)]))

## 1.4 Step four: execute the function

Execute the function two times: once with ward 3 and another time with ward 6

In [6]:
compare_wards(focal_ward = 3, df = dc_crim_2020)
compare_wards(focal_ward = 6, df = dc_crim_2020)


Ward with most reports of neighbors is WARD 2 with N reports: 4685 out of following wards 2; 3; 4
Ward with most reports of neighbors is WARD 5 with N reports: 4391 out of following wards 5; 6; 7


Repeat the execution but instead of copy/pasting two separate lines of code, use list comprehension to execute repeatedly

In [7]:
[compare_wards(focal_ward = i, df = dc_crim_2020)
for i in [3, 6]]

Ward with most reports of neighbors is WARD 2 with N reports: 4685 out of following wards 2; 3; 4
Ward with most reports of neighbors is WARD 5 with N reports: 4391 out of following wards 5; 6; 7


[None, None]

## 1.5 Step five: modify as needed to handle more cases

**Example**: if `focal_ward` is outside 2-7 range (either integer outside that range or wrong type), skip the calculation and print statements that tell user correct format for parameter to pass in

Test these statements by executing the function three times and only last one should print the crime counts:
- One with ward as a string
- Another with ward equal to 20
- A third time with ward equal to 5 

In [8]:
def compare_wards(focal_ward: int, df: pd.DataFrame):
    """
    Prints the ward with the most crime reports of its two neighbors
    and the number of reports in that ward
        Parameters:
            focal_ward (int): An integer between 2 and 7 for wards
            df (pd.DataFrame): Dataframe to find the wards in

        Returns:
            Nothing
    """
    ## print statement if input not correct
    if not isinstance(focal_ward, int):
        print("Try again; focal ward needs to be integer")
    elif focal_ward < 2 or focal_ward > 7:
        print("Try again; focal ward integer needs to be 2 through 7")
    else:
        ## get list of wards to use
        neighbor_wards = [focal_ward - 1, focal_ward + 1]
        wards_touse = [focal_ward] + neighbor_wards 
        ## subset to those
        df_focal = df[df.WARD.isin(wards_touse)].copy()
        ## find crimes per ward
        ward_ccn = df_focal.groupby('WARD')['CCN'].nunique().reset_index()
        ## finally, get the top one 
        top_ward = ward_ccn.sort_values(by = 'CCN', ascending = False).head(1)
        ## print
        print("Ward with most reports of neighbors is WARD " + \
          str(top_ward['WARD'].values[0]) +
         " with N reports: " + str(top_ward.CCN.values[0]) + \
        " out of following wards " + "; ".join([str(x) for x in sorted(wards_touse)]))

In [9]:
compare_wards(focal_ward = '5', df = dc_crim_2020)
compare_wards(focal_ward = 20, df = dc_crim_2020)
compare_wards(focal_ward = 5, df = dc_crim_2020)

Try again; focal ward needs to be integer
Try again; focal ward integer needs to be 2 through 7
Ward with most reports of neighbors is WARD 5 with N reports: 4391 out of following wards 4; 5; 6


# 2. Group activity: writing your own function

**Task**: we want to start to explore clustering in the timing/location of crimes. 

For each crime in CCN_examples, we want to want to look in the remaining crime reports for crime reports that are:

- Located in the same ward as the focal crime
- Reported within 2 days (plus or minus; inclusive) of the focal crime's date (`report_day`)

We want the function to return the percentage of those same-ward, near-time crimes that have the same `OFFENSE` as the focal crime 

In [10]:
## CCN examples
CCN_examples = ['20165648', '20123250']

## 2.1 Here's code that finds matches for the first crime in CCN examples

You can adapt this code for your function

In [11]:
## information for focal crime
crime_num = CCN_examples[0]
focal_crime_df = dc_crim_2020[dc_crim_2020.CCN.astype(str) == crime_num].copy()
focal_crime_df

## crimes in same ward that are not the focal crime
crimes_lookin = dc_crim_2020[~dc_crim_2020.CCN.isin([crime_num])].copy()
same_ward = crimes_lookin[crimes_lookin.WARD.isin(focal_crime_df.WARD)].copy()
same_ward[['CCN', 'WARD']].head()

## get the dates 2 days before and after
lower_bound = focal_crime_df.report_day - timedelta(days = 2)
upper_bound = focal_crime_df.report_day + timedelta(days = 2)

lower_bound
upper_bound

Unnamed: 0,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,CENSUS_TRACT,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,report_day
26219,-77.00748,38.904698,20165648,2020/11/20 02:25:50+00,EVENING,OTHERS,MOTOR VEHICLE THEFT,1 - 99 BLOCK OF PIERCE STREET NE,399351.44,137425.96,...,10603.0,Precinct 144,38.90469,-77.007477,NOMA,2020/11/20 00:45:25+00,2020/11/20 00:50:03+00,276322035,,2020-11-20


Unnamed: 0,CCN,WARD
9,20011239,6
12,20011260,6
15,20011280,6
36,20015147,6
42,20015199,6


26219    2020-11-18
Name: report_day, dtype: object

26219    2020-11-22
Name: report_day, dtype: object

In [12]:

## filter to same ward within that buffer
sw_timebuff = same_ward[(same_ward.report_day >= lower_bound.values[0]) &
         (same_ward.report_day <= upper_bound.values[0])]

## % of reports that are same offense as focal report
sw_timebuff[sw_timebuff.OFFENSE == focal_crime_df.OFFENSE.values[0]].shape[0]/sw_timebuff.shape[0]

0.16129032258064516

## 2.2 Generalize into a function that:

- Takes in an arbitrary CCN identifier string
- Takes in an arbitrary number of days to look for crimes in range of (doesn't need to be flexible enough to handle other time buffers like minutes)

Execute the function with the first crime we focus on: `20165648'

In [13]:
## your code here
def get_similar_crime_prop(crime_num: str, num_days: int):
    """
    Returns the proportion of crimes in the same ward and +/- num_days days of the focal crime that have the same OFFENSE as the focal crime
    Parameters:
        crime_num (string): Focal crime CCN number
        num_days (int): Number of days to look before and after focal crime

    Returns:
        float: Float representing the proportion
    """
    ## information for focal crime
    focal_crime_df = dc_crim_2020[dc_crim_2020.CCN.astype(str) == crime_num].copy()

    ## crimes in same ward that are not the focal crime
    crimes_lookin = dc_crim_2020[~dc_crim_2020.CCN.isin([crime_num])].copy()
    same_ward = crimes_lookin[crimes_lookin.WARD.isin(focal_crime_df.WARD)].copy()

    ## get the dates 2 days before and after
    lower_bound = focal_crime_df.report_day - timedelta(days = num_days)
    upper_bound = focal_crime_df.report_day + timedelta(days = num_days)

    ## filter to same ward within that buffer
    sw_timebuff = same_ward[(same_ward.report_day >= lower_bound.values[0]) &
            (same_ward.report_day <= upper_bound.values[0])]

    ## % of reports that are same offense as focal report
    return sw_timebuff[sw_timebuff.OFFENSE == focal_crime_df.OFFENSE.values[0]].shape[0]/sw_timebuff.shape[0]
    
get_similar_crime_prop("20165648", 2)

0.16129032258064516

## 2.3 Use list comprehension to iterate over the two focal crimes in CCN_examples and execute the function repeatedly

Since the function returns a specific value (the percentage), eventually store the two results in a pandas dataframe that also contains the crime ID (CCN) that corresponds to that percentage

In [14]:
## your code here
rates = [get_similar_crime_prop(ccn, 2) for ccn in CCN_examples]

similar_crime_rate = pd.DataFrame()
similar_crime_rate["CCN"] = CCN_examples
similar_crime_rate["similar_crime_rate"] = rates

similar_crime_rate

Unnamed: 0,CCN,similar_crime_rate
0,20165648,0.16129
1,20123250,0.136364
