In [1]:
import numpy as np
import pandas as pd
from scipy import linalg
from matplotlib import pyplot as plt

In [2]:
def get_fresh_boston_data(keep_locations=False, csv_path="boston_crime.csv", min_street_count=350):
    """Load the Boston crime CSV and derive the encoded feature columns.

    Parameters
    ----------
    keep_locations : bool, default False
        If True, return the frame right after feature encoding, with every
        (lower-cased) input column still present. If False, additionally
        drop the incident id and the location columns and replace
        ``street`` with a coarse ``street_type`` column.
    csv_path : str, path-like or file-like, default "boston_crime.csv"
        Source passed straight to ``pandas.read_csv``.
    min_street_count : int, default 350
        Street suffixes seen fewer than this many times are folded into
        ``"OTHER"``.

    Returns
    -------
    pandas.DataFrame
        Lower-cased columns plus ``ucr_lt_1``, ``ucr_lt_2``, ``ucr_lt_3``
        and ``is_weekend``; ``shooting`` is boolean and missing
        ``ucr_part`` values are replaced by ``"Other"``.

    Raises
    ------
    KeyError
        If a column this function relies on is absent from the CSV.
    """
    # Columns read as generic objects (strings / NaN) rather than letting
    # pandas infer a dtype for them.
    object_columns = [
        'INCIDENT_NUMBER',
        'OFFENSE_CODE_GROUP',
        'OFFENSE_DESCRIPTION',
        'DISTRICT',
        'REPORTING_AREA',
        'SHOOTING',
        'OCCURRED_ON_DATE',
        'DAY_OF_WEEK',
        'UCR_PART',
        'STREET',
    ]
    df = pd.read_csv(
        csv_path,
        encoding="ISO-8859-1",
        dtype={column: 'object' for column in object_columns},
    )
    df = df.rename(columns=str.lower)

    # Anything other than the literal 'Y' (including a missing value) is False.
    df["shooting"] = df["shooting"] == 'Y'

    ############# ordinal encoding of ucr_part #############

    # Treat a missing UCR part like the explicit "Other" category. Assign the
    # result back instead of calling fillna(inplace=True) on df["ucr_part"]:
    # the chained in-place form operates on a temporary Series and does not
    # update the frame when pandas Copy-on-Write is active.
    df["ucr_part"] = df["ucr_part"].fillna("Other")

    # The last word of the label gives the severity rank (1 = most severe).
    # "Other" (and therefore NaN) means "no information"; it is ranked 4,
    # below the least severe real part, so all three indicators below are
    # True for it. A label with an unrecognised last word maps to NaN, which
    # makes all three indicators False.
    severity_rank = {"One": 1, "Two": 2, "Three": 3, "Other": 4}
    ucr_rank = df["ucr_part"].str.rsplit(" ", n=1).str[-1].map(severity_rank)

    # "lt" = "severity less than": ucr_lt_k is True when the incident is
    # less severe than part k.
    df["ucr_lt_1"] = ucr_rank > 1
    df["ucr_lt_2"] = ucr_rank > 2
    df["ucr_lt_3"] = ucr_rank > 3  # only True for the made-up rank 4

    df["is_weekend"] = df["day_of_week"].isin(["Saturday", "Sunday"])

    if keep_locations:
        return df

    ############# make the frame blind to neighborhood location #############

    df = df.drop(
        columns=["incident_number", "lat", "long", "location", "district", "reporting_area"]
    )

    # Keep only the last word of the street name (e.g. "ST", "AVE");
    # non-string entries such as NaN become the empty string.
    street_type = df["street"].str.rsplit(" ", n=1).str[-1].fillna("")
    suffix_counts = street_type.value_counts()

    # Values that are always folded into "OTHER" regardless of frequency.
    # NOTE(review): presumably because they are not real street suffixes
    # ('' marks a missing street) -- confirm the reason for 'TER'.
    always_other = ['BROADWAY', 'TER', '']
    fold_into_other = (
        street_type.map(suffix_counts).lt(min_street_count)
        | street_type.isin(always_other)
    )
    df["street_type"] = street_type.mask(fold_into_other, "OTHER")
    df = df.drop(columns=["street"])

    return df

In [4]:
def print_summary_stats(xs,indent=0):
    """Print descriptive statistics for a 1-D collection of numbers.

    Parameters
    ----------
    xs : array-like of numbers
        Values to summarise.
    indent : int, default 0
        Number of spaces prepended to every printed line.

    Prints the mean, median, mode, standard deviation (``np.std``), min,
    max, element count, number of NaN entries, and the 10/25/50/75/90th
    percentiles. The statistics themselves are not NaN-aware: they are
    computed with the plain numpy functions on ``xs`` as given.
    """
    pad = " "*indent
    print(pad+"mean:\t", np.mean(xs))
    print(pad+"median:\t", np.median(xs))

    # The mode is the most frequent value. np.unique returns the distinct
    # values in ascending order, so argmax picks the smallest value on ties.
    distinct, frequency = np.unique(xs, return_counts=True)
    print(pad+"mode:\t", distinct[np.argmax(frequency)])

    print(pad+"sd:\t", np.std(xs))
    print(pad+"min:\t", np.min(xs))
    print(pad+"max:\t", np.max(xs))
    print(pad+"count:\t", len(xs))
    print(pad+"missing:", np.count_nonzero(np.isnan(xs)))

    print(pad+"Percentiles:")
    for p in [10,25,50,75,90]:
        print(pad+"\t",str(p) + "%:", np.percentile(xs,p))

In [5]:
def sample_shooting_data(train_percentage=None, test_percentage=0.2, data=None, seed=None):
    """Draw a train/test split stratified on the boolean ``shooting`` column.

    Shooting and non-shooting rows are sampled separately and without
    replacement, so both splits keep the class proportions of the frame.

    Parameters
    ----------
    train_percentage : float, optional
        Fraction of each class placed in the training split. Defaults to
        ``1 - test_percentage``.
    test_percentage : float, default 0.2
        Fraction of each class placed in the test split.
    data : pandas.DataFrame, optional
        Frame with a ``shooting`` column. Defaults to the notebook-level
        ``df``.
    seed : int, optional
        If given, sampling uses ``np.random.default_rng(seed)`` and is
        reproducible. If omitted, the global ``np.random`` state is used.

    Returns
    -------
    (list of int, list of int)
        Row positions of the training rows and of the test rows; the two
        lists are disjoint and their order carries no meaning. Positions
        equal index labels when the frame has the default RangeIndex.

    Raises
    ------
    ValueError
        If a requested sample size is negative, or if the two splits
        together ask for more rows of a class than exist.
    """
    if train_percentage is None:
        train_percentage = 1 - test_percentage
    if data is None:
        data = df  # fall back to the frame defined at notebook level
    rng = np.random if seed is None else np.random.default_rng(seed)

    # One vectorised comparison instead of a scalar .loc lookup per row.
    is_shooting = data["shooting"].eq(True).to_numpy()

    train, test = [], []
    for positions in (np.flatnonzero(is_shooting), np.flatnonzero(~is_shooting)):
        n_train = int(train_percentage * len(positions))
        n_test = int(test_percentage * len(positions))
        if n_train < 0 or n_test < 0 or n_train + n_test > len(positions):
            raise ValueError(
                "Cannot draw %d train and %d test rows without replacement "
                "from a class of %d rows" % (n_train, n_test, len(positions))
            )
        # A single shuffle, cut into two consecutive slices, yields disjoint
        # train and test samples for this class.
        shuffled = rng.permutation(positions)
        train.extend(shuffled[:n_train].tolist())
        test.extend(shuffled[n_train:n_train + n_test].tolist())

    return train, test

In [3]:
# Notebook-wide crime frame in its location-blind form (keep_locations=False
# drops the incident id and location columns and adds street_type).
df = get_fresh_boston_data(keep_locations=False)