In [1]:
import pandas as pd
import numpy as np

In [2]:
data_raw = pd.read_csv('shootings.csv')

## Some simple cleaning

In [3]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10911 entries, 0 to 10910
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   the_geom              10892 non-null  object 
 1   the_geom_webmercator  10892 non-null  object 
 2   objectid              10911 non-null  int64  
 3   year                  10911 non-null  int64  
 4   dc_key                10911 non-null  int64  
 5   code                  10823 non-null  float64
 6   date_                 10911 non-null  object 
 7   time                  10823 non-null  object 
 8   race                  10823 non-null  object 
 9   sex                   10911 non-null  object 
 10  age                   10763 non-null  float64
 11  wound                 10803 non-null  object 
 12  officer_involved      10911 non-null  object 
 13  offender_injured      10911 non-null  object 
 14  offender_deceased     10911 non-null  object 
 15  location           

In [4]:
# rename date column
data1 = data_raw.rename(columns = {"date_": "date"})

In [5]:
def remove_col(df, list_of_col = ['the_geom', 'the_geom_webmercator','dc_key', 'code']):
    '''
    This function removes multiple columns at once.
    '''
    
    df = df.drop(columns = list_of_col)
    return df

In [6]:
data1 = remove_col(data1)
data1

Unnamed: 0,objectid,year,date,time,race,sex,age,wound,officer_involved,offender_injured,...,location,latino,point_x,point_y,dist,inside,outside,fatal,lat,lng
0,1525444,2021,10/30/2021,23:12:00,B,M,28.0,Multiple,N,N,...,2700 BLOCK S 54TH ST,0.0,-81.581379,28.419715,12.0,0.0,1.0,0.0,28.419715,-81.581379
1,1525466,2021,11/3/2021,0:14:00,W,M,36.0,Back,N,N,...,2000 BLOCK E TIOGA ST,0.0,-75.103619,39.997162,24.0,0.0,1.0,1.0,39.997162,-75.103619
2,1525467,2021,11/3/2021,2:39:00,A,M,52.0,Multiple,N,N,...,2200 BLOCK S 23RD ST,0.0,-75.184806,39.924141,1.0,0.0,1.0,0.0,39.924141,-75.184806
3,1525468,2021,11/3/2021,2:39:00,A,F,47.0,Multiple,N,N,...,2200 BLOCK S 23RD ST,0.0,-75.184806,39.924141,1.0,0.0,1.0,0.0,39.924141,-75.184806
4,1525469,2021,11/3/2021,2:51:00,B,M,22.0,Head,N,N,...,300 BLOCK E TUSCULUM ST,0.0,-75.126196,39.991840,24.0,0.0,1.0,1.0,39.991840,-75.126196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,1515213,2018,3/20/2018,6:22:00,A,M,53.0,torso,N,N,...,6500 BLOCK Kingsessing Ave,0.0,-75.237622,39.927305,12.0,0.0,1.0,1.0,39.927305,-75.237622
10907,1515214,2018,3/20/2018,6:22:00,A,M,50.0,head,N,N,...,6500 BLOCK Kingsessing Ave,0.0,-75.237622,39.927305,12.0,0.0,1.0,1.0,39.927305,-75.237622
10908,1515215,2018,2/11/2018,14:18:00,B,M,23.0,multi,N,N,...,2400 BLOCK S 75th St,0.0,-75.246248,39.911314,12.0,0.0,1.0,1.0,39.911314,-75.246248
10909,1515216,2018,2/11/2018,14:18:00,B,M,24.0,buttocks,N,N,...,2400 BLOCK S 75th St,0.0,-75.246248,39.911314,12.0,0.0,1.0,0.0,39.911314,-75.246248


In [7]:
# check duplicates - there is no duplicates found
data1.duplicated().sum()

0

In [8]:
print(data1.officer_involved.value_counts())
print(data1.offender_injured.value_counts())
print(data1.offender_deceased.value_counts())

N    10823
Y       88
Name: officer_involved, dtype: int64
N    10857
Y       54
Name: offender_injured, dtype: int64
N    10892
Y       19
Name: offender_deceased, dtype: int64


In [9]:
# change col officer_involved, offender_injured, offender_deceased to binary
data1.officer_involved.replace({'N': 0, 'Y':1}, inplace = True)
data1.offender_injured.replace({'N': 0, 'Y':1}, inplace = True)
data1.offender_deceased.replace({'N': 0, 'Y':1}, inplace = True)

print(data1.officer_involved.value_counts())
print(data1.offender_injured.value_counts())
print(data1.offender_deceased.value_counts())

0    10823
1       88
Name: officer_involved, dtype: int64
0    10857
1       54
Name: offender_injured, dtype: int64
0    10892
1       19
Name: offender_deceased, dtype: int64


In [10]:
data1.fatal.value_counts()

0.0    8690
1.0    2133
Name: fatal, dtype: int64

In [11]:
def convert_to_bool(df, col):
    '''
    This function changes the binary col into boolean data type (limited by np.nan/ None).
    '''
    df[col].replace({0: False, 1:True}, inplace = True)
    
    return df
    

In [12]:
# convert the following columns into boolean. Yet, when there are nan/ none, the data type cant be changed to boolean.

data1 = convert_to_bool(data1, 'officer_involved')
data1 = convert_to_bool(data1, 'offender_injured')
data1 = convert_to_bool(data1, 'offender_deceased')
data1 = convert_to_bool(data1, 'inside')
data1 = convert_to_bool(data1, 'outside')
data1 = convert_to_bool(data1, 'fatal')

In [13]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10911 entries, 0 to 10910
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   objectid           10911 non-null  int64  
 1   year               10911 non-null  int64  
 2   date               10911 non-null  object 
 3   time               10823 non-null  object 
 4   race               10823 non-null  object 
 5   sex                10911 non-null  object 
 6   age                10763 non-null  float64
 7   wound              10803 non-null  object 
 8   officer_involved   10911 non-null  bool   
 9   offender_injured   10911 non-null  bool   
 10  offender_deceased  10911 non-null  bool   
 11  location           10911 non-null  object 
 12  latino             10823 non-null  float64
 13  point_x            10903 non-null  float64
 14  point_y            10903 non-null  float64
 15  dist               10909 non-null  float64
 16  inside             108

In [14]:
from datetime import datetime
datetime.strptime((data1['date'][0] + "-" + data1['time'][0]), '%m/%d/%Y-%H:%M:%S').month

10

In [15]:
# format date and time column

def reformat_datetime(df, date_col, time_col):
    '''
    This function creates a new column with date and time info, and generates separate columns to capture month, day of 
    month, and hour of day. All these columns are added to the original dataframe.
    '''
    from datetime import datetime
    
    df['datetime'] = ''
    df['month'] = ''
    df['day'] = ''
    df['hour'] = ''
    df[date_col] = df[date_col].fillna('')
    df[time_col] = df[time_col].fillna('')
    
    for i in range(0, len(df)):
        if df['date'][i]:
            if df['time'][i]:
                df['datetime'][i] = None
                df['datetime'][i] = datetime.strptime((df['date'][i] + "-" + df['time'][i]), '%m/%d/%Y-%H:%M:%S')
                df['month'][i] = df['datetime'][i].month
                df['day'][i] = df['datetime'][i].day
                df['hour'][i] = df['datetime'][i].hour
            else:
                df['datetime'][i] = None
                df['datetime'][i] = datetime.strptime((df['date'][i] + "-00:00:00"), '%m/%d/%Y-%H:%M:%S')
                df['month'][i] = df['datetime'][i].month
                df['day'][i] = df['datetime'][i].day
                df['hour'][i] = df['datetime'][i].hour
        else: 
            df['datetime'][i] = None
            df['month'][i] = None
            df['day'][i] = None
            df['hour'][i] = None
    
    df['datetime'] = pd.to_datetime(df['datetime'])
    return df

In [16]:
data2 = reformat_datetime(data1, 'date', 'time')
data2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'][i] = None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'][i] = datetime.strptime((df['date'][i] + "-" + df['time'][i]), '%m/%d/%Y-%H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'][i] = df['datetime'][i].month
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['

Unnamed: 0,objectid,year,date,time,race,sex,age,wound,officer_involved,offender_injured,...,dist,inside,outside,fatal,lat,lng,datetime,month,day,hour
0,1525444,2021,10/30/2021,23:12:00,B,M,28.0,Multiple,False,False,...,12.0,False,True,False,28.419715,-81.581379,2021-10-30 23:12:00,10,30,23
1,1525466,2021,11/3/2021,0:14:00,W,M,36.0,Back,False,False,...,24.0,False,True,True,39.997162,-75.103619,2021-11-03 00:14:00,11,3,0
2,1525467,2021,11/3/2021,2:39:00,A,M,52.0,Multiple,False,False,...,1.0,False,True,False,39.924141,-75.184806,2021-11-03 02:39:00,11,3,2
3,1525468,2021,11/3/2021,2:39:00,A,F,47.0,Multiple,False,False,...,1.0,False,True,False,39.924141,-75.184806,2021-11-03 02:39:00,11,3,2
4,1525469,2021,11/3/2021,2:51:00,B,M,22.0,Head,False,False,...,24.0,False,True,True,39.991840,-75.126196,2021-11-03 02:51:00,11,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,1515213,2018,3/20/2018,6:22:00,A,M,53.0,torso,False,False,...,12.0,False,True,True,39.927305,-75.237622,2018-03-20 06:22:00,3,20,6
10907,1515214,2018,3/20/2018,6:22:00,A,M,50.0,head,False,False,...,12.0,False,True,True,39.927305,-75.237622,2018-03-20 06:22:00,3,20,6
10908,1515215,2018,2/11/2018,14:18:00,B,M,23.0,multi,False,False,...,12.0,False,True,True,39.911314,-75.246248,2018-02-11 14:18:00,2,11,14
10909,1515216,2018,2/11/2018,14:18:00,B,M,24.0,buttocks,False,False,...,12.0,False,True,False,39.911314,-75.246248,2018-02-11 14:18:00,2,11,14


In [17]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10911 entries, 0 to 10910
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   objectid           10911 non-null  int64         
 1   year               10911 non-null  int64         
 2   date               10911 non-null  object        
 3   time               10911 non-null  object        
 4   race               10823 non-null  object        
 5   sex                10911 non-null  object        
 6   age                10763 non-null  float64       
 7   wound              10803 non-null  object        
 8   officer_involved   10911 non-null  bool          
 9   offender_injured   10911 non-null  bool          
 10  offender_deceased  10911 non-null  bool          
 11  location           10911 non-null  object        
 12  latino             10823 non-null  float64       
 13  point_x            10903 non-null  float64       
 14  point_

In [18]:
# check point_x with lng
# check point_y with lat

def verify_lat_lng(df, lat_col1, lat_col2, lng_col1, lng_col2):
    '''
    This function calculates the differences between the two sets of longitude and latitude columns and return a
    data frame with entries of non-zero difference for longitude and latitude respectively. 
    '''

    df['lng_diff'] = df[lng_col1] - df[lng_col2]
    df_lng_diff = df[[lng_col1, lng_col2, 'lng_diff']][df['lng_diff'] != 0]
    
    df['lat_diff'] = df[lat_col1] - df[lat_col2]
    df_lat_diff = df[[lat_col1, lat_col2, 'lat_diff']][df['lat_diff'] != 0]
    
    return [df_lng_diff, df_lat_diff]

In [19]:
df_lng_diff, df_lat_diff= verify_lat_lng(data2,'lat', 'point_y','lng', 'point_x')

In [20]:
# check the size of difference in longitude -- acceptable
df_lng_diff['lng_diff'].describe()

count    5.240000e+02
mean    -1.278601e-05
std      2.922426e-04
min     -6.689550e-03
25%     -9.999994e-09
50%      9.999994e-09
75%      1.000001e-08
max      4.356000e-05
Name: lng_diff, dtype: float64

In [21]:
# check the size of difference in longitude -- acceptable
df_lat_diff['lat_diff'].describe()

count    5.800000e+02
mean    -5.247909e-05
std      1.270420e-03
min     -3.059540e-02
25%     -1.000000e-08
50%     -1.000000e-08
75%     -1.000000e-08
max      3.918000e-05
Name: lat_diff, dtype: float64

With the above info, we can determine whether we want to update any info. All columns are retained for now for users' choice. 

## Clean Up Wound Column

Understand how the wound column looks like in the raw data.

In [22]:
data_raw.wound.unique()

array(['Multiple', 'Back', 'Head', 'Leg', 'Chest', 'Multiple/Head',
       'Buttocks', 'Neck', 'Shoulder', 'Abdomen', 'Ankle', 'Arm', 'Hip',
       'Wrist', 'Foot', 'leg', 'side', 'buttocks', 'feet', 'foot',
       'head-m', 'multi', 'chest', 'hand', 'arm', 'stomach', 'back',
       'head', 'shoulder', 'torso', 'neck', 'abdomen', 'hip', 'legs',
       'buttock', 'multi/head', 'groin', nan, 'ankle', 'thigh', 'knee',
       'calf', 'mutli', 'BACK', 'mult', 'head-md', 'leg/multi', 'shou',
       'unk', 'elbow', 'multii', 'ear', 'multi/ hea', 'arms',
       'head/multi', 'finger', 'wrist', 'mullti', 'abdom', 'Unknown',
       'Hand', 'Pelvis', 'Stomach', 'ARM', 'FOOT', 'Groin', 'LEG',
       'ABDOMEN', 'MULTI', 'KNNES', 'head/neck', 'knees', 'NOSE',
       'mult/headi', 'HEAD', 'NECK', 'CHEST', 'THIGH', 'SHOULDER',
       'STOMACH', 'forearm', 'chect', 'butt', 'HAND', 'armpit', 'toe',
       'thumb', 'shoulders', 'ANKLE', 'thighs', 'face', 'mutli/head',
       'cheek', 'eye', 'chest/back',

In [23]:
len(data_raw.wound.unique())

125

In [24]:
len(data_raw[data_raw.wound.isna()])

108

In [25]:
def col_lower_case_and_fillna(column_name):
    '''
    This function returns lower case and fill out null value with empty string for a column
    '''
    
    new_col = column_name.str.lower()
    new_col = new_col.fillna('')
    return new_col

In [26]:
data2['wound_updated'] = col_lower_case_and_fillna(data2.wound)

In [27]:
data2.wound_updated.unique()

array(['multiple', 'back', 'head', 'leg', 'chest', 'multiple/head',
       'buttocks', 'neck', 'shoulder', 'abdomen', 'ankle', 'arm', 'hip',
       'wrist', 'foot', 'side', 'feet', 'head-m', 'multi', 'hand',
       'stomach', 'torso', 'legs', 'buttock', 'multi/head', 'groin', '',
       'thigh', 'knee', 'calf', 'mutli', 'mult', 'head-md', 'leg/multi',
       'shou', 'unk', 'elbow', 'multii', 'ear', 'multi/ hea', 'arms',
       'head/multi', 'finger', 'mullti', 'abdom', 'unknown', 'pelvis',
       'knnes', 'head/neck', 'knees', 'nose', 'mult/headi', 'forearm',
       'chect', 'butt', 'armpit', 'toe', 'thumb', 'shoulders', 'thighs',
       'face', 'mutli/head', 'cheek', 'eye', 'chest/back', 'head/back',
       'multli', 'shoul', 'should', 'multi/arm', 'stom', 'multi leg',
       'shin', 'abdome', 'shouldeer', 'multi/face', 'mukti', 'cheat',
       'multi tors', 'waist', 'back/head', 'ribs', 'temple', 'throat',
       'leg/buttoc', 'head/mullt', 'body', 'flank', 'head/chest',
       'shou

In [28]:
# Only necessary if textblob is not installed
# !pip install textblob

In [29]:
# check whether there is still na in the newly created wound_updated column
len(data2[data2.wound_updated.isna()])

0

In [30]:
# https://stackabuse.com/spelling-correction-in-python-with-textblob/

# To understand how textblob works when using built in training data
from textblob import TextBlob

textBlb = TextBlob('multi/ hea')            # Making our first textblob
textCorrected = textBlb.correct()   # Correcting the text
print(textCorrected)

multi/ he


In [31]:
# Acknowldgement:
    # https://stackabuse.com/spelling-correction-in-python-with-textblob/

def correct_wound(source_file, str_to_correct):
    """ 
    This function trains textblob using a source file carrying known common bodyparts (user can modify as necessary)
    and correct the wound (string).
    The corrected spelling is then saved as a string.
    """

    from textblob.en import Spelling   
    from textblob import TextBlob
    import re

    ### Train textblob     
    textToLower = ""
    
    with open(source_file,"r") as f1:           # Open our source file for training
        text = f1.read()                                  # Read the file                 
        textToLower = text.lower()                        # Lower all the capital letters
    
    words = re.findall("[a-z]+", textToLower)             # Find all the words and place them into a list    
    oneString = " ".join(words)                           # Join them into one string
    
    pathToFile = "train.txt"                              # The path we want to store our stats file at
    spelling = Spelling(path = pathToFile)                # Connect the path to the Spelling object
    spelling.train(oneString, pathToFile)                 # Train
    
    ### Correct spelling
    pathToFile = "train.txt" 
    spelling = Spelling(path = pathToFile)

    updated_word = ''
    updated_word = spelling.suggest(str_to_correct)[0][0] # Spell checking word by word
    
    return updated_word

In [32]:
# test out function
updated_word = correct_wound('bodyparts.txt', data2.iloc[17,7])
updated_word

'Buttock'

In [33]:
def standardize_wound(df, wound_col, source_file):
    
    ''' 
    This function takes the wound column that is already in lower case and correct the mispelling using correct_wound
    function built previously. Morevoer, it cleans up a few common alternative spellings (multi, butt, unk) and the
    misspelling regarding shoulder that is not fixed previously.
    
    The corrected strings are saved as a new column in the input dataframe. 
    '''
    
    df['wound_rec'] = None
    import re
    
    ### Correct lower case wounds
    for i, wound in enumerate(df[wound_col]):
        
        if wound != None:
            
            ### treat those with '/' differently
            if re.search('/', wound):
                wound_list = re.split('[/]', wound)
                updated_wound = set() # set as set
                for part in wound_list:
                    updated_wound.add(correct_wound(source_file, part))
                #df['wound_rec'][i] = updated_wound
                df['wound_rec'][i] = ",".join(updated_wound)
                
            else:
                updated_wound = correct_wound(source_file, wound)
                df['wound_rec'][i] = updated_wound
        else: 
           # print('this is nan:', i)
            df['wound_rec'][i] = None
    
    ### clean up a few more spellings: multi, butt, shoul, unk
    for j, wound in enumerate(df['wound_rec']):
        if re.search('shou[.]*', wound):
            df['wound_rec'][j] = 'shoulder'
        if re.search('^butt[.]*', wound):
            df['wound_rec'][j] = 'buttock'
        if re.search('multiple', wound):
            pass
        else:
            if re.search('multi', wound):
                df['wound_rec'][j]= re.sub("multi", 'multiple', wound)
        if re.search('unk', wound):
            df['wound_rec'][j]= 'unknown'

    df = df.drop(columns = ['wound','wound_updated'])
    
    return df

In [34]:
data3 = standardize_wound(data2, 'wound_updated', 'bodyparts.txt')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wound_rec'][i] = updated_wound
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wound_rec'][i] = ",".join(updated_wound)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wound_rec'][j] = 'buttock'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wound_rec'][j] = 'shoulder'
A value is trying to be s

In [35]:
data3.wound_rec.unique()

array(['multiple', 'back', 'head', 'leg', 'chest', 'head,multiple',
       'buttock', 'neck', 'shoulder', 'abdomen', 'ankle', 'arm', 'hip',
       'wrist', 'foot', 'side', 'hand', 'stomach', 'torso', 'groin', '',
       'thigh', 'knee', 'calf', 'head-md', 'leg,multiple', 'shin',
       'unknown', 'elbow', 'ear', 'finger', 'pelvis', 'head,neck', 'nose',
       'forearm', 'armpit', 'toe', 'thumb', 'face', 'cheek', 'eye',
       'back,chest', 'head,back', 'arm,multiple', 'face,multiple',
       'multiple tors', 'waist', 'rib', 'temple', 'throat', 'body',
       'flank', 'head,chest', 'testicle'], dtype=object)

In [36]:
len(data3.wound_rec.unique())

54

In [37]:
def format_wound(df, wound_col):
    '''
    This function standardizes the format of wound, mainly for those containing more than one bodyparts. 
    E.g., "head,multiple", "multiple,head" are the same and hence are formatted to show the same value.
    The results are saved as a new column in the input dataframe. 
    '''
    
    import re
    
    ### separate words by comma
    df['wound_rev'] = None
    for i, wound in enumerate(df[wound_col]):
        set_of_wound = set()
        if re.search(',|/', wound): 
            set_of_wound = set(wound.split('[,/]'))
            df['wound_rev'][i] = ",".join(set_of_wound)
        else:
            df['wound_rev'][i] = wound
    
    df = df.drop(columns = ['wound_rec'])
    return df

In [38]:
data4 = format_wound(data3, 'wound_rec')
data4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wound_rev'][i] = wound
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['wound_rev'][i] = ",".join(set_of_wound)


Unnamed: 0,objectid,year,date,time,race,sex,age,officer_involved,offender_injured,offender_deceased,...,fatal,lat,lng,datetime,month,day,hour,lng_diff,lat_diff,wound_rev
0,1525444,2021,10/30/2021,23:12:00,B,M,28.0,False,False,False,...,False,28.419715,-81.581379,2021-10-30 23:12:00,10,30,23,0.0,0.0,multiple
1,1525466,2021,11/3/2021,0:14:00,W,M,36.0,False,False,False,...,True,39.997162,-75.103619,2021-11-03 00:14:00,11,3,0,0.0,0.0,back
2,1525467,2021,11/3/2021,2:39:00,A,M,52.0,False,False,False,...,False,39.924141,-75.184806,2021-11-03 02:39:00,11,3,2,0.0,0.0,multiple
3,1525468,2021,11/3/2021,2:39:00,A,F,47.0,False,False,False,...,False,39.924141,-75.184806,2021-11-03 02:39:00,11,3,2,0.0,0.0,multiple
4,1525469,2021,11/3/2021,2:51:00,B,M,22.0,False,False,False,...,True,39.991840,-75.126196,2021-11-03 02:51:00,11,3,2,0.0,0.0,head
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,1515213,2018,3/20/2018,6:22:00,A,M,53.0,False,False,False,...,True,39.927305,-75.237622,2018-03-20 06:22:00,3,20,6,0.0,0.0,torso
10907,1515214,2018,3/20/2018,6:22:00,A,M,50.0,False,False,False,...,True,39.927305,-75.237622,2018-03-20 06:22:00,3,20,6,0.0,0.0,head
10908,1515215,2018,2/11/2018,14:18:00,B,M,23.0,False,False,False,...,True,39.911314,-75.246248,2018-02-11 14:18:00,2,11,14,0.0,0.0,multiple
10909,1515216,2018,2/11/2018,14:18:00,B,M,24.0,False,False,False,...,False,39.911314,-75.246248,2018-02-11 14:18:00,2,11,14,0.0,0.0,buttock


In [39]:
data4['wound_rev'].unique()

array(['multiple', 'back', 'head', 'leg', 'chest', 'head,multiple',
       'buttock', 'neck', 'shoulder', 'abdomen', 'ankle', 'arm', 'hip',
       'wrist', 'foot', 'side', 'hand', 'stomach', 'torso', 'groin', '',
       'thigh', 'knee', 'calf', 'head-md', 'leg,multiple', 'shin',
       'unknown', 'elbow', 'ear', 'finger', 'pelvis', 'head,neck', 'nose',
       'forearm', 'armpit', 'toe', 'thumb', 'face', 'cheek', 'eye',
       'back,chest', 'head,back', 'arm,multiple', 'face,multiple',
       'multiple tors', 'waist', 'rib', 'temple', 'throat', 'body',
       'flank', 'head,chest', 'testicle'], dtype=object)

In [40]:
len(data4['wound_rev'].unique())

54

In [41]:
data4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10911 entries, 0 to 10910
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   objectid           10911 non-null  int64         
 1   year               10911 non-null  int64         
 2   date               10911 non-null  object        
 3   time               10911 non-null  object        
 4   race               10823 non-null  object        
 5   sex                10911 non-null  object        
 6   age                10763 non-null  float64       
 7   officer_involved   10911 non-null  bool          
 8   offender_injured   10911 non-null  bool          
 9   offender_deceased  10911 non-null  bool          
 10  location           10911 non-null  object        
 11  latino             10823 non-null  float64       
 12  point_x            10903 non-null  float64       
 13  point_y            10903 non-null  float64       
 14  dist  

In [42]:
data4.to_csv('intermediate_shooting_data.csv', index = False)

PermissionError: [Errno 13] Permission denied: 'intermediate_shooting_data.csv'