# Cleaning of Shooters Data
Source: The Violence Project

In [54]:
import pandas as pd
from pathlib import Path

# Ignore warnings (note: this silences every warning category)
import warnings
warnings.simplefilter(action='ignore')

# Local modules
from codes import code_column_names
from data_exploration import dataset_info, find_keyword_in_list

In [55]:
# Read the raw database export into a DataFrame.
csv = Path('raw_data/full_database.csv')
cases_df = pd.read_csv(filepath_or_buffer=csv)

In [56]:
# Find column names with non-conventional characters
# ('\xa0' is the non-breaking space)
search_terms = ['\xa0']

# Every column whose name contains at least one search term. Each column is
# listed once, in the DataFrame's column order, even if several terms match it.
problems = [
    c for c in cases_df.columns
    if any(item in c for item in search_terms)
]

# Print problematic columns
print(problems)

# Remove problematic characters from column names. Every entry of
# search_terms is stripped, so adding a term above is enough to clean it too.
problems_fixed = []
for p in problems:
    for item in search_terms:
        p = p.replace(item, '')
    problems_fixed.append(p)
print(problems_fixed)

# Rename all problematic columns in a single pass
cases_df = cases_df.rename(columns=dict(zip(problems, problems_fixed)))


['Employment Type\xa0', 'Known Prejudices\xa0', 'Motive: Interpersonal Conflict\xa0', 'Motive: Other\xa0', 'Social Media Use\xa0', 'Leakage\xa0', 'Leakage Who\xa0', 'Leakage Specific/Nonspecific\xa0']
['Employment Type', 'Known Prejudices', 'Motive: Interpersonal Conflict', 'Motive: Other', 'Social Media Use', 'Leakage', 'Leakage Who', 'Leakage Specific/Nonspecific']


In [57]:
keyword_1 = 'Specify'
keyword_2 = 'Specified'
keyword_3 = 'Expanded'
# NOTE(review): this keyword contains keyword_2, so the two searches
# presumably return overlapping column names.
keyword_4 = 'Medication Specified'

# Gather the matches of every keyword, then keep only the first occurrence of
# each name so a column matched by several keywords is listed (and dropped)
# once. Assumes find_keyword_in_list returns an iterable of column names.
matches = []
for keyword in (keyword_1, keyword_2, keyword_3, keyword_4):
    matches.extend(find_keyword_in_list(keyword, cases_df.columns))
detailed_column = list(dict.fromkeys(matches))

print(f"The following columns will be dropped:\n{detailed_column}")

# Drop all columns containing detailed information
cases_df = cases_df.drop(columns=detailed_column)

The following columns will be dropped:
['Specify Armed Person', 'Health Issues - Specify', 'Specify Relationship to Other Shooting(s)', 'Specify Pop Culture Connection', 'Specify Other Weapons or Gear', 'School Performance Specified', 'Community Involvement Specified', 'Domestic Abuse Specified', 'Psychiatric Medication Specified', 'Signs of Crisis Expanded', 'Psychiatric Medication Specified']


## Create shooter DataFrame

In [58]:
def missing_data(df):
    """Return the dataset_info rows of `df` that report missing values.

    The summary produced by dataset_info(df) is ordered by its 'missing'
    column, largest first, and only rows with 'missing' > 0 are kept.
    """
    summary = dataset_info(df)
    ranked = summary.sort_values(by='missing', ascending=False)
    has_gaps = ranked['missing'] > 0
    return ranked[has_gaps]

In [59]:
# Columns that identify a case.
id_columns = [
    'Case #',
    'Shooter Last Name',
    'Shooter First Name',
]

# Column groups used to build and inspect the shooter DataFrame.
# Names must match the cleaned column headers exactly.
shooter_background_fields = [
    'Age',
    'Gender',
    'Race',
    'Height',
    'Weight',
    'Immigrant',
    'Sexual Orientation',
    'Religion',
    'Education',
    'School Performance',
    'Birth Order',
    'Number of Siblings',
    'Older Siblings',
    'Younger Siblings',
    'Relationship Status',
    'Children',
    'Employment Status',
    'Employment Type',
    'Military Service',
    'Military Branch',
    'Community Involvement',
]

shooter_crime_fields = [
    'Known to Police or FBI',
    'Criminal Record',
    'Part I Crimes',
    'Part II Crimes',
    'Highest Level of Justice System Involvement',
    'History of Physical Altercations',
    'History of Animal Abuse',
    'History of Domestic Abuse',
    'History of Sexual Offenses',
    'Gang Affiliation',
    'Terror Group Affiliation',
    'Known Hate Group or Chat Room Affiliation',
    'Violent Video Games',
    'Bully',
]

shooter_trauma_fields = [
    'Bullied',
    'Raised by Single Parent',
    'Parental Divorce / Separation',
    'Parental Death in Childhood',
    'Parental Suicide',
    'Childhood Trauma',
    'Physically Abused',
    'Sexually Abused',
    'Emotionally Abused',
    'Neglected',
    'Childhood SES',
    'Mother Violent Treatment',
    'Parental Substance Abuse',
    'Parent Criminal Record',
    'Family Member Incarcerated',
    'Adult Trauma',
]

shooter_crisis_fields = [
    'Recent or Ongoing Stressor',
    'Signs of Being in Crisis',
    'Timeline of Signs of Crisis',
    'Inability to Perform Daily Tasks',
    'Notably Depressed Mood',
    'Unusually Calm or Happy',
    'Rapid Mood Swings',
    'Increased Agitation',
    'Abusive Behavior',
    'Isolation',
    'Losing Touch with Reality',
    'Paranoia',
]

shooter_health_fields = [
    'Suicidality',
    'Prior Hospitalization',
    'Voluntary or Involuntary Hospitalization',
    'Prior Counseling',
    'Voluntary or Mandatory Counseling',
    'Psychiatric Medication',
    'Medication Category',
    'Treatment 6 Months Prior to Shooting',
    'Mental Illness',
    'FASD (Fetal Alcohol Spectrum Disorder)',
    'Known Family Mental Health History',
    'Autism Spectrum',
    'Substance Use',
    'Health Issues',
    'Head Injury / Possible TBI',
]

shooter_only_fields = [
    'Known Prejudices',
    'Interest in Firearms',
    'Firearm Proficiency',
]

In [60]:
# Columns kept for the shooter table: identifiers first, then each field group.
shooter_columns = (
    id_columns
    + shooter_background_fields
    + shooter_crime_fields
    + shooter_trauma_fields
    + shooter_crisis_fields
    + shooter_health_fields
    + shooter_only_fields
)

# Take an explicit copy so the cleaning steps that follow modify an
# independent DataFrame rather than a selection derived from cases_df.
shooter_df = cases_df[shooter_columns].copy()

## Fields

In [61]:
# List the health columns that still contain missing values
missing_data(shooter_df[shooter_health_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
6,Medication Category,object,44,149,23
9,FASD (Fetal Alcohol Spectrum Disorder),float64,188,5,2
3,Prior Counseling,float64,192,1,2
5,Psychiatric Medication,float64,192,1,2
12,Substance Use,object,192,1,9


In [62]:
# Replace by 'NA' (original value in the data)
# NOTE(review): pandas.read_csv treats the text 'NA' as missing by default, so
# these cells presumably load as NaN again when the saved CSV is re-read
# without keep_default_na=False. Confirm with downstream readers.
medication_default = {'Medication Category': 'NA'}
shooter_df = shooter_df.fillna(value=medication_default)

In [63]:
# Replace by 0 = 'No evidence'
# NOTE(review): 'Substance Use' is reported as an object column, so it receives
# the integer 0 next to its existing values. Confirm that is intended.
health_no_evidence = dict.fromkeys(
    [
        'FASD (Fetal Alcohol Spectrum Disorder)',
        'Prior Counseling',
        'Psychiatric Medication',
        'Substance Use',
    ],
    0,
)
shooter_df = shooter_df.fillna(value=health_no_evidence)

In [64]:
# List the crisis columns that still contain missing values
missing_data(shooter_df[shooter_crisis_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
2,Timeline of Signs of Crisis,float64,157,36,4


In [65]:
# Replace by -1 = 'Unknown'
crisis_unknown = {'Timeline of Signs of Crisis': -1}
shooter_df = shooter_df.fillna(value=crisis_unknown)

In [66]:
# List the trauma columns that still contain missing values
missing_data(shooter_df[shooter_trauma_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
10,Childhood SES,float64,138,55,3
0,Bullied,float64,191,2,2
6,Physically Abused,float64,192,1,2


In [67]:
# One fill value per trauma column with gaps
trauma_defaults = {
    # Replace by 0 = 'No evidence'
    'Bullied': 0,
    'Physically Abused': 0,
    # Replace by -1 = 'Unknown'
    'Childhood SES': -1,
}
shooter_df = shooter_df.fillna(value=trauma_defaults)

In [68]:
# List the crime columns that still contain missing values
missing_data(shooter_df[shooter_crime_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
11,Known Hate Group or Chat Room Affiliation,object,192,1,6
12,Violent Video Games,float64,192,1,4
13,Bully,float64,192,1,2


In [69]:
# Replace by 0 = 'No evidence'
# NOTE(review): 'Known Hate Group or Chat Room Affiliation' is reported as an
# object column, so it receives the integer 0. Confirm that is intended.
crime_no_evidence = dict.fromkeys(
    [
        'Bully',
        'Known Hate Group or Chat Room Affiliation',
        'Violent Video Games',
    ],
    0,
)
shooter_df = shooter_df.fillna(value=crime_no_evidence)

In [70]:
# List the background columns that still contain missing values
missing_data(shooter_df[shooter_background_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
4,Weight,float64,46,147,28
19,Military Branch,float64,49,144,6
3,Height,float64,58,135,15
9,School Performance,float64,89,104,3
7,Religion,float64,94,99,5
13,Younger Siblings,float64,105,88,7
12,Older Siblings,float64,107,86,6
10,Birth Order,float64,110,83,5
8,Education,float64,140,53,5
11,Number of Siblings,float64,141,52,11


In [71]:
# One fill value per background column. The sibling columns are handled
# separately afterwards.
background_defaults = {
    # Replace by "Unknown"
    # NOTE(review): these two columns are reported as float64, so a text
    # placeholder presumably turns them into mixed-type object columns,
    # unlike the -1 code used for 'Unknown' elsewhere. Confirm this is wanted.
    'Weight': 'Unknown',
    'Height': 'Unknown',
    # Replace by 0 = 'No evidence'
    'Children': 0,
    'Community Involvement': 0,
    # Replace by 0 = 'None'
    'Religion': 0,
    # Replace by -1 = 'NA'
    'Military Branch': -1,
    # Replace by -1 = 'Unknown'
    'Race': -1,
    'Education': -1,
    'Relationship Status': -1,
    'Employment Status': -1,
    'Employment Type': -1,
    'School Performance': -1,
    'Immigrant': -1,
    'Sexual Orientation': -1,
}
shooter_df = shooter_df.fillna(value=background_defaults)

In [72]:
# Re-check the background columns: only the sibling-related ones should remain
missing_data(shooter_df[shooter_background_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
13,Younger Siblings,float64,105,88,7
12,Older Siblings,float64,107,86,6
10,Birth Order,float64,110,83,5
11,Number of Siblings,float64,141,52,11


In [73]:
# Replace by -1 = 'Unknown'
# Rows without a sibling count get every sibling-related column marked unknown.
count_unknown = shooter_df['Number of Siblings'].isna()
sibling_columns = [
    'Birth Order',
    'Older Siblings',
    'Younger Siblings',
    'Number of Siblings',
]
shooter_df.loc[count_unknown, sibling_columns] = -1

In [74]:
# Re-check the background columns after resolving rows without a sibling count
missing_data(shooter_df[shooter_background_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
13,Younger Siblings,float64,157,36,8
12,Older Siblings,float64,159,34,7
10,Birth Order,float64,162,31,6


In [75]:
# Replace by -1 = 'Unknown'
# Rows without a birth order get the older/younger counts marked unknown too.
order_unknown = shooter_df['Birth Order'].isna()
order_columns = ['Younger Siblings', 'Older Siblings', 'Birth Order']
shooter_df.loc[order_unknown, order_columns] = -1

In [76]:
# Re-check the background columns after resolving rows without a birth order
missing_data(shooter_df[shooter_background_fields])

Unnamed: 0,columns,dtypes,elements,missing,unique
13,Younger Siblings,float64,188,5,8
12,Older Siblings,float64,190,3,7


In [77]:
# Show the birth order of the rows still lacking a 'Younger Siblings' value
shooter_df.loc[shooter_df['Younger Siblings'].isna(), ['Birth Order', 'Younger Siblings']]

Unnamed: 0,Birth Order,Younger Siblings
15,4.0,
97,4.0,
104,2.0,
113,2.0,
166,4.0,


In [78]:
# Show the birth order of the rows still lacking an 'Older Siblings' value
shooter_df.loc[shooter_df['Older Siblings'].isna(), ['Birth Order', 'Older Siblings']]

Unnamed: 0,Birth Order,Older Siblings
15,4.0,
97,4.0,
104,2.0,


In [79]:
# Replace by -1 = 'Unknown'
# Both masks are taken before anything is written. Filling 'Younger Siblings'
# first would leave no NaN for the next mask to find, so 'Older Siblings'
# would never be reset on rows where only the younger count is missing.
younger_unknown = shooter_df['Younger Siblings'].isna()
older_unknown = shooter_df['Older Siblings'].isna()

# A row with either count missing has both counts marked unknown.
either_unknown = younger_unknown | older_unknown
shooter_df.loc[either_unknown, ['Younger Siblings', 'Older Siblings']] = -1

In [80]:
# List every column of shooter_df that still contains missing values
missing_data(shooter_df)

Unnamed: 0,columns,dtypes,elements,missing,unique
83,Firearm Proficiency,float64,191,2,4
82,Interest in Firearms,float64,192,1,2
81,Known Prejudices,object,192,1,14


In [81]:
# Show the shooter-only fields for the rows lacking a 'Firearm Proficiency' value
shooter_df.loc[
    shooter_df['Firearm Proficiency'].isna(),
    ['Firearm Proficiency', 'Interest in Firearms', 'Known Prejudices'],
]

Unnamed: 0,Firearm Proficiency,Interest in Firearms,Known Prejudices
185,,,0.0
186,,1.0,


In [82]:
# Replace by 0 = 'no evidence'
for no_evidence_column in ('Interest in Firearms', 'Known Prejudices'):
    unset_rows = shooter_df[no_evidence_column].isna()
    shooter_df.loc[unset_rows, no_evidence_column] = 0

In [83]:
# Same rows again, to check the two fills applied just before
shooter_df.loc[
    shooter_df['Firearm Proficiency'].isna(),
    ['Firearm Proficiency', 'Interest in Firearms', 'Known Prejudices'],
]

Unnamed: 0,Firearm Proficiency,Interest in Firearms,Known Prejudices
185,,0.0,0
186,,1.0,0


In [84]:
# Replace by -1 = 'unknown'
proficiency_unknown = shooter_df['Firearm Proficiency'].isna()
shooter_df.loc[proficiency_unknown, 'Firearm Proficiency'] = -1

In [85]:
# Final per-column overview of the cleaned shooter DataFrame
dataset_info(shooter_df)

Unnamed: 0,columns,dtypes,elements,missing,unique
0,Case #,int64,193,0,193
1,Shooter Last Name,object,193,0,181
2,Shooter First Name,object,193,0,145
3,Age,int64,193,0,50
4,Gender,int64,193,0,4
...,...,...,...,...,...
79,Health Issues,int64,193,0,2
80,Head Injury / Possible TBI,int64,193,0,2
81,Known Prejudices,object,193,0,15
82,Interest in Firearms,float64,193,0,2


## Save DataFrame as a CSV file

In [86]:
csv_out = Path('clean_data/clean_shooters.csv')

# Create the output folder first: to_csv does not create missing directories
# and fails when the target folder is absent.
csv_out.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned table without the positional index column
shooter_df.to_csv(csv_out, index=False)