## Gun Violence Archive Congressional District Summary Scrape
This script scrapes congressional-district-level summary reports for 2014 - 2016 from the Gun Violence Archive (http://www.gunviolencearchive.org/). These reports contain the following variables for each congressional district for each year:

'Total Number of Incidents': Continuous<br>
'Number of Deaths': Continuous<br>
'Number of Injuries': Continuous<br>
'Number of Children (age 0-11)Killed or Injured': Continuous<br>
'Number of Teens (age 12-17)Killed or Injured': Continuous<br>
'Mass Shooting': Continuous<br>
'Officer Involved IncidentOfficer Shot or Killed': Continuous<br>
'Officer Involved IncidentSubject-Suspect Shot or Killed': Continuous<br>
'Home Invasion': Continuous<br>
'Defensive Use': Continuous<br>
'Accidental Shooting': Continuous<br>
'State': Categorical<br>
'Year': Categorical<br>
'District': Categorical<br>


The information is then written to a CSV file. This is the long version: the 11 factors are columns, with one entry for each congressional district for each year.

In [6]:
#Import Packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import time
from IPython import display

In [10]:
# Postal abbreviation -> full name (50 states plus DC)
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
}

# Lower-case abbreviations, in the same order as the keys of `states`
states_abrev = [abbr.lower() for abbr in states]

# Report years to collect
years = ['2014', '2015', '2016']

# States with one congressional district
at_large = ['mt', 'nd', 'sd', 'wy', 'vt', 'de', 'ak']

# Factor names recorded for every district
factors = [
    'Total Number of Incidents',
    'Number of Deaths',
    'Number of Injuries',
    'Number of Children (age 0-11)Killed or Injured',
    'Number of Teens (age 12-17)Killed or Injured',
    'Mass Shooting',
    'Officer Involved IncidentOfficer Shot or Killed',
    'Officer Involved IncidentSubject-Suspect Shot or Killed',
    'Home Invasion',
    'Defensive Use',
    'Accidental Shooting',
]

# URL templates: `url` takes a state, `url2` takes a state and a district
url = "http://www.gunviolencearchive.org/congress/{}"
url2 = "http://www.gunviolencearchive.org/congress/{}/{}"

# Data store keyed year -> state abbreviation. Each year owns a separate
# state dict whose values start out as None.
shooting_stats = {year: dict.fromkeys(states_abrev) for year in years}

#Create fully nested dict year>state>district>factors
#State Level
for s in states_abrev:
    display.clear_output(wait=True)
    display.display('Working on State: {}'.format(s))

    # A timeout keeps one stalled connection from hanging the whole run
    result = requests.get(url.format(s), timeout=30)
    soup = BeautifulSoup(result.text,"lxml")
    dropdowns = soup.find_all("select",{'class':'fancy district-dropdown form-select'})
    if not dropdowns:
        # Same exception type the bare [0] lookup raised, but naming the
        # state and HTTP status so the failure can be diagnosed
        raise IndexError(
            'No district dropdown found for state {!r} (HTTP status {})'.format(
                s, result.status_code))

    # Number of Districts: the option labels of the first matching dropdown
    options = [opt.get_text() for opt in dropdowns[0].find_all('option')]
    # The first label is blanked; '' is later used as the district part of
    # the URL and relabelled '00' (presumably the dropdown's placeholder /
    # state-wide entry -- verify against the site)
    options[0] = ''

    #Years
    for y in years:
        #Factors: every (year, district) gets its own dict of factor -> None
        shooting_stats[y][s] = {o: dict.fromkeys(factors) for o in options}
display.clear_output()

In [11]:
#Pull Data For Each Congressional District
for s in states_abrev:
    # Every year was given the same district keys for a state, so the first
    # year's keys stand in for all of them.
    for d in shooting_stats[years[0]][s]:

        display.clear_output(wait=True)
        display.display('Working on State: {}, District: {}'.format(s,d))
        #Pull Site (timeout so one stalled request cannot hang the scrape)
        result = requests.get(url2.format(s,d), timeout=30)

        #Parse
        soup = BeautifulSoup(result.text,"lxml")
        panels = soup.find_all("div",{"class":"small-12 medium-4 columns"})

        # Only the first three matching panels are read; a page with fewer
        # panels simply contributes fewer.
        for panel in panels[:3]:
            try:
                #Pull Data From Table
                texts = [span.get_text() for span in panel.find_all("span")]

                # The first span's text selects the year slot of
                # shooting_stats; after it the spans alternate title, value.
                year = texts[0]
                titles = texts[1::2]
                values = texts[2::2]

                #Remove subscripts from title (a trailing '1' or '2').
                # An empty title raises IndexError here, which skips the
                # whole panel before anything is stored.
                titles = [t[:-1] if t[-1] in ('1', '2') else t for t in titles]

                #Insert data into Dict
                year_stats = shooting_stats[year][s][d]
                for title, value in zip(titles, values):
                    year_stats[title] = value
            except (IndexError, KeyError):
                # Malformed panel, or a panel whose year is not being
                # collected: skip it. Unlike a bare `except`, this lets
                # KeyboardInterrupt and unexpected errors surface.
                continue
display.clear_output(wait=True)

'Working on State: ky, District: 02'

In [12]:
#Save Raw Dict to Pickle as Backup (Saves as unique file each time it is run)
# The first 10 characters of the epoch time form the per-run file suffix
run_stamp = str(time.time())[0:10]
backup_path = 'data//gun_stats_dict{}.pickle'.format(run_stamp)
with open(backup_path, 'wb') as backup_file:
    pickle.dump(shooting_stats, backup_file)

In [22]:
#Create Long Dataframe
# Build one frame per (state, year) and concatenate once at the end. This
# avoids re-copying the growing frame on every iteration and does not depend
# on a hard-coded first year to initialise `master`.
frames = []
for state in states_abrev:
    for y in years:
        # Transposed so districts become rows and factors become columns
        frame = pd.DataFrame(shooting_stats[y][state]).T
        frame["State"] = state
        frame["Year"] = y
        frames.append(frame)
master = pd.concat(frames)

In [23]:
#Fixes the Indices of At-Large States: blank ('') labels become '00'
master.index = [label if label != '' else '00' for label in master.index]

In [24]:
#Create Unique District Names Column
# Pair each row's State with its index label in a single pass, e.g. 'CT 01'
# (the column and index are no longer rebuilt as lists on every iteration)
district = [
    state.upper() + ' ' + label
    for state, label in zip(master['State'], master.index)
]

# The combined name is stored as a column and also becomes the index
master['District'] = district
master.index = district

In [25]:
# Keep only the last two characters of each District value
master['District'] = master['District'].str[-2:].tolist()

In [26]:
# Preview the first rows, transposed so the columns read as rows
master.head().transpose()

Unnamed: 0,CT 00,CT 01,CT 02,CT 03,CT 04
Accidental Shooting,15,2,2,4,2
Defensive Use,11,2,3,2,4
Home Invasion,26,7,5,8,3
Mass Shooting,1,0,0,0,1
Number of Children (age 0-11)Killed or Injured,1,0,0,1,0
Number of Deaths,76,26,3,17,16
Number of Injuries,206,57,11,70,52
Number of Teens (age 12-17)Killed or Injured,19,3,2,9,5
Officer Involved IncidentOfficer Shot or Killed,1,0,1,0,0
Officer Involved IncidentSubject-Suspect Shot or Killed,4,0,1,2,1


In [27]:
#Create Data Frame of at Large States
# Rows are gathered state by state, in the order the states appear in `at_large`
at_large_df = pd.concat([master[master['State'] == abbr] for abbr in at_large])

In [28]:
#Create Dataset without 00 congressional district values
# Drop every '00' row, then append the at-large states' rows back in
numbered_rows = master.loc[master['District'] != '00']
master1 = pd.concat([numbered_rows, at_large_df])

In [50]:
#Uppercase State
master1['State'] = master1['State'].str.upper().tolist()

In [51]:
# Remove Missing Values (PA 02 is not listed in this data)
# Filter with a boolean mask instead of drop(): drop() removes by label, and
# the index labels repeat across years, so it would also delete a district's
# rows for years that do have data.
master2 = master1[master1['Number of Deaths'].notnull()]

In [52]:
#Write to CSV
long_csv_path = 'data//US_Shooting_Stats_by_Congressional_District_2014-2016_long.csv'
master2.to_csv(long_csv_path)

In [53]:
# Display the final frame (the same data that was written to the CSV)
master2

Unnamed: 0,Accidental Shooting,Defensive Use,Home Invasion,Mass Shooting,Number of Children (age 0-11)Killed or Injured,Number of Deaths,Number of Injuries,Number of Teens (age 12-17)Killed or Injured,Officer Involved IncidentOfficer Shot or Killed,Officer Involved IncidentSubject-Suspect Shot or Killed,Total Number of Incidents,State,Year,District
CT 01,2,2,7,0,0,26,57,3,0,0,150,CT,2014,01
CT 02,2,3,5,0,0,3,11,2,1,1,52,CT,2014,02
CT 03,4,2,8,0,1,17,70,9,0,2,203,CT,2014,03
CT 04,2,4,3,1,0,16,52,5,0,1,122,CT,2014,04
CT 05,5,0,2,0,0,11,13,0,0,0,41,CT,2014,05
CT 01,3,4,12,0,0,28,103,4,0,2,232,CT,2015,01
CT 02,6,4,6,0,1,11,18,3,0,2,69,CT,2015,02
CT 03,3,5,10,1,0,24,85,7,0,3,189,CT,2015,03
CT 04,3,2,8,1,1,18,98,9,0,0,187,CT,2015,04
CT 05,3,1,3,0,0,7,25,2,0,0,46,CT,2015,05
