# American Crime and Safety
Data Source: https://github.com/tracybedrosian/Choropleth-2013-Crime-in-America

Kaggle: https://www.kaggle.com/zhenyufan/basic-eda-of-american-crime

## Prepare the Packages and CSV File

In [26]:
import pandas as pd
import seaborn as sns 
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Layout
init_notebook_mode(connected=True)
%matplotlib inline

In [27]:
crime_df = pd.read_csv('crime.csv')
crime_df.head()

Unnamed: 0,State,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,ALABAMA,Abbeville,2645.0,11.0,1.0,1.0,,2.0,7.0,63.0,21.0,39.0,3.0,,,,
1,ALABAMA,Adamsville,4481.0,19.0,1.0,0.0,,7.0,11.0,321.0,58.0,252.0,11.0,,,,
2,ALABAMA,Addison,744.0,1.0,0.0,1.0,,0.0,0.0,25.0,6.0,17.0,2.0,,,,
3,ALABAMA,Alabaster,31170.0,44.0,0.0,2.0,,11.0,31.0,640.0,70.0,544.0,26.0,,,,
4,ALABAMA,Alexander City,14692.0,119.0,2.0,16.0,,12.0,89.0,661.0,121.0,510.0,30.0,,,,


## Data Cleanse and Preparation

### Remove Invalid Variables and NAs' Rows

In [28]:
crime_df.drop(['Rape\r(legacy\rdefinition)2', 'Arson3', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16'], inplace = True, axis = 1)

In [29]:
# According to the count result, I found that city(9292<9302) seems to have na values. Therefore we need to remove City's NAs
crime_df = crime_df[pd.notnull(crime_df['City'])]
print(crime_df.count())

State                                     9292
City                                      9292
Population                                9289
Violent\rcrime                            9288
Murder and\rnonnegligent\rmanslaughter    9292
Rape\r(revised\rdefinition)1              5431
Robbery                                   9292
Aggravated\rassault                       9289
Property\rcrime                           9288
Burglary                                  9290
Larceny-\rtheft                           9290
Motor\rvehicle\rtheft                     9292
dtype: int64


In [30]:
# And then fill some variables' NAs with zero
crime_df[['Violent\rcrime', 'Rape\r(revised\rdefinition)1']] = crime_df[['Violent\rcrime', 'Rape\r(revised\rdefinition)1']].fillna(0)
crime_df[['Aggravated\rassault', 'Property\rcrime', 'Burglary', 'Larceny-\rtheft']] = crime_df[['Aggravated\rassault', 'Property\rcrime', 'Burglary', 'Larceny-\rtheft']].fillna(0)
# And then fill Population with average population
crime_df['Population'] = crime_df['Population'].fillna(crime_df['Population'].mean())
crime_df.count()

State                                     9292
City                                      9292
Population                                9292
Violent\rcrime                            9292
Murder and\rnonnegligent\rmanslaughter    9292
Rape\r(revised\rdefinition)1              9292
Robbery                                   9292
Aggravated\rassault                       9292
Property\rcrime                           9292
Burglary                                  9292
Larceny-\rtheft                           9292
Motor\rvehicle\rtheft                     9292
dtype: int64

### Merge Some Variables and Make Some Calculations

#### Calculate Total Population, Crime, and Crime Rate of Every State 

In [31]:
crime_df['Total Crime'] = crime_df['Violent\rcrime'] + crime_df['Murder and\rnonnegligent\rmanslaughter'] + crime_df['Rape\r(revised\rdefinition)1'] + crime_df['Robbery'] + crime_df['Aggravated\rassault'] + crime_df['Property\rcrime'] + crime_df['Burglary'] + crime_df['Larceny-\rtheft'] + crime_df['Motor\rvehicle\rtheft']
crime_group_df = crime_df.groupby(['State'], as_index=False).sum()[['State', 'Population', 'Total Crime']]
crime_group_df['Crime Rate'] = crime_group_df['Total Crime'] / crime_group_df['Population']
crime_group_df.head()

Unnamed: 0,State,Population,Total Crime,Crime Rate
0,ALABAMA,2392789.0,249605.0,0.104316
1,ALASKA,473660.0,42548.0,0.089828
2,ARIZONA,4867945.0,403654.0,0.082921
3,ARKANSAS,1617882.0,166814.0,0.103106
4,CALIFORNIA,31696400.0,1983102.0,0.062566


#### Give Every State Code for Further EDA

In [32]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

In [33]:
us_state_code = pd.DataFrame.from_dict(us_state_abbrev, orient='index')
us_state_code = us_state_code.reset_index()
us_state_code = us_state_code.rename(columns={'index': 'State', 0: 'Code'})
us_state_code['State'] = us_state_code['State'].str.upper()
us_state_crime_df = crime_group_df.merge(us_state_code, on='State', how='inner')
us_state_crime_df.head()

Unnamed: 0,State,Population,Total Crime,Crime Rate,Code
0,ALABAMA,2392789.0,249605.0,0.104316,AL
1,ALASKA,473660.0,42548.0,0.089828,AK
2,ARIZONA,4867945.0,403654.0,0.082921,AZ
3,ARKANSAS,1617882.0,166814.0,0.103106,AR
4,CALIFORNIA,31696400.0,1983102.0,0.062566,CA


## Exploratory Data Analysis

In [34]:
trace = go.Scatter(
    x=us_state_crime_df['Population'],
    y=us_state_crime_df['Crime Rate'],
    mode='markers',
    text=us_state_crime_df['State'],
    marker=dict(
        size=12,               
        color=us_state_crime_df['Crime Rate'],
        colorscale='Viridis',  
        opacity=0.8
    )
)

data = [trace]
layout = go.Layout(
    showlegend=False,
    title='The USA Crime',
    scene = dict(
        xaxis = dict(title='X:Population'),
        yaxis = dict(title='Y: Crime Rate'),
    ),
    width=800,
    height=600,
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [35]:
data = dict(type = 'choropleth', 
            colorscale = 'Jet', 
            locations = us_state_crime_df['Code'], 
            z = us_state_crime_df['Crime Rate'], 
            locationmode = 'USA-states', 
            text = us_state_crime_df['State'], 
            marker = dict(line = dict(color = 'rgb(255, 255,255)', width = 2)),
            colorbar = {'title':"Crime Rate"}
           )

layout = dict(title = 'The USA Crime Rate',
              geo = dict(scope='usa',
                         showlakes = True,
                         lakecolor = 'rgb(85,173,240)')
             )

choromap = go.Figure(data = [data], layout=layout)

iplot(choromap)

According to these two plots, we can conclude that the crime rate is unrelated with population. And the West, middle, and northeast parts of America are much safer than other parts. The southeast part of America is most unsafe