# Data Analysis

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

## Methods

In [2]:
def getStatsForColumn(column_name, data=None):
    """Print the sorted unique values of a column and how many there are.

    Parameters
    ----------
    column_name : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``crime_data`` frame is used, so existing one-argument calls keep
        working.

    Notes
    -----
    The values are passed through ``np.sort``, which needs mutually
    comparable elements; a string column that still contains NaN is
    expected to fail there, so fill or drop nulls first.
    """
    if data is None:
        # Looked up at call time, so the helper can be defined before the data is loaded.
        data = crime_data
    # Compute the distinct values once and reuse them for both the listing and the count.
    unique_values = data[column_name].unique()
    print('Unique values: %s \nUnique counts: %d.' %
          (np.sort(unique_values), unique_values.shape[0]))

In [3]:
def identifyNullValues(data):
    """Count the missing values in every column that has at least one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        Maps column name to its number of null entries, in column order.
        Columns without nulls are omitted, so a fully populated frame
        gives ``{}``.
    """
    # One pass over the frame: per-column null totals, indexed by column name.
    null_counts = data.isnull().sum()
    # int() turns the NumPy scalar into a plain int so the dict prints as
    # {'col': 1} regardless of how the installed NumPy formats its scalars.
    return {column: int(count) for column, count in null_counts.items() if count > 0}

## Data

### Crime Data

In [4]:
# Load the crime records into a DataFrame; the path is relative to the
# notebook's working directory.
crime_data = pd.read_csv('data/crime_data.csv')

In [5]:
crime_data.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160


In [6]:
# DataFrame.shape is already a (rows, columns) pair, so it fills both placeholders directly.
print('Rows: %d, Columns: %d' % crime_data.shape)

Rows: 2188068, Columns: 13


In [7]:
print('Null values: %s.' % identifyNullValues(crime_data))

Null values: {'PdDistrict': 1}.


###### Incident Number

In [8]:
getStatsForColumn('IncidntNum')

Unique values: [     3979     10128     10736 ... 991549731 991564488 991582377] 
Unique counts: 1726468.


###### Category

In [9]:
getStatsForColumn('Category')

Unique values: ['ARSON' 'ASSAULT' 'BAD CHECKS' 'BRIBERY' 'BURGLARY' 'DISORDERLY CONDUCT'
 'DRIVING UNDER THE INFLUENCE' 'DRUG/NARCOTIC' 'DRUNKENNESS'
 'EMBEZZLEMENT' 'EXTORTION' 'FAMILY OFFENSES' 'FORGERY/COUNTERFEITING'
 'FRAUD' 'GAMBLING' 'KIDNAPPING' 'LARCENY/THEFT' 'LIQUOR LAWS' 'LOITERING'
 'MISSING PERSON' 'NON-CRIMINAL' 'OTHER OFFENSES'
 'PORNOGRAPHY/OBSCENE MAT' 'PROSTITUTION' 'RECOVERED VEHICLE' 'ROBBERY'
 'RUNAWAY' 'SECONDARY CODES' 'SEX OFFENSES, FORCIBLE'
 'SEX OFFENSES, NON FORCIBLE' 'STOLEN PROPERTY' 'SUICIDE' 'SUSPICIOUS OCC'
 'TREA' 'TRESPASS' 'VANDALISM' 'VEHICLE THEFT' 'WARRANTS' 'WEAPON LAWS'] 
Unique counts: 39.


###### Description

In [10]:
getStatsForColumn('Descript')

Unique values: ['ABANDONMENT OF CHILD' 'ABORTION'
 'ACCESS CARD INFORMATION, PUBLICATION OF'
 'ACCESS CARD INFORMATION, THEFT OF' 'ACCIDENTAL BURNS'
 'ACCIDENTAL LACERATIONS' 'ACCIDENTAL SHOOTING'
 'ACTS AGAINST PUBLIC TRANSIT' 'ADVERTISING DISTRIBUTORS PERMIT VIOLATION'
 'AEROSOL CONTAINER; SALE, PURCHASE OR POSSESSION OF'
 'AFFIXING ADVERTISMENTS TO POLES'
 'AGGRAVATED ASSAULT OF POLICE OFFICER, SNIPING'
 'AGGRAVATED ASSAULT OF POLICE OFFICER,BODILY FORCE'
 'AGGRAVATED ASSAULT ON POLICE OFFICER WITH A GUN'
 'AGGRAVATED ASSAULT ON POLICE OFFICER WITH A KNIFE'
 'AGGRAVATED ASSAULT WITH A DEADLY WEAPON' 'AGGRAVATED ASSAULT WITH A GUN'
 'AGGRAVATED ASSAULT WITH A KNIFE' 'AGGRAVATED ASSAULT WITH BODILY FORCE'
 'AGGRESSIVE SOLICITING' 'AID OR HARBOR FELON' 'AIDED CASE'
 'AIDED CASE -PROPERTY FOR DESTRUCTION' 'AIDED CASE, DOG BITE'
 'AIDED CASE, INJURED PERSON' 'AIDED CASE, MENTAL DISTURBED'
 'AIDED CASE, SICK PERSON' 'ALCOHOLIC BEVERAGE, PROCURING SALE OF'
 'AMMUNITION, POSS. BY PROHIBITED

###### Day of Week

In [11]:
getStatsForColumn('DayOfWeek')

Unique values: ['Friday' 'Monday' 'Saturday' 'Sunday' 'Thursday' 'Tuesday' 'Wednesday'] 
Unique counts: 7.


###### Date

In [12]:
getStatsForColumn('Date')

Unique values: ['01/01/2003' '01/01/2004' '01/01/2005' ... '12/31/2015' '12/31/2016'
 '12/31/2017'] 
Unique counts: 5560.


###### Time

In [13]:
getStatsForColumn('Time')

Unique values: ['00:01' '00:02' '00:03' ... '23:57' '23:58' '23:59'] 
Unique counts: 1439.


###### Police Department District

In [14]:
# PdDistrict is the only column reported with a missing value, so fill that
# column alone. A frame-wide replace of NaN would also turn any NaN in the
# numeric columns (e.g. X, Y) into the string 'N/A' and change their dtype.
crime_data = crime_data.fillna({'PdDistrict': 'N/A'})
getStatsForColumn('PdDistrict')

Unique values: ['BAYVIEW' 'CENTRAL' 'INGLESIDE' 'MISSION' 'N/A' 'NORTHERN' 'PARK'
 'RICHMOND' 'SOUTHERN' 'TARAVAL' 'TENDERLOIN'] 
Unique counts: 11.


###### Resolution

In [15]:
getStatsForColumn('Resolution')

Unique values: ['ARREST, BOOKED' 'ARREST, CITED' 'CLEARED-CONTACT JUVENILE FOR MORE INFO'
 'COMPLAINANT REFUSES TO PROSECUTE'
 'DISTRICT ATTORNEY REFUSES TO PROSECUTE' 'EXCEPTIONAL CLEARANCE'
 'JUVENILE ADMONISHED' 'JUVENILE BOOKED' 'JUVENILE CITED'
 'JUVENILE DIVERTED' 'LOCATED' 'NONE' 'NOT PROSECUTED'
 'PROSECUTED BY OUTSIDE AGENCY' 'PROSECUTED FOR LESSER OFFENSE'
 'PSYCHOPATHIC CASE' 'UNFOUNDED'] 
Unique counts: 17.


###### Address

In [16]:
getStatsForColumn('Address')

Unique values: ['0 Block of  HARRISON ST' '0 Block of 10TH AV' '0 Block of 10TH ST' ...
 'ZOE ST / BRYANT ST' 'ZOE ST / FREELON ST' 'ZOE ST / WELSH ST'] 
Unique counts: 25130.


###### X (Longitude)

In [17]:
getStatsForColumn('X')

Unique values: [-122.51364206 -122.51364206 -122.51364206 ... -122.36493749 -122.3647507
 -120.5       ] 
Unique counts: 60256.


###### Y (Latitude)

In [18]:
getStatsForColumn('Y')

Unique values: [37.70787902 37.70791996 37.7079219  ... 37.81997549 37.82062084
 90.        ] 
Unique counts: 58337.


###### Location

In [19]:
getStatsForColumn('Location')

Unique values: ['(37.7078790224135, -122.463626254961)'
 '(37.7079199575616, -122.46092149191)'
 '(37.7079219034586, -122.428716681874)' ...
 '(37.81997549229705, -122.37427517670966)'
 '(37.8206208380702, -122.364750704393)' '(90, -120.5)'] 
Unique counts: 61054.


###### Police Department ID

In [20]:
getStatsForColumn('PdId')

Unique values: [     397963010     1012863010     1073663010 ... 99154973163010
 99156448863010 99158237763010] 
Unique counts: 2188068.


### City Facilities Data