# Atlanta Crime Report: 2009 - 2022

### Import Libraries

In [1]:
import pandas as pd
import altair as alt
import os
from csv_parser import csv_columnNames_to_rows
# import numpy as np
# import geopandas as gpd

### Specify Directory

In [2]:
#Specify Directory
dirname = 'COBRA-Data'

### View column names of each dataFrame
Wrote a script to create a dataFrame out of the column names for each CSV file in a given folder.  
https://www.linkedin.com/pulse/csv-column-name-parser-brandon-wilson/

In [3]:
csv_columnNames_to_rows(dirname)

Importing: ['COBRA-2009-2019.csv', 'COBRA-2020(NEW RMS 9-30 12-31).csv', 'COBRA-2020-OldRMS-09292020.csv', 'COBRA-2021.csv', 'COBRA-2022.csv'] 
...
loading: COBRA-Data/COBRA-2009-2019.csv
loading: COBRA-Data/COBRA-2020(NEW RMS 9-30 12-31).csv
loading: COBRA-Data/COBRA-2020-OldRMS-09292020.csv
loading: COBRA-Data/COBRA-2021.csv
loading: COBRA-Data/COBRA-2022.csv
... 
Success


Unnamed: 0,COBRA-2009-2019.csv,COBRA-2020(NEW RMS 9-30 12-31).csv,COBRA-2020-OldRMS-09292020.csv,COBRA-2021.csv,COBRA-2022.csv
0,Report Number,offense_id,offense_id,offense_id,offense_id
1,Report Date,rpt_date,rpt_date,rpt_date,rpt_date
2,Occur Date,occur_date,occur_date,occur_date,occur_date
3,Occur Time,occur_time,occur_time,occur_day,occur_day
4,Possible Date,poss_date,poss_date,occur_day_num,occur_day_num
5,Possible Time,poss_time,poss_time,occur_time,occur_time
6,Beat,beat,beat,poss_date,poss_date
7,Apartment Office Prefix,apt_office_prefix,apartment_office_prefix,poss_time,poss_time
8,Apartment Number,apt_office_num,apartment_number,beat,beat
9,Location,location,location,zone,zone


## Prepare data
I am preparing two seperate tables.  An aggregate table of 2009-2022 and a small table with specialty data from 2021-2022.  
I also am ensuring type is set to date as necessary.

In [4]:
dfs = []
c = 0
for file in os.listdir(dirname):
    if c == 0:
        dfs.append(pd.read_csv(dirname + '/' + file, low_memory=False, parse_dates=['Occur Date']))
    else:
        dfs.append(pd.read_csv(dirname + '/' + file, low_memory=False, parse_dates=['occur_date']))
    c+=1
            
zone_data = dfs[3:]

### Keep and rename columns for merge

In [5]:
keep = ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']

dfs[1].rename(columns = {'offense_id':'Report Number', 'rpt_date':'Report Date', 'occur_date':'Occur Date', 
                         'occur_time':'Occur Time', 'location':'Location', 'UC2_Literal':'UCR Literal', 
                          'neighborhood':'Neighborhood', 'lat':'Latitude', 'long':'Longitude'}, inplace = True)

dfs[2].rename(columns = {'offense_id':'Report Number', 'rpt_date':'Report Date', 'occur_date':'Occur Date', 
                         'occur_time':'Occur Time', 'location':'Location', 'UC2_Literal':'UCR Literal', 
                          'neighborhood':'Neighborhood', 'lat':'Latitude', 'long':'Longitude'}, inplace = True)

dfs[3].rename(columns = {'offense_id':'Report Number', 'rpt_date':'Report Date', 'occur_date':'Occur Date', 
                         'occur_time':'Occur Time', 'location':'Location', 'UC2_Literal':'UCR Literal', 
                          'neighborhood':'Neighborhood', 'lat':'Latitude', 'long':'Longitude'}, inplace = True)

dfs[4].rename(columns = {'offense_id':'Report Number', 'rpt_date':'Report Date', 'occur_date':'Occur Date', 
                         'occur_time':'Occur Time', 'location':'Location', 'UC2_Literal':'UCR Literal', 
                          'neighborhood':'Neighborhood', 'lat':'Latitude', 'long':'Longitude'}, inplace = True)

zone_keep = ['Report Number', 'zone']
zone_data[0].rename(columns = {'offense_id':'Report Number'}, inplace = True)
zone_data[1].rename(columns = {'offense_id':'Report Number'}, inplace = True)

### Drop unwanted columns

In [6]:
# Passes in a list data frames and a list of column names to keep for each data frame
def dfs_dropcols(dfs, keep):
    c = 0
    for df in dfs:
        dfs[c] = df[keep]
        c+=1
    return dfs

In [7]:
dfs = dfs_dropcols(dfs, keep)
zone_data = dfs_dropcols(zone_data, zone_keep)

In [8]:
# Test drop function
for df in dfs:
    print('df:', list(df))
for df in zone_data:
    print('zone_data:', list(df))

df: ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']
df: ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']
df: ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']
df: ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']
df: ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']
zone_data: ['Report Number', 'zone']
zone_data: ['Report Number', 'zone']


### Fix Date Formatting and Range

In [9]:
# Checking date formatting
for df in dfs:
    print(df['Occur Date'].head())
    

0   2009-01-01
1   2009-01-01
2   2009-01-01
3   2009-01-01
4   2009-01-01
Name: Occur Date, dtype: datetime64[ns]
0    11/29/2020
1    12/28/2020
2    12/24/2020
3    12/30/2020
4    12/31/2020
Name: Occur Date, dtype: object
0   2020-01-02
1   2020-01-06
2   2020-01-09
3   2020-01-15
4   2020-01-18
Name: Occur Date, dtype: datetime64[ns]
0   2021-05-19
1   2021-06-27
2   2021-01-04
3   2021-09-18
4   2020-10-03
Name: Occur Date, dtype: datetime64[ns]
0   2022-07-20
1   2020-10-06
2   2020-11-09
3   2020-11-14
4   2020-11-20
Name: Occur Date, dtype: datetime64[ns]


## Union dfs tables

In [10]:
df_union = None
for df in dfs:
    df_union = pd.concat([df_union, df]).drop_duplicates()
    
# check union 
summ = 0 
c = 0
for df in dfs:
    print(df.shape[0], f'rows in dfs[{c}]')
    summ+=df.shape[0]
    c+=1
print('...')
print(summ, 'rows in dfs total')
print(df_union.shape[0], 'rows in df_union total')
print('...')
print(summ - df_union.shape[0], 'duplicates were removed')

342914 rows in dfs[0]
7249 rows in dfs[1]
14831 rows in dfs[2]
21397 rows in dfs[3]
14605 rows in dfs[4]
...
400996 rows in dfs total
400813 rows in df_union total
...
183 duplicates were removed


## Union zone_data tables

In [11]:
zone_union = None
for z in zone_data:
    zone_union = pd.concat([zone_union, z]).drop_duplicates()
    
# check union 
summ = 0 
c = 0
for z in zone_data:
    print(z.shape[0], f'rows in zone_data[{c}]')
    summ+=z.shape[0]
    c+=1
print('...')
print(summ, 'rows in zone_data total')
print(zone_union.shape[0], 'rows in zone_union total')
print('...')
print(summ - zone_union.shape[0], 'duplicates were removed')

21397 rows in zone_data[0]
14605 rows in zone_data[1]
...
36002 rows in zone_data total
35749 rows in zone_union total
...
253 duplicates were removed


__________________________________

# Analysis
The analysis will be split into two pieces.  
The larger data frame of df_union will be in part 1 and part 2 will utilize zone_union.

## Part 1:  df_union

In [None]:
# 2 problems
# 1) The dates are out of range.  Anything older than 2009 needs to be dumped. -> before the zone_data is defined
# 2) the sorting is off

###  1a) Time-series analysis
This is a time-series analysis of volume.

# !!!!!!!!!!!!!!!!!! 2 problems
 1) The dates are out of range.  Anything older than 2009 needs to be dumped. -> before the zone_data is defined  
 2) the sorting is off

In [12]:
# Reset Occur date back to date type
# df_union.head()
df_union['Occur Date'] = pd.to_datetime(df_union['Occur Date'], errors = 'coerce')
# df_union.head().info()
df_union.sort_values(by='Occur Date', ascending=True)
df_union




# 2 problems
# 1) The dates are out of range.  Anything older than 2009 needs to be dumped. -> before the zone_data is defined
# 2) the sorting is off

Unnamed: 0,Report Number,Occur Date,Occur Time,Location,UCR Literal,Neighborhood,Latitude,Longitude
0,90010930,2009-01-01,1145,2841 GREENBRIAR PKWY,LARCENY-NON VEHICLE,Greenbriar,33.688450,-84.493280
1,90011083,2009-01-01,1330,12 BROAD ST SW,LARCENY-NON VEHICLE,Downtown,33.753200,-84.392010
2,90011208,2009-01-01,1500,3500 MARTIN L KING JR DR SW,LARCENY-NON VEHICLE,Adamsville,33.757350,-84.502820
3,90011218,2009-01-01,1450,3393 PEACHTREE RD NE,LARCENY-NON VEHICLE,Lenox,33.846760,-84.362120
4,90011289,2009-01-01,1600,2841 GREENBRIAR PKWY SW,LARCENY-NON VEHICLE,Greenbriar,33.686770,-84.497730
...,...,...,...,...,...,...,...,...
14600,222440403,2022-08-31,19:00,"3060 CONTINENTAL COLONY PKWY SW\nATL, GA 30331...",AUTO THEFT,Greenbriar,33.680364,-84.493697
14601,222440463,2022-08-31,20:00,"18 W PEACHTREE PL NW\nATL, GA 30308\nUNITED ST...",LARCENY-FROM VEHICLE,Downtown,33.763871,-84.387992
14602,222440602,2022-08-30,00:00,"608 RALPH MCGILL BLVD NE\nATL, GA 30308\nUNITE...",LARCENY-NON VEHICLE,Old Fourth Ward,33.764066,-84.367410
14603,222440722,2022-08-25,23:00,"360 EDGEWOOD AVE SE\nATLANTA, GA 30312\nUNITED...",AUTO THEFT,Sweet Auburn,33.754553,-84.376007


### 1b) ??? heat map of x=date and y=time

### 1c) ???

### 1d) ???

## Part 2: zone_union
Part 2 uses the zone_union table to create a choropleth map to signify volume within specific locations of Atlanta. 

# repeat date issue!!!!!!!!!!!!!!!!!!


 2 problems
 1) The dates are out of range.  Anything older than 2009 needs to be dumped. -> before the zone_data is defined  
 2) the sorting is off