# Atlanta Crime Report: 2009 - 2022

### Import Libraries

In [1]:
import pandas as pd
import altair as alt
import numpy as np
import os

# import geopandas as gpd

### def import_csv_lst -> extracts data from list of csv files contained in a folder

In [2]:
def import_csv_lst(dirname):
    """Load every file found in a folder into a pandas DataFrame.

    Parameters
    ----------
    dirname : str
        Folder to read. Every entry returned by ``os.listdir`` is passed to
        ``pd.read_csv``, so the folder is expected to contain only csv files.

    Returns
    -------
    tuple
        ``(dfs, files)``: ``files`` is the sorted list of file names found in
        ``dirname`` and ``dfs`` holds the matching DataFrames in the same
        order.
    """
    # List the folder once and sort it: os.listdir returns entries in
    # arbitrary order, and later cells address the frames by position.
    files = sorted(os.listdir(dirname))
    print('Importing:', files, '\n...')

    dfs = []
    for name in files:
        # Build the path from dirname rather than a hard-coded folder name so
        # the function reads from whichever directory the caller passed in.
        file = os.path.join(dirname, name)
        print('loading:', file)
        dfs.append(pd.read_csv(file, low_memory=False))

    # A failed read raises, so reaching this line means every file was loaded.
    print('... \nSuccess')

    return (dfs, files)

### Import Data

In [3]:
# Folder that holds the COBRA csv files to load.
dirname = 'COBRA-Data'
# import_csv_lst returns a tuple: (list of DataFrames, list of file names).
dfs_files = import_csv_lst(dirname)

Importing: ['COBRA-2009-2019.csv', 'COBRA-2020(NEW RMS 9-30 12-31).csv', 'COBRA-2020-OldRMS-09292020.csv', 'COBRA-2021.csv', 'COBRA-2022.csv'] 
...
loading: COBRA-Data/COBRA-2009-2019.csv
loading: COBRA-Data/COBRA-2020(NEW RMS 9-30 12-31).csv
loading: COBRA-Data/COBRA-2020-OldRMS-09292020.csv
loading: COBRA-Data/COBRA-2021.csv
loading: COBRA-Data/COBRA-2022.csv
... 
Success


### Checkpoint - Copy

In [4]:
# Checkpoint: copy each DataFrame, not just the list. list.copy() is shallow,
# so both lists would still point at the same DataFrame objects and the
# in-place renames further down would also change the frames in dfs_files.
dfs = [df.copy() for df in dfs_files[0]]
files = dfs_files[1]

## Joining Data

### def -> dfsLst_to_rowsOfColumns
This function iterates through a list of dataFrames and builds a new dataFrame in which each column is named after a source file and each row holds one of that file's column names. 

In [5]:
# Builds a dictionary of {file name: [column names]}
def csv_and_cols_to_dict(dfs, files):
    """Map each file name to the column labels of its DataFrame.

    ``dfs`` and ``files`` are matched by position: the columns of the frame
    at index ``n`` are stored under the key ``files[n]``.
    """
    return {files[pos]: list(frame.columns) for pos, frame in enumerate(dfs)}


# Returns the length of the longest list stored under any key of the dictionary
def max_dict_len(d):
    """Return the largest ``len`` among the values of ``d`` (0 if ``d`` is empty)."""
    return max((len(values) for values in d.values()), default=0)


# Pads the shorter lists so a data frame can be built from the dictionary
def len_adjust(d):
    """Pad every list in ``d`` with 'XXX' up to the length of the longest list.

    The lists are extended in place and the same dictionary object is
    returned.
    """
    target = max_dict_len(d)  # length every list must reach
    for columns in d.values():
        missing = target - len(columns)
        if missing > 0:
            columns.extend(['XXX'] * missing)
    return d


# Collects the columns of every dataframe in a list into an equal-length
# dictionary (keys = file names, values = column names)
def dfNameToCol_dfColToRow(dfs, files):
    """Build the file-name -> column-names dictionary and pad it to equal length."""
    columns_by_file = csv_and_cols_to_dict(dfs, files)
    return len_adjust(columns_by_file)


# Converts a list of dataFrames into one data frame: one column per file name,
# one row per column name of the corresponding source frame
def dfsLst_to_rowsOfColumns(dfs, files):
    """Return a DataFrame listing each source frame's column names side by side."""
    padded = dfNameToCol_dfColToRow(dfs, files)
    return pd.DataFrame.from_dict(padded)

### View of every column for each data frame

In [6]:
# Build the side-by-side table of column names for every loaded file; the bare
# `d` on the last line makes the notebook display it.
d = dfsLst_to_rowsOfColumns(dfs, files)
d

Unnamed: 0,COBRA-2009-2019.csv,COBRA-2020(NEW RMS 9-30 12-31).csv,COBRA-2020-OldRMS-09292020.csv,COBRA-2021.csv,COBRA-2022.csv
0,Report Number,offense_id,offense_id,offense_id,offense_id
1,Report Date,rpt_date,rpt_date,rpt_date,rpt_date
2,Occur Date,occur_date,occur_date,occur_date,occur_date
3,Occur Time,occur_time,occur_time,occur_day,occur_day
4,Possible Date,poss_date,poss_date,occur_day_num,occur_day_num
5,Possible Time,poss_time,poss_time,occur_time,occur_time
6,Beat,beat,beat,poss_date,poss_date
7,Apartment Office Prefix,apt_office_prefix,apartment_office_prefix,poss_time,poss_time
8,Apartment Number,apt_office_num,apartment_number,beat,beat
9,Location,location,location,zone,zone


## Keep and rename columns for merge

In [7]:
# Columns retained in every frame for the merge.
keep = ['Report Number', 'Occur Date', 'Occur Time', 'Location', 'UCR Literal', 'Neighborhood', 'Latitude', 'Longitude']

# One mapping from the lowercase source names onto the names listed in `keep`
# (plus 'Report Date'). It used to be repeated verbatim for dfs[1] .. dfs[4].
column_renames = {
    'offense_id': 'Report Number',
    'rpt_date': 'Report Date',
    'occur_date': 'Occur Date',
    'occur_time': 'Occur Time',
    'location': 'Location',
    'UC2_Literal': 'UCR Literal',
    'neighborhood': 'Neighborhood',
    'lat': 'Latitude',
    'long': 'Longitude',
}

# dfs[0] is left untouched; every later frame gets the same in-place rename.
for frame in dfs[1:]:
    frame.rename(columns=column_renames, inplace=True)

### Def dfs_dropcols -> iterates through list of dataFrames and drops unwanted columns with a single list input

In [8]:
def dfs_dropcols(dfs, keep, names=None):
    """Reduce every DataFrame in ``dfs`` to the columns listed in ``keep``.

    Parameters
    ----------
    dfs : list
        DataFrames to trim. The list is updated in place: each entry is
        replaced by ``df[keep]``.
    keep : list
        Column labels to retain, in the order they should appear.
    names : list, optional
        Labels for the columns of the returned overview, one per frame.
        When omitted, the notebook-level ``files`` list is used, which is
        what the function read implicitly before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Overview of the remaining column names per frame, as built by
        ``dfsLst_to_rowsOfColumns``.
    """
    if names is None:
        names = files  # fall back to the notebook-level list of file names
    for pos, df in enumerate(dfs):
        dfs[pos] = df[keep]
    return dfsLst_to_rowsOfColumns(dfs, names)
    

In [9]:
# Trim every frame in dfs to the `keep` columns; the returned overview of the
# remaining column names is displayed by the notebook.
dfs_dropcols(dfs, keep)

Unnamed: 0,COBRA-2009-2019.csv,COBRA-2020(NEW RMS 9-30 12-31).csv,COBRA-2020-OldRMS-09292020.csv,COBRA-2021.csv,COBRA-2022.csv
0,Report Number,Report Number,Report Number,Report Number,Report Number
1,Occur Date,Occur Date,Occur Date,Occur Date,Occur Date
2,Occur Time,Occur Time,Occur Time,Occur Time,Occur Time
3,Location,Location,Location,Location,Location
4,UCR Literal,UCR Literal,UCR Literal,UCR Literal,UCR Literal
5,Neighborhood,Neighborhood,Neighborhood,Neighborhood,Neighborhood
6,Latitude,Latitude,Latitude,Latitude,Latitude
7,Longitude,Longitude,Longitude,Longitude,Longitude


# !!!!!!!!!!!!!!ABOVE IS GOOD!!!!!!!!!!!!!!! 
The data is ready to merge (I have a special plan for the heat map).

# List of the analyses I am interested in 
1) heat map using lat and long  -----> zone_data = dfs.copy() <-----  (join 2021 and 2022 together with zone_data[0:2]) 
2) time series of crime volume  
3) breakdown of the most common crimes (bar chart)  
4) breakdown of the time of occurrence (bar chart)  


Remember to delete duplicates.

For each section, is it easier to select the needed columns from each data frame separately, or should I join the data first?

### Template for renaming columns - switch this cell to code to copy it
dfs[0].rename(columns = {'Report Number':'??????', 'Report Date':'??????', 'Occur Date':'??????', 
                         'Occur Time':'??????','Possible Date':'??????', 'Possible Time':'??????', 
                         'Beat':'??????', 'Apartment Office Prefix':'??????', 'Apartment Number':'??????', 
                         'Location':'??????', 'Shift Occurence':'??????', 'Location Type':'??????',
                         'UCR Literal':'??????', 'UCR #':'??????', 'IBR Code':'??????', 'Neighborhood':'??????', 
                         'NPU':'??????', 'Latitude':'??????', 'Longitude':'??????'}, inplace = True)