In [1]:
import numpy as np, pandas as pd
from datetime import datetime

In [2]:
start_date = datetime(2018, 1, 1).date()
end_date = datetime(2020, 3, 15).date()

In [3]:
def standardize_datetime_one_arg(df, col, date_format): 
    def date_fn(d, date_format): 
        if type(d) == float or d == 'nan':
            return datetime.strptime('2000-01-01', '%Y-%m-%d').date() # will get filtered out
        return datetime.strptime(d, date_format).date()

    df['std_date'] = df[col].apply(date_fn, args=(date_format,))
    df = df[(df['std_date'] >= start_date) & (df['std_date'] <= end_date)]
    return df
                                   
def standardize_datetime_two_args(df, col1, date_format1, col2, date_format2):
    def date_fn(row, col1, date_format1, col2, date_format2):
        d1 = str(row[col1])
        d2 = str(row[col2])
        if type(d1) != float and d1 != 'nan':
            return datetime.strptime(d1, date_format1).date()
        elif type(d2) != float and d2 != 'nan': 
            return datetime.strptime(d2, date_format2).date()        
        else: 
            return datetime.strptime('2000-01-01', '%Y-%m-%d').date() # will get filtered out
    
    df['std_date'] = df.apply(date_fn, args=(col1, date_format1, col2, date_format2), axis=1)
    df = df[(df['std_date'] >= start_date) & (df['std_date'] <= end_date)]
    return df

In [4]:
def standardize_with_mapping(df, city_name, file_name, mapping_columns):
    mapping = pd.read_csv('mappings/mapping_' + file_name + '.csv', index_col=None, dtype=str)
    df['join_key'] = ''
    mapping['join_key'] = ''
    for key in mapping_columns: 
        df['join_key'] = df['join_key'] + df[key].fillna('').astype(str)
        mapping['join_key'] = mapping['join_key'] + mapping[key].fillna('').astype(str)

    ### DROPOUT ###
    dropout = pd.merge(df, mapping, how='left', on='join_key')
    print("Dropout:", dropout[dropout['category_1'].astype(str) == 'nan'].shape)
    ###############

    df = pd.merge(df, mapping, how='inner', on='join_key')
    df['city'] = city_name
    df = df[['std_date', 'city', 'category_1', 'category_2', 'category_3']]
    df.to_csv('standardized/standardized_incidents_' + file_name + '.csv', index=False)
    print('Length: ', len(df))
    return df

# Atlanta

In [5]:
city_name = 'Atlanta'
file_name = 'atlanta'
mapping_columns = ['UC2_Literal']

# date_col1 = 
# date_format1 = 
# date_col2 = 
# date_format2 = 

In [6]:
# Read raw
atl1 = pd.read_csv('raw/raw_incidents_atlanta/COBRA-2009-2019 (Updated 1_9_2020)/COBRA-2009-2019.csv', index_col=None)
atl2 = pd.read_csv('raw/raw_incidents_atlanta/COBRA-2020 Old RMS data to 09_29_2020/COBRA-2020-OldRMS-09292020.csv', index_col=None)
atl3 = pd.read_csv('raw/raw_incidents_atlanta/COBRA-2020 New RMS from 9_30_20 (Updated 10_22_2020)/COBRA-2020.csv', index_col=None)
atl1 = atl1.rename(columns={'Report Number':'offense_id',
                            'Report Date':'rpt_date',
                            'Occur Date':'occur_date',
                            'UCR Literal':'UC2_Literal'})

# Datetime
atl1 = standardize_datetime_two_args(atl1, 'occur_date', '%Y-%m-%d', 'rpt_date', '%Y-%m-%d')
atl2 = standardize_datetime_two_args(atl2, 'occur_date', '%m/%d/%Y', 'rpt_date', '%m/%d/%Y')
atl3 = standardize_datetime_two_args(atl3, 'occur_date', '%m/%d/%Y', 'rpt_date', '%m/%d/%Y')

atl1 = atl1[['offense_id', 'std_date', 'UC2_Literal']]
atl2 = atl2[['offense_id', 'std_date', 'UC2_Literal']]
atl3 = atl3[['offense_id', 'std_date', 'UC2_Literal']]

std = pd.concat([atl1, atl2, atl3], sort=True)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Dropout: (0, 8)
Length:  85337


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-01,Atlanta,Property,Theft,Theft from Auto
1,2018-01-01,Atlanta,Property,Theft,Theft from Auto
2,2018-01-01,Atlanta,Property,Theft,Theft from Auto
3,2018-01-01,Atlanta,Property,Theft,Theft from Auto
4,2018-01-01,Atlanta,Property,Theft,Theft from Auto


# Austin

In [7]:
city_name = 'Austin'
file_name = 'austin'
mapping_columns = ['crime_type', 'ucr_code']

date_col1 = 'occ_date'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [8]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (78448, 38)
Length:  152892


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-03-15,Austin,Drug,Drug,Drug
1,2020-03-14,Austin,Drug,Drug,Drug
2,2020-03-12,Austin,Drug,Drug,Drug
3,2020-03-10,Austin,Drug,Drug,Drug
4,2020-03-06,Austin,Drug,Drug,Drug


# Baltimore

In [9]:
city_name = 'Baltimore'
file_name = 'baltimore'
mapping_columns = ['Description']

date_col1 = 'CrimeDateTime'
date_format1 = '%Y/%m/%d %H:%M:%S+%f'
# date_col2 = 
# date_format2 = 

In [10]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (1539, 24)
Length:  101253


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-03-15,Baltimore,Violent,Homicide,Unclassified
1,2020-03-15,Baltimore,Violent,Homicide,Unclassified
2,2020-03-13,Baltimore,Violent,Homicide,Unclassified
3,2020-03-10,Baltimore,Violent,Homicide,Unclassified
4,2020-03-09,Baltimore,Violent,Homicide,Unclassified


# Boston

In [11]:
city_name = 'Boston'
file_name = 'boston'
mapping_columns = ['OFFENSE_DESCRIPTION']

date_col1 = 'OCCURRED_ON_DATE'
date_format1 = '%Y-%m-%d %H:%M:%S'
# date_col2 = 
# date_format2 = 

In [12]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (127319, 23)
Length:  88936


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2019-10-16,Boston,Violent,Assault,Aggravated Assault
1,2019-09-29,Boston,Violent,Assault,Aggravated Assault
2,2019-09-29,Boston,Violent,Assault,Aggravated Assault
3,2019-09-28,Boston,Violent,Assault,Aggravated Assault
4,2019-09-28,Boston,Violent,Assault,Aggravated Assault


# Chicago

In [13]:
city_name = 'Chicago'
file_name = 'chicago'
mapping_columns = ['iucr']

date_col1 = 'date'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [14]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (71197, 28)
Length:  516614


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-01,Chicago,Property,White Collar,Fraud
1,2018-01-01,Chicago,Property,White Collar,Fraud
2,2018-01-01,Chicago,Property,White Collar,Fraud
3,2018-01-01,Chicago,Property,White Collar,Fraud
4,2018-01-01,Chicago,Property,White Collar,Fraud


# Cincinnati

In [15]:
city_name = 'Cincinnati'
file_name = 'cincinnati'
mapping_columns = ['ucr_group', 'offense']

date_col1 = 'date_from'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
date_col2 = 'date_reported'
date_format2 = '%Y-%m-%dT%H:%M:%S.%f'

In [16]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_two_args(std, date_col1, date_format1, date_col2, date_format2)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (14139, 47)
Length:  63966


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2019-02-16,Cincinnati,Violent,Rape,Rape
1,2018-05-01,Cincinnati,Violent,Rape,Rape
2,2019-02-05,Cincinnati,Violent,Rape,Rape
3,2018-01-05,Cincinnati,Violent,Rape,Rape
4,2018-05-27,Cincinnati,Violent,Rape,Rape


# Dallas

In [17]:
city_name = 'Dallas'
file_name = 'dallas'
mapping_columns = ['ucr_offense', 'nibrs_crime', 'nibrs_crime_category']

date_col1 = 'date1'
date_format1 = '%Y-%m-%d %H:%M:%S.0%f'
# date_col2 = 
# date_format2 = 

In [18]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (107033, 107)
Length:  170799


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2019-09-04,Dallas,Property,Other Property Crime,Other Property Crime
1,2018-07-08,Dallas,Property,Other Property Crime,Other Property Crime
2,2018-07-08,Dallas,Property,Other Property Crime,Other Property Crime
3,2018-08-02,Dallas,Property,Other Property Crime,Other Property Crime
4,2018-07-08,Dallas,Property,Other Property Crime,Other Property Crime


# Detroit

In [19]:
city_name = 'Detroit'
file_name = 'detroit'
mapping_columns = ['offense_category']

date_col1 = 'incident_timestamp'
date_format1 = '%Y/%m/%d %H:%M:%S+00'
# date_col2 = 
# date_format2 = 

In [20]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (16489, 30)
Length:  160386


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-09,Detroit,Violent,Robbery,Robbery
1,2018-01-21,Detroit,Violent,Robbery,Robbery
2,2018-01-23,Detroit,Violent,Robbery,Robbery
3,2018-01-28,Detroit,Violent,Robbery,Robbery
4,2018-10-19,Detroit,Violent,Robbery,Robbery


# Fort Worth

In [21]:
city_name = 'Fort Worth'
file_name = 'fort_worth'
mapping_columns = ['offense']

date_col1 = 'from_date'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [22]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (17916, 23)
Length:  63955


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2019-07-27,Fort Worth,Violent,Assault,Simple Assault
1,2019-12-11,Fort Worth,Violent,Assault,Simple Assault
2,2019-06-11,Fort Worth,Violent,Assault,Simple Assault
3,2019-07-08,Fort Worth,Violent,Assault,Simple Assault
4,2019-06-03,Fort Worth,Violent,Assault,Simple Assault


# Los Angeles

In [23]:
city_name = 'Los Angeles'
file_name = 'los_angeles'
mapping_columns = ['crm_cd']

date_col1 = 'date_occ'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [24]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (34267, 34)
Length:  454458


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-01-08,Los Angeles,Violent,Assault,Simple Assault
1,2020-01-01,Los Angeles,Violent,Assault,Simple Assault
2,2020-01-01,Los Angeles,Violent,Assault,Simple Assault
3,2020-01-01,Los Angeles,Violent,Assault,Simple Assault
4,2020-01-01,Los Angeles,Violent,Assault,Simple Assault


# Milwaukee

In [25]:
std = pd.read_csv('raw/raw_incidents_milwaukee.csv', index_col=None, dtype=str)
mapping = pd.read_csv('mappings/mapping_milwaukee.csv', index_col=None, dtype=str)

for col_name in mapping['ColName'].values:
    std[col_name] = std[col_name].astype(str)

def standardize_datetime_milwaukee(d): 
    return datetime.strptime(d, '%Y-%m-%d %H:%M:%S').date()
std['std_date'] = std['ReportedDateTime'].apply(standardize_datetime_milwaukee)
std = std[(std['std_date'] >= start_date) & (std['std_date'] <= end_date)]

all_crime_types = pd.DataFrame()
for col_name in mapping['ColName'].values:
    one_crime_type = std.loc[std[col_name] == '1', ['std_date',col_name]]
    one_crime_type['join_key'] = col_name
    one_crime_type = one_crime_type.drop(col_name, axis=1)
    all_crime_types = pd.concat([all_crime_types, one_crime_type])    
mapping['join_key'] = mapping['ColName'].fillna('').astype(str)
std = pd.merge(all_crime_types, mapping, how='inner', on='join_key')
std['city'] = 'Milwaukee'
std = std[['std_date', 'city', 'category_1', 'category_2', 'category_3']]
std.to_csv('standardized/standardized_incidents_milwaukee.csv', index=False)

print('Length: ', len(std))
std.head()

Length:  76682


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-02,Milwaukee,Violent,Homicide,Murder
1,2018-01-25,Milwaukee,Violent,Homicide,Murder
2,2018-01-29,Milwaukee,Violent,Homicide,Murder
3,2018-02-14,Milwaukee,Violent,Homicide,Murder
4,2018-05-12,Milwaukee,Violent,Homicide,Murder


# New York City

In [26]:
city_name = 'New York City'
file_name = 'new_york_city'
mapping_columns = ['ofns_desc', 'law_cat_cd', 'pd_desc']

date_col1 = 'cmplnt_fr_dt'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [27]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (374396, 44)
Length:  652750


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-02-20,New York City,Property,White Collar,Fraud
1,2020-03-01,New York City,Property,White Collar,Fraud
2,2020-02-24,New York City,Property,White Collar,Fraud
3,2019-10-15,New York City,Property,White Collar,Fraud
4,2019-03-01,New York City,Property,White Collar,Fraud


# Philadelphia

In [28]:
city_name = 'Philadelphia'
file_name = 'philadelphia'
mapping_columns = ['ucr_general', 'text_general_code']

date_col1 = 'dispatch_date'
date_format1 = '%Y-%m-%d'
# date_col2 = 
# date_format2 = 

In [29]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (79356, 23)
Length:  266533


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-06,Philadelphia,Property,Theft,Unclassified
1,2018-03-29,Philadelphia,Property,Theft,Unclassified
2,2018-05-18,Philadelphia,Property,Theft,Unclassified
3,2018-05-24,Philadelphia,Property,Theft,Unclassified
4,2019-11-12,Philadelphia,Property,Theft,Unclassified


# Phoenix

In [30]:
city_name = 'Phoenix'
file_name = 'phoenix'
mapping_columns = ['UCR CRIME CATEGORY']

date_col1 = 'OCCURRED ON'
date_format1 = '%m/%d/%Y  %H:%M'
date_col2 = 'OCCURRED TO'
date_format2 = '%m/%d/%Y  %H:%M'

In [31]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_two_args(std, date_col1, date_format1, date_col2, date_format2)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (0, 13)
Length:  148006


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-01,Phoenix,Property,Theft,Unclassified
1,2018-01-01,Phoenix,Property,Theft,Unclassified
2,2018-01-01,Phoenix,Property,Theft,Unclassified
3,2018-01-01,Phoenix,Property,Theft,Unclassified
4,2018-01-01,Phoenix,Property,Theft,Unclassified


# San Francisco

In [32]:
city_name = 'San Francisco'
file_name = 'san_francisco'
mapping_columns = ['incident_category']

date_col1 = 'incident_datetime'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [33]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (118130, 42)
Length:  214004


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-01,San Francisco,Property,Theft,Grand Larceny Auto
1,2018-01-01,San Francisco,Property,Theft,Grand Larceny Auto
2,2018-01-01,San Francisco,Property,Theft,Grand Larceny Auto
3,2018-01-01,San Francisco,Property,Theft,Grand Larceny Auto
4,2018-01-02,San Francisco,Property,Theft,Grand Larceny Auto


# Seattle

In [34]:
city_name = 'Seattle'
file_name = 'seattle'
mapping_columns = ['offense_parent_group', 'offense']

date_col1 = 'offense_start_datetime'
date_format1 = '%Y-%m-%d %H:%M:%S'
date_col2 = 'report_datetime'
date_format2 = '%Y-%m-%d %H:%M:%S'

In [35]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_two_args(std, date_col1, date_format1, date_col2, date_format2)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (8450, 24)
Length:  152195


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-02-05,Seattle,Drug,Drug,Drug
1,2020-02-04,Seattle,Drug,Drug,Drug
2,2020-02-04,Seattle,Drug,Drug,Drug
3,2020-02-03,Seattle,Drug,Drug,Drug
4,2020-02-03,Seattle,Drug,Drug,Drug


# Washington DC

In [36]:
city_name = 'Washington DC'
file_name = 'washington_dc'
mapping_columns = ['OFFENSE']

date_col1 = 'START_DATE'
date_format1 = '%Y/%m/%d %H:%M:%S+00'
# date_col2 = 
# date_format2 = 

In [37]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (0, 31)
Length:  74270


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-04-01,Washington DC,Violent,Rape,Rape
1,2018-01-22,Washington DC,Violent,Rape,Rape
2,2018-01-01,Washington DC,Violent,Rape,Rape
3,2018-01-23,Washington DC,Violent,Rape,Rape
4,2018-01-02,Washington DC,Violent,Rape,Rape


# Houston

In [38]:
# Jan - May 2018
df1 = pd.DataFrame()
months = ['jan', 'feb', 'mar', 'apr', 'may']
for m in months: 
    tmp = pd.read_excel('raw/raw_incidents_houston/2018/' + m + '18.xls')    
    df1 = pd.concat([df1, tmp], sort=True)
mapping = pd.read_csv('mappings/mapping_houston1.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(df1, 'Date', '%m/%d/%Y')

std['join_key'] = std['Offense Type'].fillna('').astype(str)
mapping['join_key'] = mapping['Offense Type'].fillna('').astype(str)
std = pd.merge(std, mapping, how='inner', on='join_key')
std['city'] = 'Houston'
std = std[['std_date', 'city', 'category_1', 'category_2', 'category_3']]

std2 = pd.DataFrame()
std2 = pd.concat([std2, std], sort=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std['join_key'] = std['Offense Type'].fillna('').astype(str)


In [39]:
# June - Dec 2018
df2 = pd.DataFrame()
months = ['june', 'july', 'aug', 'sep', 'oct', 'nov', 'dec']
for m in months: 
    tmp = pd.read_excel('raw/raw_incidents_houston/2018/flattened_' + m + '18.xlsx')    
    df2 = pd.concat([df2, tmp], sort=True)
mapping = pd.read_csv('mappings/mapping_houston2.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(df2, 'Occurrence Date', '%Y-%m-%d')

std['join_key'] = std['NIBRS Description'].fillna('').astype(str)
mapping['join_key'] = mapping['NIBRS Description'].fillna('').astype(str)
std = pd.merge(std, mapping, how='inner', on='join_key')
std['city'] = 'Houston'
std = std[['std_date', 'city', 'category_1', 'category_2', 'category_3']]

std2 = pd.concat([std2, std], sort=True)

In [40]:
# 2019
df3 = pd.read_excel('raw/raw_incidents_houston/2019/2019_NIBRSPublicView.Jan1-Dec31.xlsx', dtype=str)

# 2020
tmp = pd.read_excel('raw/raw_incidents_houston/2020/NIBRSPublicView.Jan1-Dec31-2020.xlsx', dtype=str)
df3 = pd.concat([df3, tmp], sort=True)

mapping = pd.read_csv('mappings/mapping_houston3.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(df3, 'Occurrence\nDate', '%Y-%m-%d %H:%M:%S')

std['join_key'] = std['NIBRSDescription'].fillna('').astype(str)
mapping['join_key'] = mapping['NIBRSDescription'].fillna('').astype(str)
std = pd.merge(std, mapping, how='inner', on='join_key')
std['city'] = 'Houston'
std = std[['std_date', 'city', 'category_1', 'category_2', 'category_3']]

std2 = pd.concat([std2, std], sort=True)
std2.to_csv('standardized/standardized_incidents_houston.csv', index=False)
print('Length:', len(std2))
std2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  std['join_key'] = std['NIBRSDescription'].fillna('').astype(str)


Length: 404456


Unnamed: 0,category_1,category_2,category_3,city,std_date
0,Violent,Robbery,Robbery,Houston,2018-01-01
1,Violent,Robbery,Robbery,Houston,2018-01-14
2,Violent,Robbery,Robbery,Houston,2018-01-01
3,Violent,Robbery,Robbery,Houston,2018-01-04
4,Violent,Robbery,Robbery,Houston,2018-01-13


# Portland

In [41]:
city_name = 'Portland'
file_name = 'portland'
mapping_columns = ['OffenseType']

date_col1 = 'OccurDate'
date_format1 = '%m/%d/%Y'
# date_col2 = 
# date_format2 = 

In [42]:
# Portland
df = pd.DataFrame()

# 2018
tmp = pd.read_csv('raw/raw_incidents_portland/CrimeData-2018.csv')    
df = pd.concat([df, tmp], sort=True)

# 2019
tmp = pd.read_csv('raw/raw_incidents_portland/CrimeData-2019.csv')    
df = pd.concat([df, tmp], sort=True)

# 2020
tmp = pd.read_csv('raw/raw_incidents_portland/CrimeData-2020.csv')    
df = pd.concat([df, tmp], sort=True)

In [43]:
std = df
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['join_key'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['join_key'] = df['join_key'] + df[key].fillna('').astype(str)


Dropout: (4436, 20)
Length:  126745


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-01,Portland,Violent,Assault,Simple Assault
1,2018-01-01,Portland,Violent,Assault,Simple Assault
2,2018-01-01,Portland,Violent,Assault,Simple Assault
3,2018-01-02,Portland,Violent,Assault,Simple Assault
4,2018-01-02,Portland,Violent,Assault,Simple Assault


# Denver

In [44]:
city_name = 'Denver'
file_name = 'denver'
mapping_columns = ['OFFENSE_TYPE_ID', 'OFFENSE_CATEGORY_ID']

date_col1 = 'FIRST_OCCURRENCE_DATE'
date_format1 = '%m/%d/%Y %I:%M:%S %p'
# date_col2 = 
# date_format2 = 

In [45]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (106828, 26)
Length:  94579


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2019-11-20,Denver,Drug,Drug,Drug
1,2019-11-22,Denver,Drug,Drug,Drug
2,2019-09-27,Denver,Drug,Drug,Drug
3,2019-01-25,Denver,Drug,Drug,Drug
4,2018-11-16,Denver,Drug,Drug,Drug


# Nashville

In [46]:
city_name = 'Nashville'
file_name = 'nashville'
mapping_columns = ['offense_description']

date_col1 = 'incident_occurred'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [47]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (93999, 37)
Length:  160822


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-03-05,Nashville,Property,Theft,Grand Larceny Auto
1,2018-07-07,Nashville,Property,Theft,Grand Larceny Auto
2,2018-07-19,Nashville,Property,Theft,Grand Larceny Auto
3,2019-10-31,Nashville,Property,Theft,Grand Larceny Auto
4,2018-04-25,Nashville,Property,Theft,Grand Larceny Auto


# Louisville

In [48]:
city_name = 'Louisville'
file_name = 'louisville'
mapping_columns = ['CRIME_TYPE']

date_col1 = 'DATE_OCCURED'
date_format1 = '%Y-%m-%d %H:%M:%S'
date_col2 = 'DATE_REPORTED'
date_format2 = '%Y-%m-%d %H:%M:%S'

In [49]:
# Louisville

df = pd.DataFrame()

# 2018
tmp = pd.read_csv('raw/raw_incidents_louisville/Crime_Data_2018.csv')    
df = pd.concat([df, tmp], sort=True)

# 2019
tmp = pd.read_csv('raw/raw_incidents_louisville/Crime_Data_2019.csv')    
df = pd.concat([df, tmp], sort=True)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [50]:
std = df
std = standardize_datetime_two_args(std, date_col1, date_format1, date_col2, date_format2)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['join_key'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['join_key'] = df['join_key'] + df[key].fillna('').astype(str)


Dropout: (22202, 21)
Length:  143457


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-03-13,Louisville,Property,Other Property Crime,Other Property Crime
1,2018-03-13,Louisville,Property,Other Property Crime,Other Property Crime
2,2018-03-13,Louisville,Property,Other Property Crime,Other Property Crime
3,2018-03-13,Louisville,Property,Other Property Crime,Other Property Crime
4,2018-03-13,Louisville,Property,Other Property Crime,Other Property Crime


# Kansas City

In [51]:
city_name = 'Kansas City'
file_name = 'kansas_city'
mapping_columns = ['offense']

date_col1 = 'from_date'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
date_col2 = 'reported_date'
date_format2 = '%Y-%m-%dT%H:%M:%S.%f'

In [52]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_two_args(std, date_col1, date_format1, date_col2, date_format2)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (73598, 33)
Length:  177585


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-01-01,Kansas City,Violent,Robbery,Robbery
1,2020-01-01,Kansas City,Violent,Robbery,Robbery
2,2020-01-06,Kansas City,Violent,Robbery,Robbery
3,2020-01-06,Kansas City,Violent,Robbery,Robbery
4,2020-02-07,Kansas City,Violent,Robbery,Robbery


# Raleigh

In [53]:
city_name = 'Raleigh'
file_name = 'raleigh'
mapping_columns = ['crime_description']

date_col1 = 'reported_date'
date_format1 = '%Y/%m/%d %H:%M:%S+%f'
# date_col2 = 
# date_format2 = 

In [54]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()


Dropout: (42743, 29)
Length:  68199


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-02-17,Raleigh,Violent,Rape,Rape
1,2018-02-18,Raleigh,Violent,Rape,Rape
2,2018-01-21,Raleigh,Violent,Rape,Rape
3,2018-02-01,Raleigh,Violent,Rape,Rape
4,2018-02-18,Raleigh,Violent,Rape,Rape


# Buffalo

In [55]:
city_name = 'Buffalo'
file_name = 'buffalo'
mapping_columns = ['incident_type_primary']

date_col1 = 'incident_datetime'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [56]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (369, 37)
Length:  31283


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2020-01-05,Buffalo,Property,Theft,Unclassified
1,2020-01-04,Buffalo,Property,Theft,Unclassified
2,2020-01-05,Buffalo,Property,Theft,Unclassified
3,2020-01-05,Buffalo,Property,Theft,Unclassified
4,2020-01-04,Buffalo,Property,Theft,Unclassified


# Virginia Beach

In [57]:
city_name = 'Virginia Beach'
file_name = 'virginia_beach'
mapping_columns = ['Offense Description']

date_col1 = 'Date Occured'
date_format1 = '%Y-%m-%d %H:%M:%S'
# date_col2 = 
# date_format2 = 

In [58]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (13601, 19)
Length:  48036


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2019-10-18,Virginia Beach,Property,Theft,Theft from Auto
1,2018-01-01,Virginia Beach,Property,Theft,Theft from Auto
2,2018-01-01,Virginia Beach,Property,Theft,Theft from Auto
3,2018-01-01,Virginia Beach,Property,Theft,Theft from Auto
4,2018-01-01,Virginia Beach,Property,Theft,Theft from Auto


# Little Rock

In [59]:
city_name = 'Little Rock'
file_name = 'little_rock'
mapping_columns = ['offense_description']

date_col1 = 'incident_date'
date_format1 = '%Y-%m-%dT%H:%M:%S.%f'
# date_col2 = 
# date_format2 = 

In [60]:
std = pd.read_csv('raw/raw_incidents_' + file_name + '.csv', index_col=None, dtype=str)
std = standardize_datetime_one_arg(std, date_col1, date_format1)
std = standardize_with_mapping(std, city_name, file_name, mapping_columns)
std.head()

Dropout: (0, 31)
Length:  33900


Unnamed: 0,std_date,city,category_1,category_2,category_3
0,2018-01-07,Little Rock,Violent,Rape,Rape
1,2018-01-08,Little Rock,Violent,Rape,Rape
2,2018-01-14,Little Rock,Violent,Rape,Rape
3,2018-01-17,Little Rock,Violent,Rape,Rape
4,2018-01-22,Little Rock,Violent,Rape,Rape
