In [2]:
import pandas as pd
from IPython.display import HTML

In [3]:
# Load version 2 of the crime data into a DataFrame.
v2_csv_path = 'v2crimedata.csv'
df = pd.read_csv(v2_csv_path)

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Incident UID,Agency,Incident Date,CIBRS Unique Offense ID,CIBRS Offense Code,CIBRS Offense Description,Victim UID,Victim Category,Victim Age,Overall Race,City,Zip Code,Census Tract,CIBRS Status,Domestic Violence Incident
0,661987,SAN DIEGO,2/18/21 17:40,661987-13A-679864,13A,Aggravated Assault,679864,I,22.0,HISPANIC,,92037.0,,ProcessingComplete - Valid,True
1,793674,SAN DIEGO,3/16/21 5:30,793674-11A-818279,11A,Forcible Rape,818279,I,41.0,WHITE,,,,ProcessingComplete - Valid,False
2,859375,NATIONAL CITY,3/22/21 14:25,859375-11A-887268,11A,Forcible Rape,887268,I,22.0,HISPANIC,,,,ProcessingComplete - Valid,False
3,658884,SAN DIEGO,3/24/21 12:00,658884-13B-676587,13B,Simple Assault,676587,I,45.0,HISPANIC,,,,ProcessingComplete - Valid,True
4,721771,CARLSBAD,5/4/21 18:54,721771-13B-743217,13B,Simple Assault,743217,I,28.0,HISPANIC,,92011.0,,ProcessingComplete - Valid,False


In [5]:
# Number of rows with no City value.
df['City'].isna().sum()

2402

## Steps for cleaning the 'City' column:
- For rows that have 'SHERIFF' as the city name, replace those values with null and then do one of two things:
    1. Use zip code to figure out the city
    2. Use the 'agency' column to estimate what city the incident occurred in


In [7]:
# Replace the placeholder city name 'SHERIFF' with a missing value.
# Series.mask is used instead of Series.replace('SHERIFF', None): older pandas
# versions treat value=None as "no replacement value given" and forward-fill
# from the previous row rather than inserting a null.

df['City'] = df['City'].mask(df['City'] == 'SHERIFF')

In [8]:
# Zip code -> city name lookup. The table is written grouped by city and then
# flattened, so each city name is spelled exactly once.
_zips_by_city = {
    'SAN DIEGO': [
        92037, 92105, 92101, 92115, 92113, 92102, 92107, 92122, 92104, 92110,
        92123, 92126, 92116, 92114, 92111, 92109, 92129, 92103, 92117, 92154,
        92139, 92121, 92106, 92124, 92128, 92120, 92130, 92108, 92131, 92119,
        92127, 92136, 92145, 92135, 92140, 92182, 92134, 92161, 92155,
    ],
    'CARLSBAD': [92011, 92008, 92010, 92009],
    'EL CAJON': [92020, 92021, 92019],
    'LA MESA': [91942, 91941],
    'SANTEE': [92071],
    'VALLEY CENTER': [92082],
    'LAKESIDE': [92040],
    'SPRING VALLEY': [91977, 91978],
    'LEMON GROVE': [91945],
    'FALLBROOK': [92028],
    'OCEANSIDE': [92058, 92054, 92056, 92057],
    'VISTA': [92081, 92083, 92084],
    'NATIONAL CITY': [91950],
    'SAN YSIDRO': [92173],
    'ESCONDIDO': [92025, 92027, 92026, 92029],
    'IMPERIAL BEACH': [91932],
    'CHULA VISTA': [91910, 91914, 91911, 91915, 91913],
    'POWAY': [92064],
    'ENCINITAS': [92024],
    'SAN MARCOS': [92069, 92078, 92096],
    'DEL MAR': [92014],
    'JAMUL': [91935],
    'BOULEVARD': [91905],
    'BONSALL': [92003],
    'ALPINE': [91901],
    'PALA': [92059],
    'CAMPO': [91906],
    'RAMONA': [92065],
    'BORREGO SPRINGS': [92004],
    'BONITA': [91902],
    'JACUMBA': [91934],
    'SOLANA BEACH': [92075],
    'POTRERO': [91963],
    'DULZURA': [91917],
    'CARDIFF BY THE SEA': [92007],
    'RANCHO SANTA FE': [92091, 92067],
    'CAMP PENDLETON': [92055],
    'DESCANSO': [91916],
    'PINE VALLEY': [91962],
    'PAUMA VALLEY': [92061],
    'LA JOLLA': [92093],
    'GUATAY': [91931],
    'WARNER SPRINGS': [92086],
    'JULIAN': [92036],
    'SAN CLEMENTE': [92672],
    'SANTA YSABEL': [92070],
    'RANCHITA': [92066],
    'AGUANGA': [92536],
    'CORONADO': [92118],
    'PALOMAR MOUNTAIN': [92060],
    'TECATE': [91980],
    'MOUNT LAGUNA': [91948],
}

zipcode_city_dict = {
    zip_code: city
    for city, zip_codes in _zips_by_city.items()
    for zip_code in zip_codes
}


In [9]:
# Fill in missing City values by looking up each row's zip code.

def _city_from_zip(row):
    """Return the row's City, or the zip-code lookup result when City is null."""
    if pd.isnull(row['City']):
        return zipcode_city_dict.get(row['Zip Code'])
    return row['City']

df['City'] = df.apply(_city_from_zip, axis=1)

df

Unnamed: 0,Incident UID,Agency,Incident Date,CIBRS Unique Offense ID,CIBRS Offense Code,CIBRS Offense Description,Victim UID,Victim Category,Victim Age,Overall Race,City,Zip Code,Census Tract,CIBRS Status,Domestic Violence Incident
0,661987,SAN DIEGO,2/18/21 17:40,661987-13A-679864,13A,Aggravated Assault,679864,I,22.0,HISPANIC,SAN DIEGO,92037.0,,ProcessingComplete - Valid,True
1,793674,SAN DIEGO,3/16/21 5:30,793674-11A-818279,11A,Forcible Rape,818279,I,41.0,WHITE,,,,ProcessingComplete - Valid,False
2,859375,NATIONAL CITY,3/22/21 14:25,859375-11A-887268,11A,Forcible Rape,887268,I,22.0,HISPANIC,,,,ProcessingComplete - Valid,False
3,658884,SAN DIEGO,3/24/21 12:00,658884-13B-676587,13B,Simple Assault,676587,I,45.0,HISPANIC,,,,ProcessingComplete - Valid,True
4,721771,CARLSBAD,5/4/21 18:54,721771-13B-743217,13B,Simple Assault,743217,I,28.0,HISPANIC,CARLSBAD,92011.0,,ProcessingComplete - Valid,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62328,2332374,OCEANSIDE,9/28/24 19:20,2332374-13B-2045587,13B,Simple Assault,2045587,I,36.0,HISPANIC,OCEANSIDE,92054.0,184.00,ProcessingComplete - Valid,True
62329,2340093,SHERIFF,9/30/24 19:02,2340093-13B-2051941,13B,Simple Assault,2051941,I,14.0,WHITE,ENCINITAS,92024.0,171.07,ProcessingComplete - Valid,False
62330,2345854,EL CAJON,9/27/24 14:15,2345854-13B-2056500,13B,Simple Assault,2056500,I,8.0,MIDDLE EASTERN,EL CAJON,92021.0,168.07,ProcessingComplete - Valid,False
62331,2332703,SAN DIEGO,9/30/24 0:56,2332703-13B-2045943,13B,Simple Assault,2045943,I,59.0,HISPANIC,SAN DIEGO,92113.0,48.00,ProcessingComplete - Valid,True


In [10]:
# Save the current frame as version 3 of the crime data (row index not written).

v3_csv_path = 'v3crimedata.csv'
df.to_csv(v3_csv_path, index=False)

In [11]:
# Download link for version 3 of the data.

def create_download_link(filename):
    """Return an IPython HTML anchor element that downloads `filename`."""
    anchor = f'<a href="{filename}" download>Download {filename}</a>'
    return HTML(anchor)

create_download_link('v3crimedata.csv')

## next steps for 'City' column:

- Add the 'Beat' and 'BCS Area' columns by left-joining on 'Incident UID'.
- Determine which unincorporated areas can be merged into their parent cities, e.g. 'ESCONDIDO UNINC' can be combined with 'ESCONDIDO'.

In [13]:
# v0 is the original data, which still contains the 'Beat' and 'BCS Area' columns;
# v3 is the file written above.

v0, v3 = (pd.read_csv(path) for path in ('v0crimedata.csv', 'v3crimedata.csv'))

In [14]:
# Bring the 'Beat' and 'BCS Area' columns back by left-joining on 'Incident UID'.
# v0 is reduced to one row per incident first so the join cannot add rows to v3.

beat_area_cols = ['Incident UID', 'Beat', 'BCS Area']
v0_subset = v0[beat_area_cols].drop_duplicates(subset='Incident UID')
v4 = pd.merge(v3, v0_subset, how='left', on='Incident UID')

In [15]:
# Row count per City value.
v4['City'].value_counts()

City
SAN DIEGO       24128
CHULA VISTA      4982
OCEANSIDE        4316
ESCONDIDO        3449
EL CAJON         3439
                ...  
GUATAY              5
LA JOLLA            3
TECATE              2
MOUNT LAGUNA        2
OTAY                1
Name: count, Length: 61, dtype: int64

In [16]:
# mapping the unincorporated areas to cities

# (unincorporated-area label, city it is folded into) pairs.
# 'SAN MARCOS UNIN' is spelled without the trailing 'C' on purpose.
city_mapping = dict([
    ('ESCONDIDO UNINC', 'ESCONDIDO'),
    ('LA MESA UNINC', 'LA MESA'),
    ('SAN MARCOS UNIN', 'SAN MARCOS'),
    ('VISTA UNINC', 'VISTA'),
    ('EL CAJON UNINC', 'EL CAJON'),
])

In [17]:
# Fold the unincorporated-area labels into their cities, then re-count.

merged_city = v4['City'].replace(to_replace=city_mapping)
v4['City'] = merged_city
v4['City'].value_counts()

City
SAN DIEGO             24128
CHULA VISTA            4982
OCEANSIDE              4316
EL CAJON               3722
ESCONDIDO              3561
VISTA                  2572
NATIONAL CITY          1823
SPRING VALLEY          1789
SOUTH BAY              1782
SAN MARCOS             1677
LA MESA                1659
CARLSBAD               1491
SANTEE                 1354
LAKESIDE               1008
LEMON GROVE             928
FALLBROOK               831
ENCINITAS               701
IMPERIAL BEACH          671
RAMONA                  550
POWAY                   528
VALLEY CENTER           418
ALPINE                  303
BONITA                  152
PALA                    136
JAMUL                   119
SOLANA BEACH            113
CAMPO                   108
BONSALL                  88
DEL MAR                  79
PAUMA VALLEY             69
JULIAN                   67
RANCHO SANTA FE          60
BORREGO SPRINGS          50
BOULEVARD                44
LINCOLN ACRES            34
PINE VALLEY    

## next cleaning steps for 'City':

- For rows where 'City' is null, see if we can add city names based on the Sheriff BCS Area or the Police Beat.
    - Create dictionaries mapping BCS Areas and Police Beats to city names
    - Zip codes will be null for some of these since we sometimes can't get that specific. City should be the primary column for determining location.

In [19]:
# Rows that still have no City value.
null_city_rows = v4.loc[v4['City'].isna()]

In [20]:
# Distinct 'BCS Area' values among the null-City rows reported by the SHERIFF agency.

sheriff_null_city = null_city_rows.loc[null_city_rows['Agency'] == 'SHERIFF']
bcs_area_values = sheriff_null_city['BCS Area'].unique()
bcs_area_values

array(['DETENTION FACILITY', 'OUT OF COUNTY', 'CITY OF VISTA',
       'NON-CONTRACT CITY', 'ALPINE', 'CITY OF SAN MARCOS',
       'NORTH COASTAL UNINC', 'IMPERIAL BEACH UNINC',
       'CITY OF IMPERIAL BEACH', 'CITY OF ENCINITAS'], dtype=object)

In [21]:
# Distinct 'Beat' values among the null-City rows reported by every agency except SHERIFF.
non_sheriff_null_city = null_city_rows.loc[null_city_rows['Agency'] != 'SHERIFF']
beat_values = non_sheriff_null_city['Beat'].unique()
beat_values

array([625, 999, 523,   5,   8,  99,   3, 324,  20,   2,   7, 327,  16,
       611,   1, 541, 614, 521, 550,  23, 511, 613,   4, 243, 232, 627,
       512, 602,  24, 721, 200, 314,   9,  13])

In [22]:
# Map sheriff BCS Area labels to city names (the values are cities, not zip codes).

bcsarea_city_dict = {
    f'CITY OF {city}': city
    for city in ('VISTA', 'SAN MARCOS', 'IMPERIAL BEACH', 'ENCINITAS')
}
bcsarea_city_dict.update({
    'ALPINE': 'ALPINE',
    'IMPERIAL BEACH UNINC': 'IMPERIAL BEACH',
})

In [23]:
# Fill in missing City values from the row's BCS Area where a mapping exists.
def _city_from_bcs_area(row):
    """Return the row's City, or the BCS Area lookup result when City is null."""
    if pd.isnull(row['City']):
        return bcsarea_city_dict.get(row['BCS Area'])
    return row['City']

v4['City'] = v4.apply(_city_from_bcs_area, axis=1)

In [24]:
# City values still missing after the BCS Area fill (the fill recovered 9 rows).

v4['City'].isna().sum()

166

In [25]:
# Fill missing zip codes from the City column.
# zipcode_city_dict is keyed by zip code, so it cannot be queried with a city
# name (doing so always returns None and fills nothing). Build the inverse
# lookup instead, keeping only cities that have exactly one zip code in the
# table; cities with several zip codes are ambiguous and stay null.
# NOTE(review): a city with a single entry in the table may still span other
# zip codes in reality - confirm this approximation is acceptable.
_zips_per_city = {}
for _zip_code, _city in zipcode_city_dict.items():
    _zips_per_city.setdefault(_city, []).append(_zip_code)
city_zipcode_dict = {
    city: zips[0] for city, zips in _zips_per_city.items() if len(zips) == 1
}

v4['Zip Code'] = v4.apply(
    lambda row: city_zipcode_dict.get(row['City']) if pd.isnull(row['Zip Code']) else row['Zip Code'],
    axis=1
)

In [26]:
# Map police beat numbers to zip codes. Written grouped by zip code and then
# flattened into the beat -> zip dictionary.

_beats_by_zipcode = {
    92110: (625,),
    92101: (523, 541, 521),
    91942: (324,),
    92115: (327,),
    92106: (611,),
    92107: (614, 613),
    92113: (511, 512),
    92145: (243,),
    92128: (232,),
    92103: (627,),
    92129: (602,),
    92154: (721,),
    92119: (314,),
}
beat_zipcode_dict = {
    beat: zip_code
    for zip_code, beats in _beats_by_zipcode.items()
    for beat in beats
}

In [27]:
# Use the police beat to locate rows that still have no City.
# Look up every beat once; beats without an entry come back as null.
beat_zip = v4['Beat'].map(beat_zipcode_dict.get)

# Only overwrite Zip Code where City is unknown AND the beat has a mapping.
# Overwriting every null-City row would erase an existing zip code whenever
# the row's beat is not in beat_zipcode_dict.
# NOTE(review): the beat numbers were collected from non-SHERIFF rows but are
# applied to all agencies here - confirm beat numbers do not collide.
fill_from_beat = v4['City'].isnull() & beat_zip.notnull()
v4.loc[fill_from_beat, 'Zip Code'] = beat_zip[fill_from_beat]

# Derive the still-missing City values from the (possibly updated) zip code.
v4['City'] = v4.apply(
    lambda row: zipcode_city_dict.get(row['Zip Code']) if pd.isnull(row['City']) else row['City'],
    axis=1
)

In [28]:
# City values still missing after the beat-based fill (the fill recovered 24 rows).

v4['City'].isna().sum()

142

In [29]:
# Remove the rows where BCS Area is out of the county.
# Rows with a missing BCS Area are kept, because NaN != 'OUT OF COUNTY' is True.
# .copy() makes v4 an independent frame, so the later v4.loc[...] = ...
# assignments do not write into a slice of the old frame
# (avoids SettingWithCopyWarning).

v4 = v4[v4['BCS Area'] != 'OUT OF COUNTY'].copy()

In [30]:
# Shape of the null-City subset after dropping out-of-county rows (4 fewer rows).

v4.loc[v4['City'].isna()].shape

(138, 17)

In [31]:
# Agencies that reported the rows still missing a City.
v4.loc[v4['City'].isna(), 'Agency'].unique()

array(['NATIONAL CITY', 'SHERIFF', 'SAN DIEGO', 'CARLSBAD', 'OCEANSIDE',
       'LA MESA', 'ESCONDIDO', 'EL CAJON', 'HARBOR POLICE', 'CHULA VISTA'],
      dtype=object)

In [32]:
# Agencies whose name is used directly as the city name (identity mapping).
# Agencies not listed here are left unmapped.
agency_city_dict = {
    agency: agency
    for agency in (
        'NATIONAL CITY',
        'CARLSBAD',
        'OCEANSIDE',
        'LA MESA',
        'ESCONDIDO',
        'EL CAJON',
        'CHULA VISTA',
    )
}

In [33]:
# For rows with no City, take the city from the Agency via agency_city_dict;
# agencies absent from the dictionary leave City null.

missing_city = v4['City'].isnull()
v4.loc[missing_city, 'City'] = v4.loc[missing_city, 'Agency'].map(agency_city_dict)

In [34]:
# City values still missing after the Agency fill (the fill recovered 60 rows).

v4['City'].isna().sum()

78

## last steps for 'City' data:

- look at what patterns we can find to fill in as many 'City' values as we can

In [36]:
# City distribution for detention-facility rows on beat 24; if one city
# dominates, it is used to fill the null rows in the next cell.

beat24_detention = (v4['Beat'] == 24) & (v4['BCS Area'] == 'DETENTION FACILITY')
v4.loc[beat24_detention, 'City'].value_counts()

City
SANTEE       405
EL CAJON       1
SAN DIEGO      1
Name: count, dtype: int64

In [37]:
# Set City to 'SANTEE' where City is null, BCS Area is 'DETENTION FACILITY' and Beat is 24.

city_missing = v4['City'].isnull()
in_detention = v4['BCS Area'] == 'DETENTION FACILITY'
on_beat_24 = v4['Beat'] == 24
v4.loc[city_missing & in_detention & on_beat_24, 'City'] = 'SANTEE'

In [38]:
# Shape of the null-City subset after the beat 24 detention-facility fill (18 rows recovered).

v4.loc[v4['City'].isna()].shape

(60, 17)

In [39]:
# Inspect the rows that still have no City.
v4.loc[v4['City'].isna()]

Unnamed: 0,Incident UID,Agency,Incident Date,CIBRS Unique Offense ID,CIBRS Offense Code,CIBRS Offense Description,Victim UID,Victim Category,Victim Age,Overall Race,City,Zip Code,Census Tract,CIBRS Status,Domestic Violence Incident,Beat,BCS Area
6,772344,SHERIFF,6/22/21 9:00,772344-13B-796205,13B,Simple Assault,796205,I,,OTHER,,,,ProcessingComplete - Valid,False,29,DETENTION FACILITY
7,773065,SAN DIEGO,7/3/21 2:00,773065-13B-796976,13B,Simple Assault,796976,I,18.0,HISPANIC,,,,ProcessingComplete - Valid,True,999,
8,804854,SAN DIEGO,8/13/21 21:00,804854-13B-829961,13B,Simple Assault,829961,I,40.0,WHITE,,,,ProcessingComplete - Valid,True,999,
17,1438067,SHERIFF,10/12/22 9:00,1438067-13A-1306213,13A,Aggravated Assault,1306213,I,,WHITE,,,,ProcessingComplete - Valid,False,29,DETENTION FACILITY
21,1906077,SHERIFF,11/1/23 9:07,1906077-13B-1694149,13B,Simple Assault,1694149,I,48.0,WHITE,,,,ProcessingComplete - Valid,False,29,DETENTION FACILITY
48,1895587,SHERIFF,1/10/21 19:57,1895587-13B-1685125,13B,Simple Assault,1685125,I,0.0,HISPANIC,,,,ProcessingComplete - Valid,False,7,NON-CONTRACT CITY
179,842006,SAN DIEGO,2/1/21 12:00,842006-11D-868919,11D,Forcible Fondling,868919,I,10.0,WHITE,,,,ProcessingComplete - Valid,False,999,
251,1684372,SAN DIEGO,1/1/21 0:01,1684372-11A-1513584,11A,Forcible Rape,1513584,I,20.0,WHITE,,,,ProcessingComplete - Valid,False,999,
375,1684372,SAN DIEGO,1/1/21 0:01,1684372-11C-1513584,11C,Sexual Assault With An Object,1513584,I,20.0,WHITE,,,,ProcessingComplete - Valid,False,999,
4724,796100,SAN DIEGO,4/1/21 0:00,796100-36B-820804,36B,Statutory Rape,820804,I,14.0,BLACK,,,,ProcessingComplete - Valid,False,999,


In [40]:
# City distribution for beat 29, used to justify assigning OCEANSIDE to its null rows.

v4.loc[v4['Beat'] == 29, 'City'].value_counts()

City
OCEANSIDE    34
Name: count, dtype: int64

In [41]:
# City distribution for beat 202, used to justify assigning CAMP PENDLETON to its null rows.

v4.loc[v4['Beat'] == 202, 'City'].value_counts()

City
CAMP PENDLETON    27
VISTA              2
SAN CLEMENTE       1
Name: count, dtype: int64

In [42]:
# Assign a city to null-City rows on beats 29 and 202.

for beat_number, beat_city in ((29, 'OCEANSIDE'), (202, 'CAMP PENDLETON')):
    v4.loc[(v4['City'].isnull()) & (v4['Beat'] == beat_number), 'City'] = beat_city

In [43]:
# City values still missing after the beat 29 / 202 fill (14 rows recovered).

v4['City'].isna().sum()

46

In [44]:
# Inspect the rows that still have no City.
v4.loc[v4['City'].isna()]

Unnamed: 0,Incident UID,Agency,Incident Date,CIBRS Unique Offense ID,CIBRS Offense Code,CIBRS Offense Description,Victim UID,Victim Category,Victim Age,Overall Race,City,Zip Code,Census Tract,CIBRS Status,Domestic Violence Incident,Beat,BCS Area
7,773065,SAN DIEGO,7/3/21 2:00,773065-13B-796976,13B,Simple Assault,796976,I,18.0,HISPANIC,,,,ProcessingComplete - Valid,True,999,
8,804854,SAN DIEGO,8/13/21 21:00,804854-13B-829961,13B,Simple Assault,829961,I,40.0,WHITE,,,,ProcessingComplete - Valid,True,999,
48,1895587,SHERIFF,1/10/21 19:57,1895587-13B-1685125,13B,Simple Assault,1685125,I,0.0,HISPANIC,,,,ProcessingComplete - Valid,False,7,NON-CONTRACT CITY
179,842006,SAN DIEGO,2/1/21 12:00,842006-11D-868919,11D,Forcible Fondling,868919,I,10.0,WHITE,,,,ProcessingComplete - Valid,False,999,
251,1684372,SAN DIEGO,1/1/21 0:01,1684372-11A-1513584,11A,Forcible Rape,1513584,I,20.0,WHITE,,,,ProcessingComplete - Valid,False,999,
375,1684372,SAN DIEGO,1/1/21 0:01,1684372-11C-1513584,11C,Sexual Assault With An Object,1513584,I,20.0,WHITE,,,,ProcessingComplete - Valid,False,999,
4724,796100,SAN DIEGO,4/1/21 0:00,796100-36B-820804,36B,Statutory Rape,820804,I,14.0,BLACK,,,,ProcessingComplete - Valid,False,999,
4883,919885,SAN DIEGO,5/14/21 12:00,919885-11C-950669,11C,Sexual Assault With An Object,950669,I,7.0,BLACK,,,,ProcessingComplete - Valid,False,999,
5687,919885,SAN DIEGO,5/14/21 12:00,919885-11B-950669,11B,Forcible Sodomy,950669,I,7.0,BLACK,,,,ProcessingComplete - Valid,False,999,
5699,1072767,SAN DIEGO,5/1/21 0:01,1072767-11A-1110236,11A,Forcible Rape,1110236,I,18.0,ASIAN,,,,ProcessingComplete - Invalid,False,999,


In [45]:
# City distribution for detention-facility rows on beat 21; a single city
# here supports filling the null rows with it.

beat21_detention = (v4['Beat'] == 21) & (v4['BCS Area'] == 'DETENTION FACILITY')
v4.loc[beat21_detention, 'City'].value_counts()

City
SAN DIEGO    34
Name: count, dtype: int64

In [46]:
# City distribution for detention-facility rows on beat 26; a single city
# here supports filling the null rows with it.

beat26_detention = (v4['Beat'] == 26) & (v4['BCS Area'] == 'DETENTION FACILITY')
v4.loc[beat26_detention, 'City'].value_counts()

City
VISTA    33
Name: count, dtype: int64

In [47]:
# Set City to 'SAN DIEGO' where City is null, BCS Area is 'DETENTION FACILITY' and Beat is 21.

null_detention_beat21 = (
    v4['City'].isnull()
    & (v4['BCS Area'] == 'DETENTION FACILITY')
    & (v4['Beat'] == 21)
)
v4.loc[null_detention_beat21, 'City'] = 'SAN DIEGO'

In [48]:
# Set City to 'VISTA' where City is null, BCS Area is 'DETENTION FACILITY' and Beat is 26.

null_detention_beat26 = (
    v4['City'].isnull()
    & (v4['BCS Area'] == 'DETENTION FACILITY')
    & (v4['Beat'] == 26)
)
v4.loc[null_detention_beat26, 'City'] = 'VISTA'

In [49]:
# City values still missing after the beat 21 and beat 26 detention-facility
# fills (together they recovered 5 rows).

v4['City'].isna().sum()

41

In [50]:
# Rows whose City is still null after all of the fills above.

v4.loc[v4['City'].isna()]

Unnamed: 0,Incident UID,Agency,Incident Date,CIBRS Unique Offense ID,CIBRS Offense Code,CIBRS Offense Description,Victim UID,Victim Category,Victim Age,Overall Race,City,Zip Code,Census Tract,CIBRS Status,Domestic Violence Incident,Beat,BCS Area
7,773065,SAN DIEGO,7/3/21 2:00,773065-13B-796976,13B,Simple Assault,796976,I,18.0,HISPANIC,,,,ProcessingComplete - Valid,True,999,
8,804854,SAN DIEGO,8/13/21 21:00,804854-13B-829961,13B,Simple Assault,829961,I,40.0,WHITE,,,,ProcessingComplete - Valid,True,999,
48,1895587,SHERIFF,1/10/21 19:57,1895587-13B-1685125,13B,Simple Assault,1685125,I,0.0,HISPANIC,,,,ProcessingComplete - Valid,False,7,NON-CONTRACT CITY
179,842006,SAN DIEGO,2/1/21 12:00,842006-11D-868919,11D,Forcible Fondling,868919,I,10.0,WHITE,,,,ProcessingComplete - Valid,False,999,
251,1684372,SAN DIEGO,1/1/21 0:01,1684372-11A-1513584,11A,Forcible Rape,1513584,I,20.0,WHITE,,,,ProcessingComplete - Valid,False,999,
375,1684372,SAN DIEGO,1/1/21 0:01,1684372-11C-1513584,11C,Sexual Assault With An Object,1513584,I,20.0,WHITE,,,,ProcessingComplete - Valid,False,999,
4724,796100,SAN DIEGO,4/1/21 0:00,796100-36B-820804,36B,Statutory Rape,820804,I,14.0,BLACK,,,,ProcessingComplete - Valid,False,999,
4883,919885,SAN DIEGO,5/14/21 12:00,919885-11C-950669,11C,Sexual Assault With An Object,950669,I,7.0,BLACK,,,,ProcessingComplete - Valid,False,999,
5687,919885,SAN DIEGO,5/14/21 12:00,919885-11B-950669,11B,Forcible Sodomy,950669,I,7.0,BLACK,,,,ProcessingComplete - Valid,False,999,
5699,1072767,SAN DIEGO,5/1/21 0:01,1072767-11A-1110236,11A,Forcible Rape,1110236,I,18.0,ASIAN,,,,ProcessingComplete - Invalid,False,999,


In [96]:
# Save the cleaned frame as version 4 of the crime data (row index not written).

v4_csv_path = 'v4crimedata.csv'
v4.to_csv(v4_csv_path, index=False)

In [98]:
# Create a download link for v4 of the data.
# create_download_link is already defined earlier in this notebook (in the
# cell that builds the v3 link), so it is called here rather than redefined.

create_download_link('v4crimedata.csv')