In [1]:
import pandas as pd
from datetime import datetime
import jovian_project.data_cleaning as dc
import folium

import plotly.express as px
from folium.plugins import HeatMap
from sodapy import Socrata
import matplotlib.pyplot as plt
import re
import branca

import numpy as np

### Data Source
The data is from the 'Crime Data from 2020 to Present' Dataset by Los Angeles Police Department from the Los Angeles Open Data Portal. The data can be found [here](https://data.lacity.org/A-Safe-City/Crime-Data-from-2020-to-Present/2nrs-mtv8). It Contains all crimes reported to the LAPD from 2020 to present and it is updated daily. It contains more than 700,000 records and 26 columns. Data has been pulled in JSON format through the SODA API.

We then used the **pandas** library to import this data into a pandas dataframe.

In [2]:
client = Socrata("data.lacity.org", None)
results = client.get("2nrs-mtv8", limit=1000000)



In [3]:
#create a dataframe from the resulting data
crime_df = pd.DataFrame.from_records(results)

In [4]:
crime_df.shape

(968833, 28)

We now copy this dataframe and create a new one with the same data to clean and analyze.


In [5]:
cleaned_crime_df=crime_df.copy()

### Data Exploration and Cleaning
1. Drop redundant columns.
1. Check for the shape of the data and the data types of the columns.
2. Check for missing values in each column and replace them with appropriate values.
3. Check for duplicates and drop them.
4. Check for unique values in each column, Check for outliers and replace them with appropriate values.
5. Check for the correlation between the columns.


##### 1. Check for the shape of the data and the data types of the columns.
- The data has 28 columns and over 725,000 rows.
- The data types of the columns are mostly objects and we will have to change them to appropriate data types for analysis and visualization.
- *time occ* column (time of occurence) has 24 hour military time but it is in string format and the data has missing zeros. We will have to add zeros to the front of the entries that are less than 4 digits. We will also have to change the data type to time objects for easier analysis and visualization.

In [6]:
#check the shape of dataframe
print(cleaned_crime_df.shape)

(968833, 28)


In [7]:
#checking data types of columns
cleaned_crime_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 968833 entries, 0 to 968832
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   dr_no           968833 non-null  object
 1   date_rptd       968833 non-null  object
 2   date_occ        968833 non-null  object
 3   time_occ        968833 non-null  object
 4   area            968833 non-null  object
 5   area_name       968833 non-null  object
 6   rpt_dist_no     968833 non-null  object
 7   part_1_2        968833 non-null  object
 8   crm_cd          968833 non-null  object
 9   crm_cd_desc     968833 non-null  object
 10  mocodes         844158 non-null  object
 11  vict_age        968833 non-null  object
 12  vict_sex        850889 non-null  object
 13  vict_descent    850880 non-null  object
 14  premis_cd       968820 non-null  object
 15  premis_desc     968317 non-null  object
 16  weapon_used_cd  337873 non-null  object
 17  weapon_desc     337873 non-nu

In [8]:
#convert vict_age, crm_cd, area, rpt_dist_no, dr_no  columns to interger
cleaned_crime_df['vict_age']=cleaned_crime_df['vict_age'].astype(int)
cleaned_crime_df['crm_cd']=cleaned_crime_df['crm_cd'].astype(int)
cleaned_crime_df['area']=cleaned_crime_df['area'].astype(int)
cleaned_crime_df['rpt_dist_no']=cleaned_crime_df['rpt_dist_no'].astype(int)
cleaned_crime_df['dr_no']=cleaned_crime_df['dr_no'].astype(int)


In [9]:
#converting longitude and latitude to float
cleaned_crime_df['lon']=cleaned_crime_df['lon'].astype(float)
cleaned_crime_df['lat']=cleaned_crime_df['lat'].astype(float)


In [10]:
#converting date_occ and date_rptd columns to pandas datetime objects
pd.to_datetime(cleaned_crime_df['date_occ'], format='%Y-%m-%dT%H:%M:%S.%f')
cleaned_crime_df['date_occ'] = cleaned_crime_df['date_occ'].astype('datetime64[ns]')

In [11]:
pd.to_datetime(cleaned_crime_df['date_rptd'], format='%Y-%m-%dT%H:%M:%S.%f')
cleaned_crime_df['date_rptd'] = cleaned_crime_df['date_rptd'].astype('datetime64[ns]')

In [12]:
cleaned_crime_df['crm_cd_desc']= cleaned_crime_df['crm_cd_desc'].str.lower()
cleaned_crime_df['crm_cd_desc']= cleaned_crime_df['crm_cd_desc'].str.capitalize()
cleaned_crime_df['crm_cd_desc'].unique()

array(['Battery - simple assault',
       'Sex offender registrant out of compliance',
       'Vandalism - misdeameanor ($399 or under)',
       'Vandalism - felony ($400 & over, all church vandalisms)',
       'Rape, forcible', 'Shoplifting - petty theft ($950 & under)',
       'Other miscellaneous crime',
       'Theft-grand ($950.01 & over)excpt,guns,fowl,livestk,prod',
       'Burglary from vehicle', 'Criminal threats - no weapon displayed',
       'Arson', 'Intimate partner - simple assault',
       'Theft plain - petty ($950 & under)', 'Theft of identity',
       'Robbery', 'Assault with deadly weapon, aggravated assault',
       'Burglary', 'Vehicle - stolen',
       'Theft from motor vehicle - petty ($950 & under)',
       'Brandish weapon', 'Intimate partner - aggravated assault',
       'Bunco, grand theft', 'Theft, person',
       'Battery with sexual contact', 'Bike - stolen',
       'Battery police (simple)',
       'Letters, lewd  -  telephone calls, lewd',
       'Violat

In [13]:
#converting time_occ(time of occurence) column from 24 hour military format with some values missing a few digits to pandas time objects
dc.edit_column(cleaned_crime_df, 'time_occ', dc.clean_military_time)
cleaned_crime_df['time_occ']=pd.to_datetime(cleaned_crime_df['time_occ'], format='%H:%M').dt.time

In [14]:
#check the columns of dataframe
print(cleaned_crime_df.columns)

Index(['dr_no', 'date_rptd', 'date_occ', 'time_occ', 'area', 'area_name',
       'rpt_dist_no', 'part_1_2', 'crm_cd', 'crm_cd_desc', 'mocodes',
       'vict_age', 'vict_sex', 'vict_descent', 'premis_cd', 'premis_desc',
       'weapon_used_cd', 'weapon_desc', 'status', 'status_desc', 'crm_cd_1',
       'location', 'lat', 'lon', 'crm_cd_2', 'cross_street', 'crm_cd_3',
       'crm_cd_4'],
      dtype='object')


##### 1. Drop redundant columns.
- **weapon_used_code** is redundant with weapon_description. Although it may be useful for machine learning purposes, it is not useful for EDA because it is a code and not a description.
- **status_code** is redundant with status_description. Status description is more descriptive, easier to understand and small in size.
- **crm_cd_1** is redundant with crm_cd. By definition, crm_cd_1 is the same as crm_cd. I will drop crm_cd_1.
- **mocodes, crm_cd_2, cross_street** all have missing values but they will not be used in the analysis so they will be dropped.
- **crm_cd_3 and crm_cd_4** are practically empty and will be dropped.

In [15]:
# Dropping the redundant columns
cleaned_crime_df.drop(columns=['weapon_used_cd', 'status', 'crm_cd_1','mocodes','part_1_2','cross_street', 'crm_cd_3', 'crm_cd_4','crm_cd_2'], inplace=True)


##### 2. Check for missing values in each column and replace them with appropriate values.<br>
We find the null values using isna() function and use sum() function to find the total number of null values in each column.

Here is how i dealt with the missing values in each column using Pandas *fillna()* method:
- **weapon_desc** (474450(approx) Missing values) - Even though approx half of the values are null, the known values will provide us good info about weapons used. Null values Replaced with 'UNKNOWN WEAPON/OTHER WEAPON' value already found in the column.
- **vict_sex** (94800(approx) Missing values) - Replaced with 'X' value already found in the column which represents unknown sex.
- **vict_descent** (94800(approx) Missing values) - Replaced with 'unknown' which will represents unknown descent.
- **premis_cd(9 missing values), premis_desc(407 missing values)** are analogous to each other. With exploration of these two columns we find that code 256 means unknown and has null values in the corresponding premis_desc column in the same row. Thus we fill missing values in premis_desc with 'unknown' and fill missing values in premis_cd with 256.

In [16]:
#shows number of null values
cleaned_crime_df.isna().sum()

dr_no                0
date_rptd            0
date_occ             0
time_occ             0
area                 0
area_name            0
rpt_dist_no          0
crm_cd               0
crm_cd_desc          0
vict_age             0
vict_sex        117944
vict_descent    117953
premis_cd           13
premis_desc        516
weapon_desc     630960
status_desc          0
location             0
lat                  0
lon                  0
dtype: int64

In [17]:
cleaned_crime_df['weapon_desc'].fillna('UNKNOWN WEAPON/OTHER WEAPON', inplace=True)
cleaned_crime_df['vict_sex'].fillna('Unknown', inplace=True)
cleaned_crime_df['vict_descent'].fillna('Unknown', inplace=True)
cleaned_crime_df['premis_cd'].fillna('256', inplace=True)
cleaned_crime_df['premis_desc'].fillna('Unknown', inplace=True)


In [18]:
#check for missing values again
cleaned_crime_df.isna().sum()

dr_no           0
date_rptd       0
date_occ        0
time_occ        0
area            0
area_name       0
rpt_dist_no     0
crm_cd          0
crm_cd_desc     0
vict_age        0
vict_sex        0
vict_descent    0
premis_cd       0
premis_desc     0
weapon_desc     0
status_desc     0
location        0
lat             0
lon             0
dtype: int64

##### 3. Check for duplicates
Using *duplicated() and sum()* methods. duplicated method looks for duplicate rows and sum sums up to show us the number of duplicate rows based on all the columns. We find that there are 0 duplicate rows.

In [19]:
cleaned_crime_df.duplicated().sum()

20803

##### 4. Check for outliers
Using *describe(), unique() and value_counts()* methods. We find that there are following outliers in the data:
- **vict_age** - 120 and values less than 0 are outliers. Hence, these rows must be dropped. We found these by using the describe and unique methods on the column. Values equal to 0 are also outliers but they represent a significant number of victims and will cause considerable loss of data in other columns. Since this column is numeric and we will be using it for analysis, we will replace these values with the median of the column.
- **vict_sex** - 'H' is an outlier and 'X' is a missing value. We will replace these with 'Unknown' which is more informative.
- **vict descent** - '-' is an outlier which appears in one row which will be dropped 

In [20]:
cleaned_crime_df.describe()

Unnamed: 0,dr_no,area,rpt_dist_no,crm_cd,vict_age,lat,lon
count,968833.0,968833.0,968833.0,968833.0,968833.0,968833.0,968833.0
mean,209507000.0,10.735398,1119.933727,502.570342,30.334692,33.993965,-118.076692
std,13217020.0,6.093403,609.375151,208.396044,21.595784,1.65187,5.725666
min,817.0,1.0,101.0,110.0,-4.0,0.0,-118.6676
25%,200321800.0,6.0,622.0,330.0,15.0,34.0132,-118.4297
50%,210816200.0,11.0,1143.0,442.0,31.0,34.0587,-118.3222
75%,221008700.0,16.0,1621.0,626.0,45.0,34.1649,-118.2739
max,239916500.0,21.0,2199.0,956.0,120.0,34.3343,0.0


##### Check for outliers in 'vict_age' column

In [21]:

cleaned_crime_df['vict_age'].unique()

array([ 36,  25,   0,  76,  31,  23,  29,  35,  41,  24,  34,  46,  66,
        40,  27,  62,  43,  71,  50,  19,  51,  33,  69,  39,  57,  78,
        52,  38,  55,  44,  18,  54,  22,  28,  42,  56,  67,  37,  60,
        61,  59,  32,  30,  45,  20,  15,  58,  47,  48,  26,  21,  64,
        75,  12,  49,  68,  14,  13,  10,  53,  74,  17,  65,  63,   8,
        16,  72,  70,   9,  90,  85,  81,  79,  94,  73,  11,  80,   5,
        82,   2,  77,  84,  88,  96,  99,   7,  86,  92,   3,  83,  87,
         6,  -1,  89,   4,  93,  98,  91,  95,  97, 120,  -2,  -3,  -4])

In [22]:
# Check how many rows have age greater than 100. Upon checking Only 1 row has age greater than 100 so we can safely drop it
cleaned_crime_df[cleaned_crime_df['vict_age']>100].shape

(1, 19)

In [23]:
# check how many rows have age less than 0. 51 rows have age less than 0 so we can safely drop them
cleaned_crime_df[cleaned_crime_df['vict_age']<0].shape

(66, 19)

In [24]:
# check how many rows have age=0. Upon checking, 176828 rows have age=0. We can't drop them as they are too many. We will just assume that the age is unknown.
cleaned_crime_df[cleaned_crime_df['vict_age']==0].shape

(224746, 19)

In [25]:
# drop rows with age less than 0
cleaned_crime_df.drop(cleaned_crime_df[cleaned_crime_df['vict_age']<0].index, inplace=True)
# drop rows with age greater than 100
cleaned_crime_df.drop(cleaned_crime_df[cleaned_crime_df['vict_age']>100].index, inplace=True)


In [26]:
# check the mean of vict_age column but exclude rows with age=0
mean_age=cleaned_crime_df[cleaned_crime_df['vict_age']!=0]['vict_age'].mean()
mean_age

39.50058062955297

In [27]:
# find out median of vict_age column but exclude rows with age=0
median_age=cleaned_crime_df[cleaned_crime_df['vict_age']!=0]['vict_age'].median()
median_age


37.0

In [28]:
#checking unique values in vict_age column again after cleaning the data
cleaned_crime_df['vict_age'].unique()

# all values seem plausible now

array([36, 25,  0, 76, 31, 23, 29, 35, 41, 24, 34, 46, 66, 40, 27, 62, 43,
       71, 50, 19, 51, 33, 69, 39, 57, 78, 52, 38, 55, 44, 18, 54, 22, 28,
       42, 56, 67, 37, 60, 61, 59, 32, 30, 45, 20, 15, 58, 47, 48, 26, 21,
       64, 75, 12, 49, 68, 14, 13, 10, 53, 74, 17, 65, 63,  8, 16, 72, 70,
        9, 90, 85, 81, 79, 94, 73, 11, 80,  5, 82,  2, 77, 84, 88, 96, 99,
        7, 86, 92,  3, 83, 87,  6, 89,  4, 93, 98, 91, 95, 97])

##### Checking for outliers in 'vict_sex' column

In [29]:
cleaned_crime_df['vict_sex'].value_counts()

M          407628
F          364163
Unknown    117943
X           78943
H              88
N               1
Name: vict_sex, dtype: int64

In [30]:
# replace 'X' and 'H' with 'Unknown' in vict sex column
cleaned_crime_df['vict_sex'].replace(['X', 'H','N'], 'Unknown', inplace=True)


In [31]:
cleaned_crime_df.columns

Index(['dr_no', 'date_rptd', 'date_occ', 'time_occ', 'area', 'area_name',
       'rpt_dist_no', 'crm_cd', 'crm_cd_desc', 'vict_age', 'vict_sex',
       'vict_descent', 'premis_cd', 'premis_desc', 'weapon_desc',
       'status_desc', 'location', 'lat', 'lon'],
      dtype='object')

##### Checking for outliers in 'vict_descent' column

In [32]:
cleaned_crime_df['vict_descent'].value_counts()

H          302663
W          202583
B          140979
Unknown    117952
X           88069
O           79019
A           22103
K            4819
F            3492
C            3144
J            1187
I             874
V             834
Z             428
P             234
U             175
G              60
D              54
L              52
S              43
-               2
Name: vict_descent, dtype: int64

In [33]:
# drop the row with '-' in vict_descent column
cleaned_crime_df.drop(cleaned_crime_df[cleaned_crime_df['vict_descent']=='-'].index, inplace=True)

While checking for outliers i discovered that the descent of the victims is denoted by single letters which is not very informative. I will replace these with full names of the descents.

In [34]:
#make a dictionary of Descent Codes and Descent Description
descent_dict = {'A': 'Other Asian', 'B': 'Black', 
                'C': 'Chinese', 'D': 'Cambodian', 
                'F': 'Filipino', 'G': 'Guamanian', 
                'H': 'Hispanic/Latin/Mexican', 
                'I': 'American Indian/Alaskan Native', 
                'J': 'Japanese', 'K': 'Korean', 'L': 'Laotian', 
                'O': 'Other', 'P': 'Pacific Islander', 
                'S': 'Samoan', 'U': 'Hawaiian', 'V': 'Vietnamese', 
                'W': 'White', 'X': 'Unknown', 
                'Z': 'Asian Indian'}
# replace the Descent Codes with Descent Description
cleaned_crime_df['vict_descent'].replace(descent_dict, inplace=True)

##### 5. Check for the correlation between the columns.
- Checked if date_occ > date_rptd because a crime cannot be reported before it occurs. Fortunately, there are no such rows.
- Checked for correlation between numeric columns using the corr() method. We find that there is no strong correlation between the columns.

In [35]:
# check if there are any rows where date_occ is greater than date_rptd which is not possible since a crime can't be reported before it occured. Fortunately there are no such rows
cleaned_crime_df[cleaned_crime_df['date_occ']>cleaned_crime_df['date_rptd']].shape

(0, 19)

In [36]:
#check for correlation between numeric columns in the cleaned dataframe.
cleaned_crime_df.corr()

  cleaned_crime_df.corr()


Unnamed: 0,dr_no,area,rpt_dist_no,crm_cd,vict_age,lat,lon
dr_no,1.0,0.034972,0.035116,-0.03137,-0.02714,0.007666,-0.006881
area,0.034972,1.0,0.999045,-0.000244,0.014487,0.024352,-0.010051
rpt_dist_no,0.035116,0.999045,1.0,-0.000316,0.014512,0.024112,-0.009984
crm_cd,-0.03137,-0.000244,-0.000316,1.0,-0.010965,-0.038307,0.038508
vict_age,-0.02714,0.014487,0.014512,-0.010965,1.0,0.002285,-0.000914
lat,0.007666,0.024352,0.024112,-0.038307,0.002285,1.0,-0.998278
lon,-0.006881,-0.010051,-0.009984,0.038508,-0.000914,-0.998278,1.0


### Data Analysis and Visualization

In [37]:

crime_victim_sex_df = cleaned_crime_df.groupby('vict_sex').count()['dr_no'].reset_index()

#exclude unknown values from crime_victim_sex_df
crime_victim_sex_df.drop(crime_victim_sex_df[crime_victim_sex_df['vict_sex']=='Unknown'].index, inplace=True)

In [38]:
ethnic_victim_df = cleaned_crime_df.groupby('vict_descent').count()['dr_no'].reset_index()
ethnic_victim_df.drop(ethnic_victim_df[ethnic_victim_df['vict_descent']=='Unknown'].index, inplace=True)
ethnic_victim_df

Unnamed: 0,vict_descent,dr_no
0,American Indian/Alaskan Native,874
1,Asian Indian,428
2,Black,140979
3,Cambodian,54
4,Chinese,3144
5,Filipino,3492
6,Guamanian,60
7,Hawaiian,175
8,Hispanic/Latin/Mexican,302663
9,Japanese,1187


In [39]:
# create a dataframe from cleaned_crime_df grouped by crm_cd_desc and median of vict_age
age_area_df = cleaned_crime_df.groupby('crm_cd_desc').median()['vict_age'].reset_index()
age_area_df

  age_area_df = cleaned_crime_df.groupby('crm_cd_desc').median()['vict_age'].reset_index()


Unnamed: 0,crm_cd_desc,vict_age
0,Abortion/illegal,40.0
1,Arson,27.0
2,Assault with deadly weapon on police officer,0.0
3,"Assault with deadly weapon, aggravated assault",35.0
4,Attempted robbery,34.0
...,...,...
136,"Vehicle, stolen - other (motorized scooters, b...",19.0
137,Violation of court order,39.0
138,Violation of restraining order,40.0
139,Violation of temporary restraining order,40.0


In [40]:
#drop rows with age=0 using .loc 
age_area_df= age_area_df[age_area_df.vict_age != 0]

In [41]:
age_area_df

Unnamed: 0,crm_cd_desc,vict_age
0,Abortion/illegal,40.0
1,Arson,27.0
3,"Assault with deadly weapon, aggravated assault",35.0
4,Attempted robbery,34.0
5,Battery - simple assault,38.0
...,...,...
134,Vehicle - attempt stolen,40.0
136,"Vehicle, stolen - other (motorized scooters, b...",19.0
137,Violation of court order,39.0
138,Violation of restraining order,40.0


In [42]:
# Do not delete this cell. put it into data cleaning

cleaned_crime_df['month'] = cleaned_crime_df['date_occ'].dt.month_name()
cleaned_crime_df['month_num'] = cleaned_crime_df['date_occ'].dt.month
cleaned_crime_df['year'] = cleaned_crime_df['date_occ'].dt.year

In [43]:
cleaned_crime_df['year'].value_counts()

2022    254245
2019    212606
2021    209082
2020    199190
2023     93641
Name: year, dtype: int64

#### Question: Which Crimes are Most Common?

In [46]:
crime_desc_df = cleaned_crime_df[['crm_cd_desc','crm_cd', 'dr_no']].groupby('crm_cd_desc').count().reset_index()

#### Question 1: What is the Geospatial Distribution of Crimes in LA?


We will use the **folium** library to visualize the data to get some insight about about the spatial distribution of Crimes . We will use the **HeatMap** function to create a heatmap of the crimes in LA. We will use the **CircleMarker** function to create a circle marker for each crime in LA. We will use the **Choropleth** function to create a choropleth map of the crimes in LA.

First we will create a Choropleth map of the crimes in LA. We will use the **geojson** file of the LA neighborhoods to create the map. Since this crime data has been created by LAPD(Los Angeles Police Department) we will use the GeoJson data from LA's [open data portal](https://geohub.lacity.org/datasets/lahub::lapd-reporting-district/about) to plot LAPD's Reporting districts and the link them with rpt_dist_no column in the crime data.

First we create a new dataframe, from cleaned dataframe, grouped by rpt_dist_no and the crime count in each reporting district.

In [47]:
crime_choropleth_df = cleaned_crime_df[['rpt_dist_no', 'dr_no']].groupby(['rpt_dist_no']).count()['dr_no'].reset_index()

In [48]:
# Link to LAPD Reporting Districts GeoJSON
la_geojson_url = 'https://services5.arcgis.com/7nsPwEMP38bSkCjy/arcgis/rest/services/LAPD_Reporting_District/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson'

In [49]:
los_angeles_map_choropleth = folium.Map(location=[34.0522, -118.2437], zoom_start=12, tiles="cartodb positron")

In [50]:
folium.Choropleth(
    geo_data=la_geojson_url,
    name="choropleth",
    data=crime_choropleth_df,
    columns=["rpt_dist_no", 'dr_no'],
    key_on="feature.properties.REPDIST",
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=1,
    legend_name="Crime Number",
    threshold_scale=[1,100, 500, 1000, 2000, 3_000, 4000, 5000, 6000],
    smooth_factor=0,
    Highlight= True,
    line_weight = 0.5,
    overlay=True,
    ).add_to(los_angeles_map_choropleth)

folium.LayerControl().add_to(los_angeles_map_choropleth)

<folium.map.LayerControl at 0x7fcc66c7d990>

In [51]:
los_angeles_map_choropleth

We get some idea of distribution of crimes in LA's districts but it is not very clear. We will Create CircleMarkers on the map with more than 500 Crimes to get a better idea of the hotspots of crimes in LA.

In [52]:
area_dist_df = cleaned_crime_df[['area_name', 'rpt_dist_no', 'dr_no', 'lat', 'lon']].groupby(['area_name', 'rpt_dist_no']).agg({'dr_no':'count', 'lat':'median', 'lon':'median'}).reset_index()
area_dist_df.sort_values(by=['area_name','dr_no'], ascending=False, inplace=True)

In [53]:
area_dist_df2 =area_dist_df[area_dist_df['dr_no']>500]


We took median of the latitude and longitude of the crimes in each reporting district and created a new dataframe with the median latitude and longitude of each reporting district. We then created a new dataframe with the reporting districts with more than 500 crimes. 

In [54]:
los_angeles_map_circlemarker = folium.Map(location=[34.0522, -118.2437], zoom_start=12, tiles="CartoDB dark_matter")

In [55]:
# iterate over all rows of area_dist_df and get lat, lon, count of crimes, area name and district number for each row
for lt, lo, cnt, area, dist in zip(area_dist_df['lat'], area_dist_df['lon'], area_dist_df['dr_no'],area_dist_df['area_name'],area_dist_df['rpt_dist_no']):
    if cnt>1000:
        color="#FF5733"
    else:
        color="#2874A6"
    popup_text = """Area Name:{}<br>
                District:{}<br>
                Criminal Incidents:{}<br>"""
    popup_text = popup_text.format(area, dist, cnt)
    folium.CircleMarker(location=[lt, lo],popup= popup_text, radius=cnt/100, color=color, fill=True).add_to(los_angeles_map_circlemarker)

Orange markers indicate the districts with more than 1000 crimes and blue markers indicate the districts with more than 500 crimes but less than 1000. 

In [56]:
los_angeles_map_circlemarker

For all Crime comparison Visualization by Areas and their Districts we create a SunBurst chart. This chart will give us visualization of crime rate in various areas and districts which are interactive and clickable

In [57]:
fig = px.sunburst(area_dist_df,
                  title='Crime by Area and District',
                 path=['area_name', 'rpt_dist_no'],
                 
                 values='dr_no',
                 color='dr_no',
                 labels={ 'area_name':'Area', 'rpt_dist_no':'District',
                          'dr_no':'Number of Crimes', 'dr_no_sum':'Crime Count', 'parent':'Parent', 'labels':'Area/District', 'id':'ID'  },
                 hover_data={'dr_no':False},
                 height=900,
                 template='plotly_dark',
                 color_continuous_scale=px.colors.sequential.solar,
                 branchvalues='total',) 
                
fig.show()

#### Insights from the 3 Visualizations:
- Central and Hollywood Areas have the most active Crime Districts.
- Pacific, South East and 77th street follow behind closely
- Central's district 162 and Hollywood's district 645 have the most crimes.
- Devonshire, Hollenbeck, Foothill, Northeast, West LA areas enjoy low crime rates.

#### Question 2: What are the most common crimes in Los Angeles?

In [58]:
crime_desc_df = cleaned_crime_df[['crm_cd_desc','crm_cd', 'dr_no']].groupby('crm_cd_desc').count().reset_index()

In [59]:
fig = px.pie(crime_desc_df, values='dr_no', 
             names='crm_cd_desc', 
             color_discrete_sequence=px.colors.sequential.RdBu,
             template='plotly_dark', 
             title='Crime Type Distribution in Los Angeles',
             height=900,
             labels={ 'crm_cd_desc':'Crime ',
                    'dr_no':'Number of Crimes ', 
                    },
                 )
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

#### Insights:
- Stolen Vehicles, Simple Assaults, Burglaries , $400 and over Vandalisms and Church Vandalisms and Theft of identity are the most common crimes in LA.
- Together these account for 43 percent of all crimes in LA.
- Petty thefts, lewd acts with a child and dishonest employees are the least common crimes in LA.

#### Question 3: What about Violent Crimes?
We found a pdf file on the LAPD website which lists the violent crimes. We will use **findall** method to find these in our cleaned dataframe and then mark them by creating another column called **Violent/Non-Violent**. We will then use the **value_counts()** method to find the number of violent crimes in LA.

In [60]:
violent_crimes = ['Homicide', 'Manslaughter', 'Rape', 
                  'Sexual Penetration', 'Oral Copulation', 'Sodomy', 
                  'Robbery', 'Assault', 'Child Abuse', 'Criminal Threats', 
                  'Stalking','Battery', 'Lynching', 'Brandishing', 'Train Wrecking', 'Resisting Arrest']

In [61]:
cleaned_crime_df['Violent/Non-Violent'] = cleaned_crime_df['crm_cd_desc'].str.findall('|'.join(violent_crimes), flags=re.IGNORECASE).apply(lambda x: 'Violent' if len(x)>0 else 'Non-Violent')

Calculate percentage of violent crimes out of total crimes

In [62]:
violent_value_count = cleaned_crime_df['Violent/Non-Violent'].value_counts()
violent_percent = (violent_value_count[1]/(violent_value_count[0]+violent_value_count[1]))*100
violent_percent

29.209590777526827

In [63]:
violent_crimes_df = cleaned_crime_df[cleaned_crime_df['Violent/Non-Violent']=='Violent']

In [64]:
violent_crimes_bar_df=violent_crimes_df.groupby('crm_cd_desc').count()['dr_no'].reset_index()
violent_crimes_bar_df.sort_values(by=['dr_no'], inplace=True)


In [132]:
fig = px.bar(violent_crimes_bar_df[-15:], x='dr_no', y='crm_cd_desc', text_auto='.2s',
            title="Top 15 Violent Crimes", height = 900, color='dr_no',
            labels={ 'crm_cd_desc':'Crime', 
                          'dr_no':'Number of Crimes',  },
                 
            color_continuous_scale=px.colors.sequential.Rainbow, 
            template='plotly_dark')
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

#### Insights:
- Simple Assault, Aggravated Assault, Robbery and Criminal threats are the most common violent crimes in LA.
- Assaults against Intimate partner feature in top 6 separately from other assaults.
- Together these account for most of all violent crimes in LA.
- Approximately 29 percent of all crimes in LA are violent crimes.


#### Question 4: Which weapons are being used in Violent Crimes

In [66]:
weapon_violent_crime_df = violent_crimes_df[['weapon_desc', 'crm_cd_desc', 'dr_no']].groupby(['weapon_desc', 'crm_cd_desc']).count()['dr_no'].reset_index()
weapon_violent_crime_df=weapon_violent_crime_df[weapon_violent_crime_df['weapon_desc']!='UNKNOWN WEAPON/OTHER WEAPON']
weapon_violent_crime_df.sort_values(by=['dr_no'], inplace=True)

In [131]:
fig = px.bar(weapon_violent_crime_df[-30:], y='dr_no', x='weapon_desc', text_auto='.2s',
            title="Top Weapons used in Violent Crimes and Crime Types conducted with them", height = 900, color='crm_cd_desc',
            labels={ 'crm_cd_desc':'Crime', 'weapon_desc':'Weapon',
                          'dr_no':'Number of Crimes' },
            color_discrete_sequence=px.colors.qualitative.Alphabet, template='plotly_dark')

fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)

fig.show()

#### Insights:
- Most of the violent crimes are committed using personal weapons like hands, feet, fists, etc.
- Firearms are 2nd most common weapons used in violent crimes.
- Robberies are committed using firearms as well as using personal weapons and Verbal Threats.
- Crimes against women such as *Assaults against Intimate Partners* and *Rapes* are mostly committed using personal weapons. Thus Pepper sprays are very effective to protect against such crimes.


#### Question 5: Where are most of the crimes committed?

In [68]:
premisis_crime_df = cleaned_crime_df[['premis_desc', 'crm_cd_desc', 'dr_no']].groupby(['premis_desc']).count()['dr_no'].reset_index()
premisis_crime_df.sort_values(by=['dr_no'], inplace=True)



In [118]:
fig = px.bar(premisis_crime_df[-15:], y='premis_desc', x='dr_no', text_auto='.2s',
            title="Top Crime Premises", height = 900,
            labels={ 'premis_desc':'Location', 
                          'dr_no':'Number of Crimes' },
            color='dr_no', color_continuous_scale=px.colors.sequential.Rainbow, 
            template='plotly_dark')
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

#### Insights:
- Streets are the top location for crimes in LA.
- Residences, Parking Lots, Sidewalks, Businesses are the next most common locations.


#### Question 6: Which Crimes are most common in top 6 locations?

In [70]:
premisis_crime_df2 = cleaned_crime_df[['premis_desc', 'crm_cd_desc', 'dr_no']].groupby(['premis_desc', 'crm_cd_desc']).count()['dr_no'].reset_index()
premisis_crime_df2.sort_values(by=['dr_no'], inplace=True)

premisis_crime_df2 = premisis_crime_df2[premisis_crime_df2['premis_desc'].isin(premisis_crime_df['premis_desc'][-6:].values[:])]


In [116]:
fig = px.sunburst(premisis_crime_df2, 
                 path=['premis_desc', 'crm_cd_desc'], 
                 values='dr_no',
                 color='dr_no',
                 height=900,
                 template='plotly_dark',
                 title='Top Crime Premises and Crime Types conducted in them',
                 color_continuous_scale=px.colors.sequential.Turbo) 
                
fig.show()

#### Insights:
- Stolen vehicles, Burglaries from vehicles, Theft and Aggravated assaults are the most common Street crimes in LA.
- Theft of Identity is surprisingly the most common crime in Residences(single family and apartments).
- Burglaries from vehicles, Stolen vehicles and Thefts are the most common crimes in Parking Lots.
- Unsurprisingly, Thefts and Burglaries are the most common crimes in Sidewalks and Businesses.


#### Question 7: Are there any particular times of the day at which crimes are committed?

In [72]:
time_crime_df = cleaned_crime_df[['time_occ', 'crm_cd_desc', 'dr_no']].groupby(['time_occ', 'crm_cd_desc']).count()['dr_no'].reset_index()
time_crime_df2 = time_crime_df.groupby('time_occ')['dr_no'].sum().reset_index()
time_bins =[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20 ,22, 24]
time_labels = ['12am-2am', '2am-4am', '4am-6am', '6am-8am', '8am-10am', '10am-12pm', '12pm-2pm', '2pm-4pm', '4pm-6pm', '6pm-8pm', '8pm-10pm', '10pm-12am']  
time_crime_df2['time_occ'] = pd.cut(time_crime_df2['time_occ'].apply(lambda x: x.hour), bins=time_bins,labels=time_labels, right=False)
time_crime_df2 = time_crime_df2.groupby('time_occ')['dr_no'].sum().reset_index()



In [117]:
fig = px.bar(time_crime_df2,title='Crime Numbers in Various times of the day', 
             x='time_occ', y="dr_no", color='dr_no',
             labels={ 'time_occ':'Time Interval',
                          'dr_no':'Number of Crimes' }, 
             color_continuous_scale=px.colors.sequential.Rainbow, 
             template='plotly_dark')
fig.show()

#### Insights:
- Crimes are lowest between 4 am and 6 am.
- After 6 am, crimes start increasing and reach a peak at 12-2 pm.
- Crimes then start decreasing and reach a low at between 2pm and 4pm.
- Crimes then start increasing again and reach a peak at 6-8 pm.
- Crimes then start decreasing and reach their lowest at around 4am.
- Cause of lowest crimes at 4am could be that most people are asleep at that time.
- 12pm to 10 pm is the time when most crimes are committed.

#### Question 8: Which Sex at which age is more likely to be a victim of a crime?

In [74]:
sa_crm_df = cleaned_crime_df.groupby(['vict_sex', 'vict_age']).count()['dr_no'].reset_index()
sa_crm_df = sa_crm_df[sa_crm_df['vict_age']!=0]
sa_crm_df = sa_crm_df[sa_crm_df['vict_sex']!='Unknown']
sa_crm_df.rename(columns={'dr_no':'Victim Number', 'vict_sex':'Victim Sex', 'vict_age':'Victim Age'}, inplace=True)

In [85]:
fig = px.scatter(sa_crm_df, x="Victim Age", y="Victim Number", color="Victim Sex",
                 size='Victim Number',
                  hover_data=['Victim Number'],
                  height=800,
                  title='Scatter Plot of Victim Age and Number of Victims, separated by Sex',
                  template='simple_white',
                  color_discrete_sequence=['Red', 'Black'])
fig.show()

#### Insights:
- Men and Women are mostly equally likely to be victims of a crime.
- Both sexes of ages 23-35 are most likely to be victims of a crime.
- Younger women below the age of 30 are more likely to be the victim of a crime than younger males. This trend is especially visible in ages 19-29.
- Older males above the age of 30 are more likely to be the victim of a crime than older women. This trend is especially visible in ages 38-65.
- Children below the age of 10 are least likely to be the victim of a crime.
- Males of age 35 and 50 are unusually high victims of crimes

#### Question 9: Elaborate on the crimes suffered by juvenile victims.

In [78]:
juvenile_crime_df = cleaned_crime_df[cleaned_crime_df['vict_age']<=18]

In [79]:
juvenile_crime_scatter_df = juvenile_crime_df.groupby(['vict_age', 'crm_cd_desc']).count()['dr_no'].reset_index()
juvenile_crime_scatter_df = juvenile_crime_scatter_df[juvenile_crime_scatter_df['vict_age']!=0]

In [87]:
fig = px.scatter(juvenile_crime_scatter_df, x="vict_age", y="dr_no", color="crm_cd_desc",
                  size='dr_no',
                  labels={ 'crm_cd_desc':'Crime', 'vict_age':"Victim's Age",
                          'dr_no':'Number of Crimes' },
                  title='Scatter Plot of Victims less than 19 years and Number of Crimes, Color Coded by Crime Type',
                  template='simple_white',
                  hover_data=['crm_cd_desc'],
                  
                  color_discrete_sequence=px.colors.qualitative.Dark24 , height=800)
fig.show()

#### Insights:
- Child Neglect and Child Abuse are the most common crimes suffered by juvenile victims of age 0-3
- Child Abuse and Crimes by suspects 10 years or older are the most common crimes suffered by juvenile victims of age 4-11
- Simple assaults skyrocket in the age group 12-18, Aggravated assaults become high in age group 14-18 both of which are Violent Crimes.
- Robberies also become high in 14-18 age group.

#### Question 10: Which Ethnic Groups are most affected by Crimes?

In [81]:
sd_crm_df = cleaned_crime_df[['vict_descent', 'vict_age', 'dr_no']].groupby(['vict_descent', 'vict_age']).count()['dr_no'].reset_index()
sd_crm_df = sd_crm_df[sd_crm_df['vict_age']!=0]
sd_crm_df = sd_crm_df[sd_crm_df['vict_descent']!='Unknown']
sd_crm_df.rename(columns={'dr_no':'Victim Number', 'vict_age':'Victim Age', 'vict_descent':'Victim Descent'}, inplace=True)

In [88]:
fig = px.scatter(sd_crm_df, x="Victim Age", y="Victim Number", color="Victim Descent",
                  hover_data=['Victim Number'],
                  template='plotly_dark',
                  title='Scatter Plot of Victim Age and Number of Victims, separated by Descent',
                  color_discrete_sequence= px.colors.qualitative.Dark24, height=800 )


fig.show()

#### Insights:
- Hispanic/Latin/Mexican Ethinicities are most affected by crimes. They are 48.5 percent of the population but 56.5 percent of the victims of crimes.
- Older White people, over the age of 58, are more affected than other ethinicities of the same age.
- Younger Black people, below the age of 20, are more affected than white people of the same age despite being only 9.8 percent of the population against 29.4 percent for White people.
- Black people are generally more affected by crimes than other ethinicities as compared to their population percentage.
- Asian people are least affected by crimes despite being 9.8 percent of the population.

#### Questions 11: Which Crimes are encountered by each ethnic group?

In [83]:
descent_crm_df = cleaned_crime_df[['vict_descent', 'crm_cd_desc', 'dr_no']].groupby(['vict_descent', 'crm_cd_desc']).count()['dr_no'].reset_index()
descent_crm_df = descent_crm_df[descent_crm_df['vict_descent']!='Unknown']
descent_crm_df.rename(columns={'dr_no':'Victim Number', 'crm_cd_desc':'Crime', 'vict_descent':'Victim Descent'}, inplace=True)

In [90]:
fig = px.sunburst(descent_crm_df, 
                 path=['Victim Descent','Crime'], 
                 values='Victim Number',
                 labels={
                          'dr_no':'Number of Crimes', 'parent':'Parent', 'labels':'Ethnicity/Crime', 'id':'ID'  },
                 hover_data={},
                 title='Sunburst Plot of Victim Descent and Crime Types they suffered',
                 height=800,
                 color='Victim Number',
                 template='plotly_dark',
                 color_continuous_scale=px.colors.sequential.Turbo) 
                
fig.show()

#### Insights:
- Hispanic/Latin/Mexican and Black ethnicities are most affected by Assaults(simple and aggravated). It accounts for more than 1/4th of all crimes against them.
- White, Asians and Other Ethnicities on the other hand encounter more burglaries and thefts(Petty and Theft of Identity)
- Hispanic/Latin/Mexican and Black ethnicities suffer more violent crimes than any other ethnicities.
- Black and Asian(combined) have similar population percentages but Blacks suffer significantly more crimes than Asians. The nature of Crimes are also more serious.

#### Question 10: How have the crime numbers changed over the years?

We will use a line plot to compare crime numbers over each month of the each year.

**Issues Encountered**:
- Current month's data is incomplete because this data is updated every week. Thus Current month shows a big dip in crime numbers 
- We have year and month numbers in our dataframe we created for this line chart
- we take the month number and multiply it with 10,000 and then add year's number e.g. if year is 2023 and month is 3, it comes up as 32023 which is 3 and 2023 combined. This we compare to the current year and month's combination and only take those rows into our Dataframe which do not have the current year and month's combination.

In [111]:
line_chart_MF_df = cleaned_crime_df.groupby(['year', 'month', 'month_num']).count()['dr_no'].reset_index()
line_chart_MF_df.sort_values(by=[ 'month_num'], inplace=True)

# exclude current month of the current year because the data is not yet complete for the current month
line_chart_MF_df = line_chart_MF_df[line_chart_MF_df['year']+ (line_chart_MF_df['month_num'])*10000!=(datetime.now().month)*10000+datetime.now().year]

In [110]:
# Line charts comparing crime victims by month and year
fig = px.line(line_chart_MF_df, x="month", y="dr_no", color='year', 
              title='Crime by Month and Year',
              labels={ 'year':'Year', 'month':'Month of Year',
                          'dr_no':'Number of Crimes' },
              
              markers=True, 
              template='plotly_dark',
              color_discrete_sequence=px.colors.sequential.Rainbow_r)

             
fig.show()

#### Insights:
- 2019 had a normal crime rate until october from where it dipped.
- 2020 jan had higher crime rate but it dipped possibly because of covid restrictions and remained lowest from march to the rest of the year.
- 2021 started with low crime rate but it increased steadily, possibly because of lifting of covid restrictions. In september it surpassed all the previous years and reached a peak in october.
- 2022 recorded the highest crime rate of all the years this data has been recorded for in LA despite Covid pandemic still not over.
- December 2022 recorded the highest crime rate of all the months in all the years.
- 2023 Started off high in January which is usual for the month in all years but it normalised later on.
- There is usually a dip in crimes in February and Uptick in crimes in October every year. 