In [1]:
import pandas as pd
from datetime import datetime
import jovian_project.data_cleaning as dc
from sodapy import Socrata

In [4]:
# Pull the Los Angeles crime dataset ("2nrs-mtv8") from data.lacity.org
# through the Socrata API. No app token is supplied (app_token=None).
client = Socrata("data.lacity.org", app_token=None)
# Ask for up to one million records in a single request.
results = client.get("2nrs-mtv8", limit=1_000_000)




In [17]:
# Turn the records returned by the API call into the raw DataFrame.
crime_df = pd.DataFrame.from_records(data=results)

In [18]:
# Work on an independent (deep) copy so the raw crime_df stays untouched.
cleaned_crime_df = crime_df.copy(deep=True)

In [19]:
# Run the project helper dc.clean_military_time over the 'time_occ' column
# using dc.edit_column.
# NOTE(review): the return value is discarded, so edit_column presumably
# mutates cleaned_crime_df in place, and the following cell parses time_occ
# with '%H:%M' -- confirm both against jovian_project.data_cleaning.
dc.edit_column(cleaned_crime_df, 'time_occ', dc.clean_military_time)

In [20]:
# Parse the 'HH:MM' strings in time_occ and keep only the time-of-day part
# (datetime.time objects), discarding the placeholder date.
cleaned_crime_df["time_occ"] = (
    pd.to_datetime(cleaned_crime_df["time_occ"], format='%H:%M')
    .dt
    .time
)

In [23]:
# Drop columns that are not needed for the analysis:
#   - weapon_used_cd, status, crm_cd_1: redundant codes (the notebook's stated reason)
#   - mocodes, part_1_2: also removed here (the notebook gives no reason for these two)
# The result is reassigned instead of using inplace=True, so the statement
# reads as "new frame from old frame" and can be chained with other steps.
columns_to_drop = ['weapon_used_cd', 'status', 'crm_cd_1', 'mocodes', 'part_1_2']
cleaned_crime_df = cleaned_crime_df.drop(columns=columns_to_drop)


In [24]:
# Fill missing values with the marker each column uses for "unknown":
#   weapon_desc  -> 'UNKNOWN WEAPON/OTHER WEAPON' (a value already present in that column)
#   vict_sex     -> 'X' (represents unknown)
#   vict_descent -> 'Unknown'
#
# Each column is assigned back explicitly. Calling
# Series.fillna(..., inplace=True) on `frame[col]` is chained assignment
# through an inplace method: pandas 2.x warns about it, and with
# Copy-on-Write it does not update the parent DataFrame.
missing_value_fill = {
    'weapon_desc': 'UNKNOWN WEAPON/OTHER WEAPON',
    'vict_sex': 'X',
    'vict_descent': 'Unknown',
}
for column, fill_value in missing_value_fill.items():
    cleaned_crime_df[column] = cleaned_crime_df[column].fillna(fill_value)


In [25]:
# The victim's descent is stored as a single-letter code, which is not very
# intuitive, so the codes are replaced with their descriptions.
# Mapping of descent code -> descent description.
descent_dict = {'A': 'Other Asian', 'B': 'Black',
                'C': 'Chinese', 'D': 'Cambodian',
                'F': 'Filipino', 'G': 'Guamanian',
                'H': 'Hispanic/Latin/Mexican',
                'I': 'American Indian/Alaskan Native',
                'J': 'Japanese', 'K': 'Korean', 'L': 'Laotian',
                'O': 'Other', 'P': 'Pacific Islander',
                'S': 'Samoan', 'U': 'Hawaiian', 'V': 'Vietnamese',
                'W': 'White', 'X': 'Unknown',
                'Z': 'Asian Indian'}
# Replace the codes with their descriptions. Values that are not keys of
# descent_dict (e.g. an already filled-in 'Unknown') are left as they are.
# The column is assigned back explicitly: Series.replace(..., inplace=True)
# on `frame[col]` is chained assignment through an inplace method, which
# warns in pandas 2.x and does not update the parent frame under Copy-on-Write.
cleaned_crime_df['vict_descent'] = cleaned_crime_df['vict_descent'].replace(descent_dict)

In [26]:
# Parse date_occ and date_rptd as datetimes and keep only the calendar date;
# the time component is dropped because it was the same for all rows.
for date_col in ('date_occ', 'date_rptd'):
    parsed_dates = pd.to_datetime(cleaned_crime_df[date_col], format='%Y-%m-%dT%H:%M:%S.%f')
    cleaned_crime_df[date_col] = parsed_dates.dt.date

In [32]:

# Inspect the distinct victim-age values. The saved output below shows they
# are strings and include implausible entries such as '0', '-1', '-2' and '120'.
cleaned_crime_df['vict_age'].unique()

array(['36', '25', '0', '76', '31', '23', '29', '35', '41', '24', '34',
       '46', '66', '40', '27', '62', '43', '71', '50', '19', '51', '33',
       '69', '39', '57', '78', '52', '38', '55', '44', '18', '54', '22',
       '28', '42', '56', '67', '37', '60', '61', '59', '32', '30', '45',
       '20', '15', '58', '47', '48', '26', '21', '64', '75', '12', '49',
       '68', '14', '13', '10', '53', '74', '17', '65', '63', '8', '16',
       '72', '70', '9', '90', '85', '81', '79', '94', '73', '11', '80',
       '5', '82', '2', '77', '84', '88', '96', '99', '7', '86', '92', '3',
       '83', '87', '6', '-1', '89', '4', '93', '98', '91', '95', '-2',
       '97', '120'], dtype=object)

In [None]:
# TODO: replace victim ages that are 0 or less (e.g. '0', '-1', '-2' in the output above) with a suitable value

In [12]:
# Count fully duplicated rows in the raw crime_df.
# Note: this checks the raw frame, not the cleaned_crime_df copy.
crime_df.duplicated().sum()

0

In [33]:
# Count missing values per column in the raw crime_df.
crime_df.isna().sum()

dr_no                  0
date_rptd              0
date_occ               0
time_occ               0
area                   0
area_name              0
rpt_dist_no            0
part_1_2               0
crm_cd                 0
crm_cd_desc            0
mocodes            99652
vict_age               0
vict_sex           94835
vict_descent       94841
premis_cd              9
premis_desc          407
weapon_used_cd    474453
weapon_desc       474453
status                 0
status_desc            0
crm_cd_1               8
location               0
lat                    0
lon                    0
crm_cd_2          672601
cross_street      609321
crm_cd_3          724726
crm_cd_4          726477
dtype: int64

In [22]:
# Summarize the raw crime_df: column names, non-null counts and dtypes.
# This is a structural overview, not an outlier check. The cell previously
# read `crime_df.i`, which raises AttributeError; the saved output below is
# that of DataFrame.info().
crime_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   dr_no           1000 non-null   int64  
 1   date_rptd       1000 non-null   object 
 2   date_occ        1000 non-null   object 
 3   time_occ        1000 non-null   object 
 4   area            1000 non-null   int64  
 5   area_name       1000 non-null   object 
 6   rpt_dist_no     1000 non-null   int64  
 7   part_1_2        1000 non-null   int64  
 8   crm_cd          1000 non-null   int64  
 9   crm_cd_desc     1000 non-null   object 
 10  mocodes         930 non-null    object 
 11  vict_age        1000 non-null   int64  
 12  vict_sex        938 non-null    object 
 13  vict_descent    938 non-null    object 
 14  premis_cd       1000 non-null   int64  
 15  premis_desc     1000 non-null   object 
 16  weapon_used_cd  392 non-null    float64
 17  weapon_desc     392 non-null    ob

In [21]:
# Summarize the cleaned frame (columns, non-null counts, dtypes) to confirm
# the dropped columns are gone and the filled columns have no missing values.
cleaned_crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   dr_no         1000 non-null   int64  
 1   date_rptd     1000 non-null   object 
 2   date_occ      1000 non-null   object 
 3   time_occ      1000 non-null   object 
 4   area          1000 non-null   int64  
 5   area_name     1000 non-null   object 
 6   rpt_dist_no   1000 non-null   int64  
 7   crm_cd        1000 non-null   int64  
 8   crm_cd_desc   1000 non-null   object 
 9   vict_age      1000 non-null   float64
 10  vict_sex      1000 non-null   object 
 11  vict_descent  1000 non-null   object 
 12  premis_cd     1000 non-null   int64  
 13  premis_desc   1000 non-null   object 
 14  weapon_desc   1000 non-null   object 
 15  status_desc   1000 non-null   object 
 16  location      1000 non-null   object 
 17  lat           1000 non-null   float64
 18  lon           1000 non-null  

In [16]:
# Re-inspect the distinct victim-age values in the cleaned frame.
cleaned_crime_df['vict_age'].unique()

array([36.  , 25.  , 32.44, 76.  , 31.  , 23.  , 29.  , 35.  , 41.  ,
       24.  , 34.  , 46.  , 66.  , 40.  , 27.  , 62.  , 43.  , 71.  ,
       50.  , 19.  , 51.  , 33.  , 69.  , 39.  , 57.  , 78.  , 52.  ,
       38.  , 55.  , 44.  , 18.  , 54.  , 22.  , 28.  , 42.  , 56.  ,
       67.  , 37.  , 60.  , 61.  , 59.  , 32.  , 30.  , 45.  , 20.  ,
       15.  , 58.  , 47.  , 48.  , 26.  , 21.  , 64.  , 75.  , 12.  ,
       49.  , 68.  , 14.  , 13.  , 10.  , 53.  , 74.  , 17.  , 65.  ,
       63.  ,  8.  , 16.  , 72.  , 70.  ,  9.  , 90.  , 85.  , 81.  ,
       79.  , 94.  , 73.  ])

In [17]:
# List the distinct victim-sex codes present in the cleaned frame.
cleaned_crime_df['vict_sex'].unique()

array(['F', 'M', 'X'], dtype=object)

In [28]:
# Per victim-sex group, count the non-null entries in every other column.
df = cleaned_crime_df.groupby('vict_sex').count()

In [29]:
# Display the per-vict_sex non-null counts computed in the previous cell.
df

Unnamed: 0_level_0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,crm_cd,crm_cd_desc,vict_age,vict_descent,premis_cd,premis_desc,weapon_desc,status_desc,location,lat,lon,crm_cd_2,cross_street
vict_sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
F,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,359,27,94
M,510,510,510,510,510,510,510,510,510,510,510,510,510,510,510,510,510,510,44,136
X,131,131,131,131,131,131,131,131,131,131,131,131,131,131,131,131,131,131,6,27


In [32]:
# Load the same dataset ("2nrs-mtv8") straight from its CSV endpoint.
# NOTE(review): the tail() output that follows ends at index 999, i.e. this
# request returned 1000 rows -- presumably the endpoint's default page size
# when no limit is given; confirm before relying on this frame.
crime_df_csv = pd.read_csv(
    "https://data.lacity.org/resource/2nrs-mtv8.csv"
)

In [34]:
# Show the last five rows of the CSV-loaded frame.
crime_df_csv.tail()

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,...,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon
995,200105945,2020-01-29T00:00:00.000,2020-01-28T00:00:00.000,1725,1,Central,124,2,903,CONTEMPT OF COURT,...,IC,Invest Cont,903,,,,100 W 1ST ST,,34.0522,-118.2434
996,200105946,2020-01-29T00:00:00.000,2020-01-29T00:00:00.000,720,1,Central,157,2,624,BATTERY - SIMPLE ASSAULT,...,IC,Invest Cont,624,,,,600 S SAN PEDRO ST,,34.0423,-118.2452
997,200105947,2020-01-29T00:00:00.000,2020-01-28T00:00:00.000,1730,1,Central,159,1,510,VEHICLE - STOLEN,...,IC,Invest Cont,510,,,,1800 INDUSTRIAL ST,,34.0359,-118.2344
998,210817688,2021-12-16T00:00:00.000,2020-12-09T00:00:00.000,1300,8,West LA,835,2,649,DOCUMENT FORGERY / STOLEN FELONY,...,IC,Invest Cont,649,,,,1800 CAMDEN AV,,34.0466,-118.4408
999,200504494,2020-01-11T00:00:00.000,2020-01-11T00:00:00.000,1340,5,Harbor,565,2,930,CRIMINAL THREATS - NO WEAPON DISPLAYED,...,AA,Adult Arrest,930,,,,900 S GRAND AV,,33.7361,-118.2901
