In [1]:
import pandas as pd
import scipy
from sklearn.neighbors import KNeighborsClassifier as KNN
import matplotlib.pyplot as plt
import numpy as np


In [2]:
df = pd.read_csv('Crime_Data_from_2010_to_Present.csv')

In [3]:
df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,1307355,02/20/2010 12:00:00 AM,02/20/2010 12:00:00 AM,1350,13,Newton,1385,2,900,VIOLATION OF COURT ORDER,...,AA,Adult Arrest,900.0,,,,300 E GAGE AV,,33.9825,-118.2695
1,11401303,09/13/2010 12:00:00 AM,09/12/2010 12:00:00 AM,45,14,Pacific,1485,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",...,IC,Invest Cont,740.0,,,,SEPULVEDA BL,MANCHESTER AV,33.9599,-118.3962
2,70309629,08/09/2010 12:00:00 AM,08/09/2010 12:00:00 AM,1515,13,Newton,1324,2,946,OTHER MISCELLANEOUS CRIME,...,IC,Invest Cont,946.0,,,,1300 E 21ST ST,,34.0224,-118.2524
3,90631215,01/05/2010 12:00:00 AM,01/05/2010 12:00:00 AM,150,6,Hollywood,646,2,900,VIOLATION OF COURT ORDER,...,IC,Invest Cont,900.0,998.0,,,CAHUENGA BL,HOLLYWOOD BL,34.1016,-118.3295
4,100100501,01/03/2010 12:00:00 AM,01/02/2010 12:00:00 AM,2100,1,Central,176,1,122,"RAPE, ATTEMPTED",...,IC,Invest Cont,122.0,,,,8TH ST,SAN PEDRO ST,34.0387,-118.2488


In [4]:
df.columns

Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA ', 'AREA NAME',
       'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
       'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1',
       'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT',
       'LON'],
      dtype='object')

In [5]:
#drop columns irrelevant to our data analysis
drop_cols = [
    'DR_NO',
    'Date Rptd',
    'AREA ',
    'Rpt Dist No',
    'Part 1-2',
    'Crm Cd',
    'Premis Cd',
    'Premis Desc',
    'Weapon Used Cd',
    'Weapon Desc',
    'Mocodes',
    'Status',
    'Status Desc',
    'Crm Cd 1',
    'Crm Cd 2',
    'Crm Cd 3',
    'Crm Cd 4',
    'Cross Street',
    'LOCATION',
]

df = df.drop(drop_cols, axis=1)
df.columns

Index(['DATE OCC', 'TIME OCC', 'AREA NAME', 'Crm Cd Desc', 'Vict Age',
       'Vict Sex', 'Vict Descent', 'LAT', 'LON'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,DATE OCC,TIME OCC,AREA NAME,Crm Cd Desc,Vict Age,Vict Sex,Vict Descent,LAT,LON
0,02/20/2010 12:00:00 AM,1350,Newton,VIOLATION OF COURT ORDER,48,M,H,33.9825,-118.2695
1,09/12/2010 12:00:00 AM,45,Pacific,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0,M,W,33.9599,-118.3962
2,08/09/2010 12:00:00 AM,1515,Newton,OTHER MISCELLANEOUS CRIME,0,M,H,34.0224,-118.2524
3,01/05/2010 12:00:00 AM,150,Hollywood,VIOLATION OF COURT ORDER,47,F,W,34.1016,-118.3295
4,01/02/2010 12:00:00 AM,2100,Central,"RAPE, ATTEMPTED",47,F,H,34.0387,-118.2488


In [7]:
# clean up date occurred to only contain month/day/year
temp = df['DATE OCC'].str.split(' ',n = 1, expand = True)
temp2 = temp[0]
df['DATE OCC'] = temp2

In [8]:
df.head()

Unnamed: 0,DATE OCC,TIME OCC,AREA NAME,Crm Cd Desc,Vict Age,Vict Sex,Vict Descent,LAT,LON
0,02/20/2010,1350,Newton,VIOLATION OF COURT ORDER,48,M,H,33.9825,-118.2695
1,09/12/2010,45,Pacific,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0,M,W,33.9599,-118.3962
2,08/09/2010,1515,Newton,OTHER MISCELLANEOUS CRIME,0,M,H,34.0224,-118.2524
3,01/05/2010,150,Hollywood,VIOLATION OF COURT ORDER,47,F,W,34.1016,-118.3295
4,01/02/2010,2100,Central,"RAPE, ATTEMPTED",47,F,H,34.0387,-118.2488


In [9]:
df['Crm Cd Desc'].unique()

array(['VIOLATION OF COURT ORDER',
       'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)',
       'OTHER MISCELLANEOUS CRIME', 'RAPE, ATTEMPTED',
       'SHOPLIFTING - PETTY THEFT ($950 & UNDER)',
       'BURGLARY FROM VEHICLE',
       'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT',
       'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD',
       'BATTERY - SIMPLE ASSAULT', 'ROBBERY', 'BOMB SCARE',
       'CHILD NEGLECT (SEE 300 W.I.C.)',
       'INTIMATE PARTNER - AGGRAVATED ASSAULT',
       'INTIMATE PARTNER - SIMPLE ASSAULT',
       'THEFT PLAIN - PETTY ($950 & UNDER)',
       'CRIMINAL THREATS - NO WEAPON DISPLAYED', 'ATTEMPTED ROBBERY',
       'VANDALISM - MISDEAMEANOR ($399 OR UNDER)', 'BURGLARY', 'ARSON',
       'RAPE, FORCIBLE', 'BRANDISH WEAPON',
       'THROWING OBJECT AT MOVING VEHICLE',
       'SHOPLIFTING-GRAND THEFT ($950.01 & OVER)',
       'CHILD ABUSE (PHYSICAL) - SIMPLE ASSAULT',
       'SHOTS FIRED AT INHABITED DWELLING',
       'THEFT FROM MOTOR

In [10]:
# # drop crimes that are not deemed threatening to civilians
# drop_crimes = set([
#     'ABORTION/ILLEGAL',
#     'BEASTIALITY, CRIME AGAINST NATURE SEXUAL ASSLT WITH ANIM',
#     'BIGAMY',
#     'BLOCKING DOOR INDUCTION CENTER',
#     'BOAT - STOLEN',
#     'CONSPIRACY',
#     'CREDIT CARDS, FRAUD USE ($950 & UNDER',
#     'CREDIT CARDS, FRAUD USE ($950.01 & OVER)',
#     'CONTEMPT OF COURT',
#     'CONTRIBUTING',
#     'COUNTERFEIT',
#     'CRUELTY TO ANIMALS',
#     'DEFRAUDING INNKEEPER/THEFT OF SERVICES, $400 & UNDER',
#     'DEFRAUDING INNKEEPER/THEFT OF SERVICES, OVER $400',
#     'DISHONEST EMPLOYEE - GRAND THEFT',
#     'DISHONEST EMPLOYEE - PETTY THEFT',
#     'DISHONEST EMPLOYEE ATTEMPTED THEFT',
#     'DOCUMENT FORGERY / STOLEN FELONY',
#     'DOCUMENT WORTHLESS ($200 & UNDER)',
#     'DOCUMENT WORTHLESS ($200.01 & OVER)',
#     'DRIVING WITHOUT OWNER CONSENT (DWOC)',
#     'EMBEZZLEMENT, GRAND THEFT ($950.01 & OVER)',
#     'EMBEZZLEMENT, PETTY THEFT ($950 & UNDER)',
#     'FAILURE TO YIELD',
#     'FALSE IMPRISONMENT',
#     'FALSE POLICE REPORT',
#     'LETTERS, LEWD  -  TELEPHONE CALLS, LEWD',
#     'PANDERING',
#     'REPLICA FIREARMS(SALE,DISPLAY,MANUFACTURE OR DISTRIBUTE)',
#     'TELEPHONE PROPERTY - DAMAGE',
#     'THEFT OF IDENTITY',
#     'UNAUTHORIZED COMPUTER ACCESS',
#     'VIOLATION OF COURT ORDER',
    
# ])

# df = df[~df['Crm Cd Desc'].isin(drop_crimes)]

In [11]:
#merging latitude and longitude into new column `COORDS`

# df['COORDS'] = list(zip(df['LAT'],df['LON']))
# df.drop(columns = ['LAT','LON'], axis = 1, inplace = True)

In [12]:
df.head()

Unnamed: 0,DATE OCC,TIME OCC,AREA NAME,Crm Cd Desc,Vict Age,Vict Sex,Vict Descent,LAT,LON
0,02/20/2010,1350,Newton,VIOLATION OF COURT ORDER,48,M,H,33.9825,-118.2695
1,09/12/2010,45,Pacific,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0,M,W,33.9599,-118.3962
2,08/09/2010,1515,Newton,OTHER MISCELLANEOUS CRIME,0,M,H,34.0224,-118.2524
3,01/05/2010,150,Hollywood,VIOLATION OF COURT ORDER,47,F,W,34.1016,-118.3295
4,01/02/2010,2100,Central,"RAPE, ATTEMPTED",47,F,H,34.0387,-118.2488


In [13]:
df['AREA NAME'].unique()

array(['Newton', 'Pacific', 'Hollywood', 'Central', 'Northeast',
       'Hollenbeck', 'Southwest', 'Southeast', 'Rampart', 'Olympic',
       'Wilshire', '77th Street', 'Harbor', 'West LA', 'Van Nuys',
       'West Valley', 'Topanga', 'N Hollywood', 'Mission', 'Foothill',
       'Devonshire'], dtype=object)

In [14]:
#drop all victimless crime
df.dropna(inplace = True)
df.reset_index(inplace = True)

In [15]:
#get max and min longitude
print('The max and min longitude values are ' + str(np.max(df['LON'])) + ' & ' + str(np.min(df['LON'])))
#get max and min latitude
print('The max and min latitude values are ' + str(np.max(df['LAT'])) + ' & ' + str(np.min(df['LAT'])))
#plot LAT v LON to visualize where crimes occur in LA


The max and min longitude values are 0.0 & -118.8279
The max and min latitude values are 34.7907 & 0.0


In [46]:
#df.loc[df['column_name'].isin(some_values)]
# df.loc[df['DATE OCC'] has '/2010']

SyntaxError: invalid syntax (<ipython-input-46-da4a8cca8407>, line 2)

In [None]:
df.loc?

In [21]:
years = []
for date in df['DATE OCC']:
    data = date.split('/')
    years.append(int(data[2]))

In [22]:
df['DATE OCC'].head()
df['Year'] = years

In [30]:
df['Year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019],
      dtype=int64)

In [44]:
# split df into 2010-2012, 2013-2015, 2016-2018

# def append_year(year_range):
# df_10_12 = df[(df['Year'] == 2010)]
# df_10_12 = df_10_12.append(df[(df['Year'] == 2011)])
# df_13_15 = df.loc[df['Year'] >= 2013 and df['Year'] <= 2015]
# df_16_18 = df.loc[df['Year'] >= 2016 and df['Year'] <= 2018]
#df_10_12 = df.loc
#plot lat v lon in different colors to visualize where crime is happening in LA
#

In [45]:
df_10_12['Year'].unique()

array([2010, 2011], dtype=int64)

In [None]:
#plot of all crimes lat v lon
plt.scatter(df['LON'],df['LAT'])
plt.xlim([-118.465381,-118.138795])
plt.ylim([34.016907, 34.105999])
plt.ylabel('Latitude')
plt.xlabel('Longitude')

In [None]:
np.argmin(df['LON'])

In [None]:
df['TIME OCC'].value_counts()[:10].plot(kind = 'bar')

f0 = plt.gcf()

In [None]:
df['DATE OCC'].value_counts()[:10].plot(kind = 'bar')

f1 = plt.gcf()

In [None]:
df['AREA NAME'].value_counts().plot(kind = 'bar')

f2 = plt.gcf()

In [None]:
df['Vict Age'].value_counts()[:30].plot(kind = 'bar')

f3 = plt.gcf()

In [None]:
df['Vict Descent'].value_counts().plot(kind = 'bar')

f4 = plt.gcf()

In [None]:
df['Vict Descent'].unique()

What can we drop? (eg '0' ages, unknown descent) - new dataframes?, X,H,N genders
multi logistic regression
how to do knn
    formula = 'Vict Sex ~ TIME OCC + LAT + LON'
    age, descent too
    

In [None]:
#plot geographical data, violent and non violent separated by color

#sklearn multiple logistic regression

    # pay equal attention class_weight = balancd
    

In [None]:
df['Vict Sex'].value_counts().plot(kind = 'bar')

f4 = plt.gcf()
#plot which crimes affect which gender the most
#gender and violent v nonviolent crimes

In [None]:
df['Crm Cd Desc'].value_counts()[:10].plot(kind = 'bar')

f4 = plt.gcf()