In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
pd.set_option('display.max_rows', None)
import datetime
from plotly.subplots import make_subplots

In [3]:
df = pd.read_csv('..\\data\\NYPD_Complaint_Data_Historic.csv', low_memory= False, nrows= 10000)

### [Dataset](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i) Description


| Column Name | Description |
| ----------- | ----------- |
| CMPLNT_NUM | Randomly generated persistent ID for each complaint | 
| ADDR_PCT_CD | The precinct in which the incident occurred BORO The name of the borough in which the incident occurred |
| CMPLNT_FR_DT | Exact date of occurrence for the reported event (or starting date of occurrence, if CMPLNT_TO_DT exists) |
| CMPLNT_FR_TM | Exact time of occurrence for the reported event (or starting time of occurrence, if CMPLNT_TO_TM exists) |
| CMPLNT_TO_DT | Ending date of occurrence for the reported event, if exact time of occurrence is unknown |
| CMPLNT_TO_TM | Ending time of occurrence for the reported event, if exact time of occurrence is unknown |
| CRM_ATPT_CPTD_CD | Indicator of whether crime was successfully completed or attempted, but failed or was interrupted prematurely |
| HADEVELOPT | Name of NYCHA housing development of occurrence, if applicable |
| HOUSING_PSA | Development Level Code |
| JURISDICTION_CODE | Jurisdiction responsible for incident. Either internal, like Police(0), Transit(1), and Housing(2); or external(3), like Correction, Port Authority, etc. |
| JURIS_DESC |  Description of the jurisdiction code |
| KY_CD | Three digit offense classification code |
| LAW_CAT_CD | Level of offense: felony, misdemeanor, violation |
| LOC_OF_OCCUR_DESC | Specific location of occurrence in or around the premises; inside, opposite of, front of, rear of |
| OFNS_DESC | Description of offense corresponding with key code |
| PARKS_NM | Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included) |
| PATROL_BORO | The name of the patrol borough in which the incident occurred |
| PD_CD | Three digit internal classification code (more granular than Key Code) |
| PD_DESC | Description of internal classification corresponding with PD code (more granular than Offense Description) |
| PREM_TYP_DESC |  Specific description of premises; grocery store, residence, street, etc. |
| RPT_DT | Date event was reported to police  |
| STATION_NAME | Transit station name |
| SUSP_AGE_GROUP | Suspect’s Age Group |
| SUSP_RACE | Suspect’s Race Description |
| SUSP_SEX | Suspect’s Sex Description |
| TRANSIT_DISTRICT | Transit district in which the offense occurred. |
| VIC_AGE_GROUP | Victim’s Age Group |
| VIC_RACE | Victim’s Race Description |
| VIC_SEX | Victim’s Sex Description (D=Business/Organization, E=PSNY/People of the State of New York, F=Female, M=Male) |
| X_COORD_CD | X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104) |
| Y_COORD_CD | Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104) |
| Latitude | Midblock Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326) |
| Longitude | Midblock Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326) |

### Analysis

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CMPLNT_NUM         10000 non-null  int64  
 1   CMPLNT_FR_DT       10000 non-null  object 
 2   CMPLNT_FR_TM       10000 non-null  object 
 3   CMPLNT_TO_DT       8385 non-null   object 
 4   CMPLNT_TO_TM       8393 non-null   object 
 5   ADDR_PCT_CD        9992 non-null   float64
 6   RPT_DT             10000 non-null  object 
 7   KY_CD              10000 non-null  int64  
 8   OFNS_DESC          10000 non-null  object 
 9   PD_CD              9912 non-null   float64
 10  PD_DESC            9912 non-null   object 
 11  CRM_ATPT_CPTD_CD   10000 non-null  object 
 12  LAW_CAT_CD         10000 non-null  object 
 13  BORO_NM            9911 non-null   object 
 14  LOC_OF_OCCUR_DESC  8274 non-null   object 
 15  PREM_TYP_DESC      9888 non-null   object 
 16  JURIS_DESC         1000

In [5]:
df.isnull().sum()

CMPLNT_NUM              0
CMPLNT_FR_DT            0
CMPLNT_FR_TM            0
CMPLNT_TO_DT         1615
CMPLNT_TO_TM         1607
ADDR_PCT_CD             8
RPT_DT                  0
KY_CD                   0
OFNS_DESC               0
PD_CD                  88
PD_DESC                88
CRM_ATPT_CPTD_CD        0
LAW_CAT_CD              0
BORO_NM                89
LOC_OF_OCCUR_DESC    1726
PREM_TYP_DESC         112
JURIS_DESC              0
JURISDICTION_CODE      88
PARKS_NM             9966
HADEVELOPT           9525
HOUSING_PSA          9295
X_COORD_CD             18
Y_COORD_CD             18
SUSP_AGE_GROUP       2737
SUSP_RACE            2733
SUSP_SEX             2734
TRANSIT_DISTRICT     9782
Latitude               18
Longitude              18
Lat_Lon                18
PATROL_BORO            88
STATION_NAME         9782
VIC_AGE_GROUP           8
VIC_RACE                0
VIC_SEX                 0
dtype: int64

#### changing column data types

In [4]:
# for col in df.columns:
#     print(f'\'{col}\':\'{df[col].dtype}\'', end = ', ')

# col_types = {'CMPLNT_NUM':'int64', 'CMPLNT_FR_DT': 'datetime64[ns]', 'CMPLNT_FR_TM':'datetime64[ns]', 'CMPLNT_TO_DT':'datetime64[ns]', 'CMPLNT_TO_TM':'object',
#              'ADDR_PCT_CD':'int64', 'RPT_DT':'object', 'KY_CD':'int64', 'OFNS_DESC':'object', 'PD_CD':'int64', 'PD_DESC':'object', 
#              'CRM_ATPT_CPTD_CD':'object', 'LAW_CAT_CD':'object', 'BORO_NM':'object', 'LOC_OF_OCCUR_DESC':'object', 'PREM_TYP_DESC':'object', 
#              'JURIS_DESC':'object', 'JURISDICTION_CODE':'int64', 'PARKS_NM':'int64', 'HADEVELOPT':'object', 'HOUSING_PSA':'float64', 
#              'X_COORD_CD':'float64', 'Y_COORD_CD':'float64', 'SUSP_AGE_GROUP':'object', 'SUSP_RACE':'object', 'SUSP_SEX':'object', 
#              'TRANSIT_DISTRICT':'float64', 'Latitude':'float64', 'Longitude':'float64', 'Lat_Lon':'object', 'PATROL_BORO':'object', 
#              'STATION_NAME':'float64', 'VIC_AGE_GROUP':'object', 'VIC_RACE':'object', 'VIC_SEX':'object'}

# df = df.astype(col_types)


df['CMPLNT_FR_DT'] = df['CMPLNT_FR_DT'].astype('datetime64[ns]')

#### The most and least committed crimes

In [6]:
crime_type_count = df.groupby(['OFNS_DESC'])['OFNS_DESC'].count().sort_values(ascending= False)

# crime_type_count = {}
# for i in df.OFNS_DESC.to_list():
#     if not i in crime_type_count:
#         crime_type_count[i] = 0
#     crime_type_count[i] += 1
# crime_type_dict = dict(sorted(crime_type_dict.items(), key= lambda item: item[1], reverse= True))

crime_type_count

OFNS_DESC
PETIT LARCENY                           2166
HARRASSMENT 2                           1599
ASSAULT 3 & RELATED OFFENSES            1134
CRIMINAL MISCHIEF & RELATED OF           935
GRAND LARCENY                            866
FELONY ASSAULT                           446
ROBBERY                                  384
MISCELLANEOUS PENAL LAW                  298
OFF. AGNST PUB ORD SENSBLTY &            296
BURGLARY                                 295
DANGEROUS DRUGS                          184
SEX CRIMES                               155
VEHICLE AND TRAFFIC LAWS                 154
GRAND LARCENY OF MOTOR VEHICLE           144
OFFENSES AGAINST PUBLIC ADMINI           135
DANGEROUS WEAPONS                        114
FORGERY                                  104
MURDER & NON-NEGL. MANSLAUGHTER           88
INTOXICATED & IMPAIRED DRIVING            76
THEFT-FRAUD                               72
RAPE                                      58
CRIMINAL TRESPASS                         51


#### Tell me the most and least crime neighborhoods

In [7]:
# OPTION 1: use borough
df.groupby(['BORO_NM'])['BORO_NM'].count().sort_values(ascending= False)

# OPTION 2: possibly use an external dataset of all precinct neighborhood locations, map to the complaint data


BORO_NM
BROOKLYN         2701
BRONX            2450
MANHATTAN        2354
QUEENS           2023
STATEN ISLAND     383
Name: BORO_NM, dtype: int64

#### Tell me the stations with the most crimes

#### Think of 3 other things you can ask your data.
- most and least commited crimes by season
-

In [5]:
df_ = df[df['CMPLNT_FR_DT'] > '01/01/2019']
df_ = df_[['CMPLNT_FR_DT', 'OFNS_DESC']].sort_values(by = 'CMPLNT_FR_DT')

In [36]:
sns.lineplot(df_, x= )

Unnamed: 0,CMPLNT_FR_DT,OFNS_DESC
1104,2019-01-02,SEX CRIMES
4294,2019-01-02,ASSAULT 3 & RELATED OFFENSES
1103,2019-01-02,HARRASSMENT 2
1102,2019-01-02,PETIT LARCENY
1106,2019-01-02,VEHICLE AND TRAFFIC LAWS


In [8]:
# Credit to FEDI BEN MESSAOUD: https://www.kaggle.com/code/fedi1996/boston-crime-analysis-with-plotly
def treemap(categories,title,path,values):
    fig = px.treemap(categories, path=path, values=values, height=700,
                 title=title, color_discrete_sequence = px.colors.sequential.RdBu)
    fig.data[0].textinfo = 'label+text+value'
    fig.show()

crime_count = df_['OFNS_DESC'].value_counts()
vals = crime_count.values

categories = pd.DataFrame(data= crime_count.index, columns=["OFNS_DESC"])
categories['values'] = vals

treemap(categories,'Crimes in New York',['OFNS_DESC'],categories['values'])