In [71]:
import pandas as pd
import seaborn as sns

In [72]:
df = pd.read_csv('..\\data\\NYPD_Complaint_Data_Historic.csv', low_memory= False, nrows= 100)

### [Dataset](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i) Description


| Column Name | Description |
| ----------- | ----------- |
| CMPLNT_NUM | Randomly generated persistent ID for each complaint | 
| ADDR_PCT_CD | The precinct in which the incident occurred BORO The name of the borough in which the incident occurred |
| CMPLNT_FR_DT | Exact date of occurrence for the reported event (or starting date of occurrence, if CMPLNT_TO_DT exists) |
| CMPLNT_FR_TM | Exact time of occurrence for the reported event (or starting time of occurrence, if CMPLNT_TO_TM exists) |
| CMPLNT_TO_DT | Ending date of occurrence for the reported event, if exact time of occurrence is unknown |
| CMPLNT_TO_TM | Ending time of occurrence for the reported event, if exact time of occurrence is unknown |
| CRM_ATPT_CPTD_CD | Indicator of whether crime was successfully completed or attempted, but failed or was interrupted prematurely |
| HADEVELOPT | Name of NYCHA housing development of occurrence, if applicable |
| HOUSING_PSA | Development Level Code |
| JURISDICTION_CODE | Jurisdiction responsible for incident. Either internal, like Police(0), Transit(1), and Housing(2); or external(3), like Correction, Port Authority, etc. |
| JURIS_DESC |  Description of the jurisdiction code |
| KY_CD | Three digit offense classification code |
| LAW_CAT_CD | Level of offense: felony, misdemeanor, violation |
| LOC_OF_OCCUR_DESC | Specific location of occurrence in or around the premises; inside, opposite of, front of, rear of |
| OFNS_DESC | Description of offense corresponding with key code |
| PARKS_NM | Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included) |
| PATROL_BORO | The name of the patrol borough in which the incident occurred |
| PD_CD | Three digit internal classification code (more granular than Key Code) |
| PD_DESC | Description of internal classification corresponding with PD code (more granular than Offense Description) |
| PREM_TYP_DESC |  Specific description of premises; grocery store, residence, street, etc. |
| RPT_DT | Date event was reported to police  |
| STATION_NAME | Transit station name |
| SUSP_AGE_GROUP | Suspect’s Age Group |
| SUSP_RACE | Suspect’s Race Description |
| SUSP_SEX | Suspect’s Sex Description |
| TRANSIT_DISTRICT | Transit district in which the offense occurred. |
| VIC_AGE_GROUP | Victim’s Age Group |
| VIC_RACE | Victim’s Race Description |
| VIC_SEX | Victim’s Sex Description (D=Business/Organization, E=PSNY/People of the State of New York, F=Female, M=Male) |
| X_COORD_CD | X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104) |
| Y_COORD_CD | Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104) |
| Latitude | Midblock Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326) |
| Longitude | Midblock Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326) |

### Analysis

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CMPLNT_NUM         100 non-null    int64  
 1   CMPLNT_FR_DT       100 non-null    object 
 2   CMPLNT_FR_TM       100 non-null    object 
 3   CMPLNT_TO_DT       40 non-null     object 
 4   CMPLNT_TO_TM       40 non-null     object 
 5   ADDR_PCT_CD        98 non-null     float64
 6   RPT_DT             100 non-null    object 
 7   KY_CD              100 non-null    int64  
 8   OFNS_DESC          100 non-null    object 
 9   PD_CD              91 non-null     float64
 10  PD_DESC            91 non-null     object 
 11  CRM_ATPT_CPTD_CD   100 non-null    object 
 12  LAW_CAT_CD         100 non-null    object 
 13  BORO_NM            91 non-null     object 
 14  LOC_OF_OCCUR_DESC  62 non-null     object 
 15  PREM_TYP_DESC      89 non-null     object 
 16  JURIS_DESC         100 non-

In [74]:
df.isnull().sum()

CMPLNT_NUM             0
CMPLNT_FR_DT           0
CMPLNT_FR_TM           0
CMPLNT_TO_DT          60
CMPLNT_TO_TM          60
ADDR_PCT_CD            2
RPT_DT                 0
KY_CD                  0
OFNS_DESC              0
PD_CD                  9
PD_DESC                9
CRM_ATPT_CPTD_CD       0
LAW_CAT_CD             0
BORO_NM                9
LOC_OF_OCCUR_DESC     38
PREM_TYP_DESC         11
JURIS_DESC             0
JURISDICTION_CODE      9
PARKS_NM             100
HADEVELOPT            99
HOUSING_PSA           99
X_COORD_CD             3
Y_COORD_CD             3
SUSP_AGE_GROUP        50
SUSP_RACE             49
SUSP_SEX              49
TRANSIT_DISTRICT     100
Latitude               3
Longitude              3
Lat_Lon                3
PATROL_BORO            9
STATION_NAME         100
VIC_AGE_GROUP          0
VIC_RACE               0
VIC_SEX                0
dtype: int64

#### changing column data types

In [48]:
# for col in df.columns:
#     print(f'\'{col}\':\'{df[col].dtype}\'', end = ', ')

# col_types = {'CMPLNT_NUM':'int64', 'CMPLNT_FR_DT':'object', 'CMPLNT_FR_TM':'object', 'CMPLNT_TO_DT':'object', 'CMPLNT_TO_TM':'object',
#              'ADDR_PCT_CD':'int64', 'RPT_DT':'object', 'KY_CD':'int64', 'OFNS_DESC':'object', 'PD_CD':'int64', 'PD_DESC':'object', 
#              'CRM_ATPT_CPTD_CD':'object', 'LAW_CAT_CD':'object', 'BORO_NM':'object', 'LOC_OF_OCCUR_DESC':'object', 'PREM_TYP_DESC':'object', 
#              'JURIS_DESC':'object', 'JURISDICTION_CODE':'int64', 'PARKS_NM':'int64', 'HADEVELOPT':'object', 'HOUSING_PSA':'float64', 
#              'X_COORD_CD':'float64', 'Y_COORD_CD':'float64', 'SUSP_AGE_GROUP':'object', 'SUSP_RACE':'object', 'SUSP_SEX':'object', 
#              'TRANSIT_DISTRICT':'float64', 'Latitude':'float64', 'Longitude':'float64', 'Lat_Lon':'object', 'PATROL_BORO':'object', 
#              'STATION_NAME':'float64', 'VIC_AGE_GROUP':'object', 'VIC_RACE':'object', 'VIC_SEX':'object'}

# df = df.astype(col_types)

#### The most and least committed crimes

In [88]:
crime_type_count = df.groupby(['OFNS_DESC'])['OFNS_DESC'].count().sort_values(ascending= False)

# crime_type_count = {}
# for i in df.OFNS_DESC.to_list():
#     if not i in crime_type_count:
#         crime_type_count[i] = 0
#     crime_type_count[i] += 1
# crime_type_dict = dict(sorted(crime_type_dict.items(), key= lambda item: item[1], reverse= True))

crime_type_count

OFNS_DESC
PETIT LARCENY                      20
MURDER & NON-NEGL. MANSLAUGHTER     9
RAPE                                9
HARRASSMENT 2                       8
SEX CRIMES                          6
CRIMINAL MISCHIEF & RELATED OF      5
FELONY ASSAULT                      5
ASSAULT 3 & RELATED OFFENSES        5
ROBBERY                             5
BURGLARY                            4
DANGEROUS WEAPONS                   4
FORGERY                             4
DANGEROUS DRUGS                     3
GRAND LARCENY                       3
GRAND LARCENY OF MOTOR VEHICLE      2
THEFT-FRAUD                         1
ARSON                               1
OFFENSES INVOLVING FRAUD            1
OFF. AGNST PUB ORD SENSBLTY &       1
MISCELLANEOUS PENAL LAW             1
FRAUDS                              1
CRIMINAL TRESPASS                   1
VEHICLE AND TRAFFIC LAWS            1
Name: OFNS_DESC, dtype: int64

#### Tell me the most and least crime neighborhoods

In [93]:
# OPTION 1: use borough
df.groupby(['BORO_NM'])['BORO_NM'].count().sort_index(ascending= False)

# OPTION 2: possibly use an external dataset of all precinct neighborhood locations, map to the complaint data


BORO_NM
STATEN ISLAND     5
QUEENS           18
MANHATTAN        20
BROOKLYN          9
BRONX            39
Name: BORO_NM, dtype: int64

#### Tell me the stations with the most crimes

#### Think of 3 other things you can ask your data.