In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [42]:
# crime_path = r"C:\Users\NCC200\Desktop\TASK\crime_work\archive.zip"
# df = pd.read_csv(crime_path, low_memory=True, compression='zip')
# df.tail()

In [43]:
def load_and_optimize_csv_clean(filepath, sample_size=5000, chunksize=100_000):
    sample = pd.read_csv(filepath, nrows=sample_size, low_memory=False)

    dtypes = {}
    for col in sample.columns:
        if sample[col].dtype == "object":
            dtypes[col] = "category"
        elif pd.api.types.is_integer_dtype(sample[col]):
            dtypes[col] = "Int32"
        elif pd.api.types.is_float_dtype(sample[col]):
            dtypes[col] = "float32"
        else:
            dtypes[col] = sample[col].dtype

    chunks = []
    for chunk in pd.read_csv(filepath, dtype=dtypes, chunksize=chunksize, low_memory=False):
        chunk = chunk.dropna().drop_duplicates()
        chunks.append(chunk)

    df_clean = pd.concat(chunks, ignore_index=True)
    return df_clean


In [44]:
df = load_and_optimize_csv_clean(r"C:\Users\NCC200\Desktop\TASK\crime_work\archive.zip")

# Create a cleaned copy
#df_clean = df.dropna().drop_duplicates()

print("Original shape:", df.shape)
#print("Cleaned shape:", df_clean.shape)

Original shape: (7084435, 22)


In [45]:
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12.0,61.0,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117,-87.669998,"(41.815117282, -87.669999562)"
1,10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29.0,25.0,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895081,-87.765404,"(41.895080471, -87.765400451)"
2,10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35.0,21.0,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937405,-87.716652,"(41.937405765, -87.716649687)"
3,10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28.0,25.0,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881905,-87.755119,"(41.881903443, -87.755121152)"
4,10224742,HY411435,09/05/2015 10:55:00 AM,082XX S LOOMIS BLVD,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21.0,71.0,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744377,-87.658432,"(41.744378879, -87.658430635)"


In [46]:
df.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location'], dtype='object')

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7084435 entries, 0 to 7084434
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    Int32  
 1   Case Number           object 
 2   Date                  object 
 3   Block                 object 
 4   IUCR                  object 
 5   Primary Type          object 
 6   Description           object 
 7   Location Description  object 
 8   Arrest                bool   
 9   Domestic              bool   
 10  Beat                  Int32  
 11  District              Int32  
 12  Ward                  float32
 13  Community Area        float32
 14  FBI Code              object 
 15  X Coordinate          float32
 16  Y Coordinate          float32
 17  Year                  Int32  
 18  Updated On            object 
 19  Latitude              float32
 20  Longitude             float32
 21  Location              object 
dtypes: Int32(4), bool(2), float32(6), object(1

In [48]:
df.isna().sum()

ID                      0
Case Number             0
Date                    0
Block                   0
IUCR                    0
Primary Type            0
Description             0
Location Description    0
Arrest                  0
Domestic                0
Beat                    0
District                0
Ward                    0
Community Area          0
FBI Code                0
X Coordinate            0
Y Coordinate            0
Year                    0
Updated On              0
Latitude                0
Longitude               0
Location                0
dtype: int64

In [49]:
df.nunique()

ID                      7084435
Case Number             7083910
Date                    2935205
Block                     37417
IUCR                        401
Primary Type                 35
Description                 540
Location Description        215
Arrest                        2
Domestic                      2
Beat                        303
District                     24
Ward                         50
Community Area               78
FBI Code                     26
X Coordinate              75460
Y Coordinate             125705
Year                         23
Updated On                 4332
Latitude                  92686
Longitude                 38541
Location                 672492
dtype: int64

In [54]:
arrest_district = df.groupby("Ward")["Arrest"].value_counts(normalize=True) * 100
arrest_district
# arrest_district.plot(kind="pie", autopct='%1.1f%%')

Ward  Arrest
1.0   False     80.295352
      True      19.704648
2.0   False     70.281067
      True      29.718933
3.0   False     67.084868
                  ...    
48.0  True      23.762667
49.0  False     74.028533
      True      25.971467
50.0  False     83.984327
      True      16.015673
Name: proportion, Length: 100, dtype: float64