In [2]:
import os
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [3]:
PROJECT_ROOT = os.getcwd()

DATA_RAW_DIR = os.path.join(PROJECT_ROOT,"data_raw")
DATA_CLEAN_DIR = os.path.join(PROJECT_ROOT, "data_clean")
DATA_FINAL_DIR = os.path.join(PROJECT_ROOT, "data_final")

PROJECT_ROOT, DATA_RAW_DIR

('c:\\Users\\Christopher\\Documents\\Python Projects\\New_York_City_Eviction',
 'c:\\Users\\Christopher\\Documents\\Python Projects\\New_York_City_Eviction\\data_raw')

In [4]:
raw_csv_path = os.path.join(DATA_RAW_DIR, "evictions_raw.csv")
print("Loading from:", raw_csv_path)

evictions_raw = pd.read_csv(raw_csv_path)
print("Shape:", evictions_raw.shape)

Loading from: c:\Users\Christopher\Documents\Python Projects\New_York_City_Eviction\data_raw\evictions_raw.csv
Shape: (118835, 20)


In [5]:
evictions_raw.head()

Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apt_num,executed_date,marshal_first_name,marshal_last_name,residential_commercial_ind,borough,eviction_zip,ejectment,eviction_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,B325757/22,107082,2244 CRESTON AVENUE,5H,2024-10-29T00:00:00.000,Ileana,Rivera,Residential,BRONX,10453,Not an Ejectment,Possession,40.856808,-73.9015,5.0,14.0,23704.0,2013766.0,2031630000.0,Fordham South
1,K101614/16,377963,2612 WEST STREET,1F,2017-05-05T00:00:00.000,Richard,McCoy,Residential,BROOKLYN,11223,Not an Ejectment,Possession,40.586018,-73.969376,13.0,47.0,37402.0,3320727.0,3072350000.0,Gravesend
2,341356/23,5711,100 ELGAR PLACE,9C,2025-10-21T00:00:00.000,Salavatore,Giglio,Residential,BRONX,10475,Not an Ejectment,Possession,40.863634,-73.821678,10.0,12.0,302.0,2093859.0,2051350000.0,Co-op City
3,306977/25,54919,20-21 46TH STREET,,2025-10-07T00:00:00.000,Edward,Guida,Residential,QUEENS,11105,Not an Ejectment,Possession,40.773462,-73.898619,1.0,22.0,12301.0,4014729.0,4007720000.0,Steinway
4,57624/17,75131,2750 HOMECREST AVE LOT G - SPACE #2 OF THE HOLLYWOOBUILDING,,2017-10-20T00:00:00.000,Henry,Daley,Commercial,BROOKLYN,11235,Not an Ejectment,Possession,,,,,,,,


In [6]:
evictions_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118835 entries, 0 to 118834
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   court_index_number          118835 non-null  object 
 1   docket_number               118835 non-null  int64  
 2   eviction_address            118835 non-null  object 
 3   eviction_apt_num            101328 non-null  object 
 4   executed_date               118835 non-null  object 
 5   marshal_first_name          118835 non-null  object 
 6   marshal_last_name           118835 non-null  object 
 7   residential_commercial_ind  118835 non-null  object 
 8   borough                     118835 non-null  object 
 9   eviction_zip                118835 non-null  int64  
 10  ejectment                   118835 non-null  object 
 11  eviction_possession         118835 non-null  object 
 12  latitude                    108316 non-null  float64
 13  longitude     

In [7]:
evictions_raw.isna().sum().sort_values(ascending=False)

eviction_apt_num              17507
bbl                           10883
bin                           10883
nta                           10519
census_tract                  10519
council_district              10519
community_board               10519
longitude                     10519
latitude                      10519
eviction_possession               0
court_index_number                0
docket_number                     0
eviction_zip                      0
borough                           0
residential_commercial_ind        0
marshal_last_name                 0
marshal_first_name                0
executed_date                     0
eviction_address                  0
ejectment                         0
dtype: int64

In [8]:

evictions_raw[["borough", "residential_commercial_ind", "ejectment", "eviction_possession"]].nunique()

borough                       5
residential_commercial_ind    2
ejectment                     2
eviction_possession           3
dtype: int64

In [9]:
evictions_raw["executed_date"] = pd.to_datetime(
    evictions_raw["executed_date"],
    errors="coerce"
)

evictions_raw["executed_date"].head()

0   2024-10-29
1   2017-05-05
2   2025-10-21
3   2025-10-07
4   2017-10-20
Name: executed_date, dtype: datetime64[ns]

In [10]:
evictions_raw["latitude"] = pd.to_numeric(evictions_raw["latitude"], errors="coerce")
evictions_raw["longitude"] = pd.to_numeric(evictions_raw["longitude"], errors="coerce")

In [11]:
numeric_cols = [
    "community_board",
    "council_district",
    "census_tract",
    "bin",
    "bbl"
]

for col in numeric_cols:
    evictions_raw[col] = pd.to_numeric(evictions_raw[col], errors="coerce")

In [12]:
text_cols = [
    "borough",
    "residential_commercial_ind",
    "ejectment",
    "eviction_possession",
    "eviction_address",
    "eviction_apt_num",
    "marshal_first_name",
    "marshal_last_name",
    "nta"
]

for col in text_cols:
    evictions_raw[col] = evictions_raw[col].astype(str).str.strip()

In [13]:
evictions_raw.dtypes

court_index_number                    object
docket_number                          int64
eviction_address                      object
eviction_apt_num                      object
executed_date                 datetime64[ns]
marshal_first_name                    object
marshal_last_name                     object
residential_commercial_ind            object
borough                               object
eviction_zip                           int64
ejectment                             object
eviction_possession                   object
latitude                             float64
longitude                            float64
community_board                      float64
council_district                     float64
census_tract                         float64
bin                                  float64
bbl                                  float64
nta                                   object
dtype: object

In [14]:
# Year as integer (e.g., 2024)
evictions_raw["year"] = evictions_raw["executed_date"].dt.year

# Month as integer 1-12
evictions_raw["month"] = evictions_raw["executed_date"].dt.month

# Year-month as a string label (e.g., "2024-10")
evictions_raw["year_month"] = evictions_raw["executed_date"].dt.to_period("M").astype(str)

evictions_raw[["executed_date", "year", "month", "year_month"]].head(20)

Unnamed: 0,executed_date,year,month,year_month
0,2024-10-29,2024,10,2024-10
1,2017-05-05,2017,5,2017-05
2,2025-10-21,2025,10,2025-10
3,2025-10-07,2025,10,2025-10
4,2017-10-20,2017,10,2017-10
5,2019-10-03,2019,10,2019-10
6,2025-09-16,2025,9,2025-09
7,2017-10-10,2017,10,2017-10
8,2019-07-17,2019,7,2019-07
9,2024-07-17,2024,7,2024-07


In [15]:
evictions_raw[["year", "month", "year_month"]].describe(include="all")

Unnamed: 0,year,month,year_month
count,118835.0,118835.0,118835
unique,,,100
top,,,2018-01
freq,,,2246
mean,2020.604283,6.159381,
std,3.035116,3.36938,
min,2017.0,1.0,
25%,2018.0,3.0,
50%,2019.0,6.0,
75%,2024.0,9.0,


In [16]:
evictions_raw["borough"].value_counts()

borough
BRONX            37604
BROOKLYN         33536
QUEENS           23701
MANHATTAN        19667
STATEN ISLAND     4327
Name: count, dtype: int64

In [17]:
evictions_raw["residential_commercial_ind"].value_counts()

residential_commercial_ind
Residential    108177
Commercial      10658
Name: count, dtype: int64

In [18]:
evictions_raw["ejectment"].value_counts()

ejectment
Not an Ejectment    118748
Ejectment               87
Name: count, dtype: int64

In [19]:
evictions_raw["eviction_possession"].value_counts()

eviction_possession
Possession     117162
Eviction         1672
Unspecified         1
Name: count, dtype: int64

In [20]:
evictions_raw["borough"] = (
    evictions_raw["borough"]
    .astype(str)
    .str.strip()
    .str.upper()
)

evictions_raw["residential_commercial_ind"] = (
    evictions_raw["residential_commercial_ind"]
    .astype(str)
    .str.strip()
    .str.title()
)

evictions_raw["nta"] = (
    evictions_raw["nta"]
    .astype(str)
    .str.strip()
    .str.title()
)

evictions_raw["marshal_first_name"] = (
    evictions_raw["marshal_first_name"]
    .astype(str)
    .str.strip()
    .str.title()
)

evictions_raw["marshal_last_name"] = (
    evictions_raw["marshal_last_name"]
    .astype(str)
    .str.strip()
    .str.title()
)

In [21]:
# Normalize ejectment text
evictions_raw["eviction_clean"] = (
    evictions_raw["ejectment"]
    .astype(str)
    .str.strip()
    .str.lower()
)

evictions_raw[evictions_raw["ejectment"] == "Ejectment"].head(100)

evictions_raw["ejectment"].value_counts()

# Map to standardized labels
ejectment_map = {
    "ejectment": "Ejectment",
    "not an ejectment": "Not an Ejectment"
}

evictions_raw["ejectment_standardized"] = (
    evictions_raw["eviction_clean"].map(ejectment_map).fillna("Other/Unknown")
)

# Boolean flag for analysis (True if it's an ejectment case)
evictions_raw["is_ejectment"] = evictions_raw["ejectment_standardized"].eq("Ejectment")

evictions_raw[["ejectment", "ejectment_standardized", "is_ejectment"]].value_counts()

ejectment         ejectment_standardized  is_ejectment
Not an Ejectment  Not an Ejectment        False           118748
Ejectment         Ejectment               True                87
Name: count, dtype: int64

In [22]:
evictions_raw.columns

Index(['court_index_number', 'docket_number', 'eviction_address',
       'eviction_apt_num', 'executed_date', 'marshal_first_name',
       'marshal_last_name', 'residential_commercial_ind', 'borough',
       'eviction_zip', 'ejectment', 'eviction_possession', 'latitude',
       'longitude', 'community_board', 'council_district', 'census_tract',
       'bin', 'bbl', 'nta', 'year', 'month', 'year_month', 'eviction_clean',
       'ejectment_standardized', 'is_ejectment'],
      dtype='object')

In [23]:
evictions_raw["eviction_possession"] = (
    evictions_raw["eviction_possession"]
    .astype(str)
    .str.strip()
    .str.title()
)

evictions_raw["eviction_possession"].value_counts()

eviction_possession
Possession     117162
Eviction         1672
Unspecified         1
Name: count, dtype: int64

In [24]:
evictions_raw[["borough", "residential_commercial_ind", "ejectment_standardized", "is_ejectment", "eviction_possession"]].head()

Unnamed: 0,borough,residential_commercial_ind,ejectment_standardized,is_ejectment,eviction_possession
0,BRONX,Residential,Not an Ejectment,False,Possession
1,BROOKLYN,Residential,Not an Ejectment,False,Possession
2,BRONX,Residential,Not an Ejectment,False,Possession
3,QUEENS,Residential,Not an Ejectment,False,Possession
4,BROOKLYN,Commercial,Not an Ejectment,False,Possession


In [25]:
evictions_raw.isna().sum().sort_values(ascending=False)

bbl                           10883
bin                           10883
longitude                     10519
community_board               10519
census_tract                  10519
council_district              10519
latitude                      10519
ejectment_standardized            0
eviction_clean                    0
year_month                        0
month                             0
year                              0
nta                               0
court_index_number                0
docket_number                     0
eviction_possession               0
ejectment                         0
eviction_zip                      0
borough                           0
residential_commercial_ind        0
marshal_last_name                 0
marshal_first_name                0
executed_date                     0
eviction_apt_num                  0
eviction_address                  0
is_ejectment                      0
dtype: int64

In [31]:
missing_geo = evictions_raw[evictions_raw["latitude"].isna() | evictions_raw["longitude"].isna()]

missing_geo.head(30)
missing_geo.shape

(10519, 27)

In [29]:
evictions_raw["has_geo"] = evictions_raw["latitude"].notna() & evictions_raw["longitude"].notna()

In [30]:
invalid_lat = ~evictions_raw["latitude"].between(40.0, 41.0, inclusive="both")
invalid_lon = ~evictions_raw["longitude"].between(-75.0, -73.0, inclusive="both")

invalid_rows = evictions_raw[invalid_lat | invalid_lon]
invalid_rows.shape

(10519, 27)

In [32]:
evictions_raw["has_geo"].value_counts()

has_geo
True     108316
False     10519
Name: count, dtype: int64

In [33]:
cols_order = [
    # IDs
    "court_index_number",
    "docket_number",

    # Time
    "executed_date",
    "year",
    "month",
    "year_month",

    # Location + address
    "borough",
    "eviction_address",
    "eviction_apt_num",
    "eviction_zip",
    "nta",
    "community_board",
    "council_district",
    "census_tract",
    "bin",
    "bbl",

    # Coordinates + geo flag
    "latitude",
    "longitude",
    "has_geo",

    # Case type + flags
    "residential_commercial_ind",
    "ejectment",
    "ejectment_standardized",
    "is_ejectment",
    "eviction_possession",

    # Marshal info
    "marshal_first_name",
    "marshal_last_name",
]

# Build the cleaned dataframe in the desired column order
evictions_clean = evictions_raw[cols_order].copy()

evictions_clean.head()


Unnamed: 0,court_index_number,docket_number,executed_date,year,month,year_month,borough,eviction_address,eviction_apt_num,eviction_zip,nta,community_board,council_district,census_tract,bin,bbl,latitude,longitude,has_geo,residential_commercial_ind,ejectment,ejectment_standardized,is_ejectment,eviction_possession,marshal_first_name,marshal_last_name
0,B325757/22,107082,2024-10-29,2024,10,2024-10,BRONX,2244 CRESTON AVENUE,5H,10453,Fordham South,5.0,14.0,23704.0,2013766.0,2031630000.0,40.856808,-73.9015,True,Residential,Not an Ejectment,Not an Ejectment,False,Possession,Ileana,Rivera
1,K101614/16,377963,2017-05-05,2017,5,2017-05,BROOKLYN,2612 WEST STREET,1F,11223,Gravesend,13.0,47.0,37402.0,3320727.0,3072350000.0,40.586018,-73.969376,True,Residential,Not an Ejectment,Not an Ejectment,False,Possession,Richard,Mccoy
2,341356/23,5711,2025-10-21,2025,10,2025-10,BRONX,100 ELGAR PLACE,9C,10475,Co-Op City,10.0,12.0,302.0,2093859.0,2051350000.0,40.863634,-73.821678,True,Residential,Not an Ejectment,Not an Ejectment,False,Possession,Salavatore,Giglio
3,306977/25,54919,2025-10-07,2025,10,2025-10,QUEENS,20-21 46TH STREET,,11105,Steinway,1.0,22.0,12301.0,4014729.0,4007720000.0,40.773462,-73.898619,True,Residential,Not an Ejectment,Not an Ejectment,False,Possession,Edward,Guida
4,57624/17,75131,2017-10-20,2017,10,2017-10,BROOKLYN,2750 HOMECREST AVE LOT G - SPACE #2 OF THE HOLLYWOOBUILDING,,11235,Nan,,,,,,,,False,Commercial,Not an Ejectment,Not an Ejectment,False,Possession,Henry,Daley


In [34]:
evictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118835 entries, 0 to 118834
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   court_index_number          118835 non-null  object        
 1   docket_number               118835 non-null  int64         
 2   executed_date               118835 non-null  datetime64[ns]
 3   year                        118835 non-null  int32         
 4   month                       118835 non-null  int32         
 5   year_month                  118835 non-null  object        
 6   borough                     118835 non-null  object        
 7   eviction_address            118835 non-null  object        
 8   eviction_apt_num            118835 non-null  object        
 9   eviction_zip                118835 non-null  int64         
 10  nta                         118835 non-null  object        
 11  community_board             108316 non-

In [36]:
int_cols = [
    "community_board",
    "council_district",
    "census_tract",
    "bin",
    "bbl"
]

for col in int_cols:
    evictions_clean[col] = evictions_clean[col].astype("Int64")

evictions_clean.dtypes

court_index_number                    object
docket_number                          int64
executed_date                 datetime64[ns]
year                                   int32
month                                  int32
year_month                            object
borough                               object
eviction_address                      object
eviction_apt_num                      object
eviction_zip                           int64
nta                                   object
community_board                        Int64
council_district                       Int64
census_tract                           Int64
bin                                    Int64
bbl                                    Int64
latitude                             float64
longitude                            float64
has_geo                                 bool
residential_commercial_ind            object
ejectment                             object
ejectment_standardized                object
is_ejectme

In [37]:
evictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118835 entries, 0 to 118834
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   court_index_number          118835 non-null  object        
 1   docket_number               118835 non-null  int64         
 2   executed_date               118835 non-null  datetime64[ns]
 3   year                        118835 non-null  int32         
 4   month                       118835 non-null  int32         
 5   year_month                  118835 non-null  object        
 6   borough                     118835 non-null  object        
 7   eviction_address            118835 non-null  object        
 8   eviction_apt_num            118835 non-null  object        
 9   eviction_zip                118835 non-null  int64         
 10  nta                         118835 non-null  object        
 11  community_board             108316 non-

In [None]:
clean_csv_path = os.path.join(DATA_CLEAN_DIR, "evictions_clean.csv")
evictions_clean.to_csv(clean_csv_path, index=False)

clean_csv_path

'c:\\Users\\Christopher\\Documents\\Python Projects\\New_York_City_Eviction\\data_clean\\evictions_clean.csv'