In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw listings CSV (expected in the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="AB_NYC_2019.csv")


In [3]:
# Snapshot of the raw data before any cleaning: the frame's dimensions,
# followed by the number of null entries in each column.
print("Original Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isna().sum())


Original Dataset Shape: (48895, 16)

Missing Values:
 id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [7]:
# Fill missing reviews_per_month with the column mean.
# fillna() returns a new Series, so the result must be assigned back to the
# column; calling it on its own only displays a temporary and leaves `df`
# unchanged.
# NOTE(review): reviews_per_month and last_review are missing for the same
# number of rows, so these are presumably listings with no reviews; 0 may be
# a more faithful fill value than the mean -- confirm before relying on it.
df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].mean())

# Echo the column so the cell still displays the filled values.
df['reviews_per_month']

0        0.210000
1        0.380000
2        1.373221
3        4.640000
4        0.100000
           ...   
48890    1.373221
48891    1.373221
48892    1.373221
48893    1.373221
48894    1.373221
Name: reviews_per_month, Length: 48895, dtype: float64

In [9]:
# Fill missing name and host_name with "unknown".
# fillna() returns a new Series rather than modifying the column, so each
# result is assigned back; otherwise the nulls remain in `df`.
df['name'] = df['name'].fillna("unknown")
df['host_name'] = df['host_name'].fillna("unknown")

# Echo host_name so the cell still displays the filled values.
df['host_name']

0                 John
1             Jennifer
2            Elisabeth
3          LisaRoxanne
4                Laura
             ...      
48890          Sabrina
48891          Marisol
48892    Ilgar & Aysel
48893              Taz
48894       Christophe
Name: host_name, Length: 48895, dtype: object

In [10]:
# Fill missing last_review with the placeholder "no review".
# fillna() returns a new Series, so the result is assigned back to the
# column; the unassigned call left the nulls in `df`.
df['last_review'] = df['last_review'].fillna("no review")

# Echo the column so the cell still displays the filled values.
df['last_review']

0        2018-10-19
1        2019-05-21
2         no review
3        2019-07-05
4        2018-11-19
            ...    
48890     no review
48891     no review
48892     no review
48893     no review
48894     no review
Name: last_review, Length: 48895, dtype: object

In [11]:
# 2. Remove Duplicates
# Rows are compared across all columns (subset=None) and the first
# occurrence of each duplicated row is the one retained.
df = df.drop_duplicates(subset=None, keep="first")

In [12]:
# 3. Standardization

# Normalise the column labels: trim surrounding whitespace first, then
# lowercase the result.
stripped_names = df.columns.str.strip()
df.columns = stripped_names.str.lower()


In [13]:
# Clean text columns
# Trim surrounding whitespace and lowercase every object-dtype column.
# Missing entries are kept as NaN: a bare astype(str) would turn them into
# the literal string "nan", which isnull() no longer reports as missing and
# which would make the final missing-value count misleading.
for col in df.select_dtypes(include=['object']).columns:
    normalized = df[col].astype(str).str.strip().str.lower()
    # where() keeps the cleaned text for present values and NaN elsewhere.
    df[col] = normalized.where(df[col].notna())

In [14]:
# 4. Outlier Detection & Handling

# Keep only listings priced at $1000 or below; anything above that is
# treated as an unrealistic price and dropped.
df = df[df['price'].le(1000)]

In [15]:
# Keep only listings whose minimum stay is at most 365 nights; longer
# minimum stays are treated as unrealistic and dropped.
within_one_year = df['minimum_nights'] <= 365
df = df[within_one_year]

In [16]:
# Cleaning summary
# -----------------------------
# Report the final frame dimensions and the total number of null cells
# remaining across all columns.
print("\n✅ Cleaning Completed!")
print("Cleaned Dataset Shape:", df.shape)
print("Remaining Missing Values:", df.isna().sum().sum())


✅ Cleaning Completed!
Cleaned Dataset Shape: (48642, 16)
Remaining Missing Values: 0


In [17]:
# Print the first five rows of the cleaned frame as a quick sanity check.
print("\nSample Cleaned Data:\n", df.head(5))


Sample Cleaned Data:
      id                                              name  host_id  \
0  2539                clean & quiet apt home by the park     2787   
1  2595                             skylit midtown castle     2845   
2  3647               the village of harlem....new york !     4632   
3  3831                   cozy entire floor of brownstone     4869   
4  5022  entire apt: spacious studio/loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         john            brooklyn    kensington  40.64749  -73.97237   
1     jennifer           manhattan       midtown  40.75362  -73.98377   
2    elisabeth           manhattan        harlem  40.80902  -73.94190   
3  lisaroxanne            brooklyn  clinton hill  40.68514  -73.95976   
4        laura           manhattan   east harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     private room    149               1