In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np

# Step 2: Load Dataset
import io

from google.colab import files

# files.upload() opens Colab's upload widget and returns a dict that maps each
# uploaded file name to that file's raw bytes.
uploaded = files.upload()

DATASET_NAME = 'AB_NYC_2019.csv'

if DATASET_NAME in uploaded:
    # Expected case: parse the bytes that were just uploaded.
    csv_source = io.BytesIO(uploaded[DATASET_NAME])
elif len(uploaded) == 1:
    # The single uploaded file arrived under a different name (for example a
    # renamed re-upload); use its content instead of failing on the
    # hard-coded name.
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    # Nothing usable was uploaded (cancelled, or several unrelated files):
    # fall back to a copy already present in the working directory.
    csv_source = DATASET_NAME

df = pd.read_csv(csv_source)

# Step 3: Preview Data
# Quick look at the raw data: first rows, column summary, and overall size.
print("📌 First 5 Rows:")
print(df.head())

print("\n🔍 Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the summary.
df.info()

print("\n📏 Shape of Dataset:", df.shape)

# Step 4: Check for Missing Values
# Per-column count of nulls before any cleaning is applied.
null_counts_before = df.isnull().sum()
print("\n🧼 Missing Values Before Cleaning:")
print(null_counts_before)

# Step 5: Handle Missing Values
# Columns that get a placeholder instead of being dropped:
#   reviews_per_month -> 0
#   last_review       -> 'No Review'
fill_values = {
    'reviews_per_month': 0,
    'last_review': 'No Review',
}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

# Any row that still has a null in any column is removed.
df.dropna(axis=0, how='any', inplace=True)

null_counts_after = df.isnull().sum()
print("\n🧼 Missing Values After Cleaning:")
print(null_counts_after)

# Step 6: Remove Duplicates
# Rows that are exact repeats of an earlier row are flagged True in the mask.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("\n🔍 Duplicates Found:", duplicates)

# Keep the first occurrence of each repeated row and drop the rest in place.
df.drop_duplicates(keep='first', inplace=True)
print("✅ Duplicates Removed")

# Step 7: Handle Outliers
# Keep only listings priced at 1000 or less with a minimum stay of 365 nights
# or less.
# NOTE(review): only upper bounds are applied here; zero or negative values,
# if any exist in the data, are kept — confirm that is intended.
price_in_range = df['price'] <= 1000
nights_in_range = df['minimum_nights'] <= 365
df = df[price_in_range & nights_in_range]

print("\n📏 Shape After Outlier Removal:", df.shape)

# Step 8: Standardize Column Names
# Normalize headers in three steps: trim surrounding whitespace, lower-case,
# then turn spaces into underscores.
trimmed_names = df.columns.str.strip()
lowered_names = trimmed_names.str.lower()
df.columns = lowered_names.str.replace(" ", "_")

# Step 9: Save Cleaned Data
# Write the cleaned frame to the working directory without the index column.
cleaned_path = 'AB_NYC_2019_Cleaned.csv'
df.to_csv(cleaned_path, index=False)

print("\n💾 Cleaned dataset saved as 'AB_NYC_2019_Cleaned.csv'")


Saving AB_NYC_2019.csv to AB_NYC_2019.csv
📌 First 5 Rows:
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     