In [2]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = "dataset.csv"  # Update this path as needed
data = pd.read_csv(file_path)

# 1. **Data Integrity**
# Flag every row that is an exact repeat of an earlier row.
duplicates = data.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
# Keep only the rows that are not flagged. The explicit .copy() makes `data`
# an independent frame, so the column assignments performed in later cells do
# not raise SettingWithCopyWarning (pandas without copy-on-write) when rows
# were actually removed here.
data = data.loc[~duplicates].copy()




Number of duplicate rows: 0


In [3]:
# 2. **Handling Missing Data**
# Report how many null values each column holds before anything is filled.
missing_summary = data.isnull().sum()
print("Missing data summary:")
print(missing_summary)

# Placeholder written into each column that may contain nulls. No rows are
# dropped in this step; every gap is filled instead.
#   name / host_name  -> 'Unknown'
#   reviews_per_month -> 0 (no reviews)
#   last_review       -> 'No Review'
# NOTE(review): 'No Review' is a text marker; if last_review is meant to be
# parsed as a date downstream, the marker will need special handling - confirm.
fill_values = {
    'name': 'Unknown',
    'host_name': 'Unknown',
    'reviews_per_month': 0,
    'last_review': 'No Review',
}
for column, placeholder in fill_values.items():
    data[column] = data[column].fillna(placeholder)

Missing data summary:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [7]:

# 4. **Standardization**
# Put the free-text columns into Title Case so equal values compare equal.
for text_column in ('name', 'host_name'):
    data[text_column] = data[text_column].str.title()

# Trim 'price' at both ends: first discard listings priced at 0 or below,
# then cap at the 99th percentile of the prices that remain.
# Caution: `data` is overwritten and the percentile is recomputed from it, so
# running this cell a second time trims the frame again.
positive_priced = data.loc[data['price'].gt(0)]
upper_price_limit = positive_priced['price'].quantile(0.99)
data = positive_priced.loc[positive_priced['price'].le(upper_price_limit)]

In [5]:
# 5. **Outlier Detection**
# Rows whose 'minimum_nights' exceeds the column's 99th percentile are removed.
min_nights_upper_limit = data['minimum_nights'].quantile(0.99)
within_nights_limit = data['minimum_nights'].le(min_nights_upper_limit)
data = data.loc[within_nights_limit]

# 6. Save the cleaned dataset
# The index is left out of the file; only the data columns are written.
output_path = "cleaned_AB_NYC_2019.csv"
data.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to {output_path}")

Cleaned dataset saved to cleaned_AB_NYC_2019.csv


In [6]:

# 7. Summary of cleaned data
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
# describe() returns a DataFrame of summary statistics, which is printed.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 47928 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              47928 non-null  int64  
 1   name                            47928 non-null  object 
 2   host_id                         47928 non-null  int64  
 3   host_name                       47928 non-null  object 
 4   neighbourhood_group             47928 non-null  object 
 5   neighbourhood                   47928 non-null  object 
 6   latitude                        47928 non-null  float64
 7   longitude                       47928 non-null  float64
 8   room_type                       47928 non-null  object 
 9   price                           47928 non-null  int64  
 10  minimum_nights                  47928 non-null  int64  
 11  number_of_reviews               47928 non-null  int64  
 12  last_review                     47928