In [None]:
import pandas as pd

In [None]:
# Load the raw listings export; the path is relative, so it resolves against
# the notebook's working directory.
file_path = 'data/listings.csv'

# Read the CSV and discard fully duplicated rows in one step.
df = pd.read_csv(file_path).drop_duplicates()
print("Duplicates removed. Remaining rows:", df.shape[0])

# Per-column null counts, taken before any missing-value handling.
missing_values = df.isna().sum()
print("Missing values before handling:\n", missing_values)

In [None]:
# Keep only the rows that have both a price and a bedroom count.
df = df.dropna(subset=['price', 'bedrooms'])

# Refresh the per-column null counts now that those rows are gone.
missing_values = df.isna().sum()

In [None]:
# Normalise column dtypes: price -> float, date columns -> datetime,
# 't'-style flag columns -> bool.

# Strip '$' and ',' from price values before casting to float. The pattern is a
# raw string so that '\$' is not parsed as an (invalid) Python string escape,
# which emits a SyntaxWarning on recent Python versions.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

date_columns = ['host_since', 'last_scraped', 'first_review', 'last_review']
for col in date_columns:
    df[col] = pd.to_datetime(df[col])

boolean_columns = ['host_is_superhost', 'instant_bookable', 'has_availability']
for col in boolean_columns:
    # 't' maps to True; every other raw value (including missing) becomes False.
    # An already-converted True is also accepted so the cell is safe to re-run:
    # a plain == 't' comparison would turn every flag False on the second run.
    df[col] = df[col].isin(['t', True])

In [None]:
# Host tenure in whole days, measured from host_since up to the moment this
# cell runs (so the value changes from one run date to the next).
df['host_experience_days'] = (pd.to_datetime('today') - df['host_since']).dt.days

# Row-wise mean of the four availability-window columns.
availability_columns = ['availability_30', 'availability_60', 'availability_90', 'availability_365']
df['avg_availability'] = df[availability_columns].mean(axis=1)

# Lower-case the free-text columns and strip every character that is neither a
# word character nor whitespace.
text_columns = ['name', 'description', 'neighborhood_overview', 'host_about']
for col in text_columns:
    # regex=True is required: pandas >= 2.0 defaults Series.str.replace to a
    # literal match, which would leave all punctuation in place. The raw string
    # keeps '\w' and '\s' from being read as Python string escapes.
    df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Trim the top of the price distribution: rows priced at or above the
# 0.99 quantile are dropped.
# The threshold is computed from the current df, which is then overwritten,
# so re-running this cell trims the data again.
price_threshold = df['price'].quantile(q=0.99)
is_below_threshold = df['price'].lt(price_threshold)
df = df.loc[is_below_threshold]

In [None]:
# Columns removed from the cleaned frame. DataFrame.drop raises KeyError if any
# of these labels is missing from df.
columns_to_drop = [
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    'neighborhood_overview',
    'calendar_updated',
    'calendar_last_scraped',
    'first_review',
    'last_review',
    'license',
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'neighbourhood_group_cleansed',
    'neighbourhood',
]

# df itself is left as-is; the trimmed copy lives in df_cleaned.
df_cleaned = df.drop(columns_to_drop, axis='columns')


In [None]:
# Show the first 50 rows of the cleaned frame as this cell's output.
df_cleaned.head(n=50)