In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re

In [2]:
# Location of the raw listings export.
file_path = 'data/listings.csv'

# Read the CSV, discard fully duplicated rows, then discard any row that is
# missing one of the fields listed in `subset`.
df = (
    pd.read_csv(file_path)
    .drop_duplicates()
    .dropna(subset=['price', 'bedrooms', 'review_scores_rating', 'beds'])
)

def extract_bathrooms(bathrooms_text):
    """Parse the leading number from a bathrooms description.

    Parameters
    ----------
    bathrooms_text : str
        Free-text description whose first whitespace-separated token is
        expected to be numeric, e.g. ``"1.5 baths"``.

    Returns
    -------
    float
        The first token converted to ``float``, or ``np.nan`` when the value
        has no usable ``split`` method, is empty, or does not start with a
        number.
    """
    try:
        return float(bathrooms_text.split()[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: input without .split(), such as None or a float NaN.
        # IndexError: empty or whitespace-only text yields no tokens.
        # ValueError / TypeError: the first token cannot be converted to float.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan

# Fill missing values in `bathrooms` with the number parsed from the
# corresponding `bathrooms_text` entry.
df['bathrooms'] = df['bathrooms'].fillna(df['bathrooms_text'].apply(extract_bathrooms))

# Strip "$" and "," from `price` and convert the column to float.
# The pattern is a raw string so that "\$" is not read as an (invalid)
# string escape sequence; the regex itself is unchanged.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Parse the review date columns as datetimes.
df['first_review'] = pd.to_datetime(df['first_review'])
df['last_review'] = pd.to_datetime(df['last_review'])



In [3]:
import unicodedata

def remove_unicode(text):
    """Return `text` in NFKD form with combining characters removed.

    The string is decomposed with ``unicodedata.normalize('NFKD', ...)`` and
    every character whose combining class is non-zero is dropped. Characters
    that are not combining marks are kept, whether or not they are ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    kept = [char for char in decomposed if unicodedata.combining(char) == 0]
    return ''.join(kept)

def clean_amenities(amenities):
    """Split a raw amenities string into a list of cleaned amenity names.

    Surrounding ``{``/``}`` characters are stripped, then every run of word
    characters, whitespace and apostrophes is taken as one candidate name
    (any other character acts as a separator). Each candidate is passed
    through ``remove_unicode``, trimmed of spaces, quotes and commas, and has
    its internal whitespace collapsed to single spaces. Empty results are
    discarded.

    Parameters
    ----------
    amenities : str or None
        Raw amenities text. ``None`` and float NaN are treated as
        "no amenities".

    Returns
    -------
    list of str
        Cleaned amenity names in their original order; ``[]`` for missing
        input.
    """
    # A missing cell has no .strip(); NaN is the only float that is not equal
    # to itself, so this check needs no extra imports.
    if amenities is None or (isinstance(amenities, float) and amenities != amenities):
        return []

    fragments = re.findall(r'[\w\s\']+', amenities.strip('{}'))
    cleaned_list = [
        ' '.join(remove_unicode(fragment).strip(' "\',').split())
        for fragment in fragments
    ]
    return [name for name in cleaned_list if name]

In [4]:
# Replace each raw amenities string with the list returned by clean_amenities.
df = df.assign(amenities=df['amenities'].apply(clean_amenities))

# Preview the first rows of the frame.
print(df.head())


                    id                                      listing_url  \
3   785706334254160321  https://www.airbnb.com/rooms/785706334254160321   
4             38675099            https://www.airbnb.com/rooms/38675099   
5   711651903116237219  https://www.airbnb.com/rooms/711651903116237219   
6             17188701            https://www.airbnb.com/rooms/17188701   
10  660814025913716692  https://www.airbnb.com/rooms/660814025913716692   

         scrape_id last_scraped       source  \
3   20240604143925   2024-06-04  city scrape   
4   20240604143925   2024-06-04  city scrape   
5   20240604143925   2024-06-04  city scrape   
6   20240604143925   2024-06-04  city scrape   
10  20240604143925   2024-06-04  city scrape   

                                               name  \
3                             Mint Hill One Bedroom   
4     #aD43 Double Sunny Room in near SoMa / Center   
5   Pristine 1 bedroom unit near Mission Dolores SF   
6                             Cozy Singl

In [5]:
# Columns removed from the frame before it is exported.
# Each name appears exactly once, one per line.
columns_to_drop = [
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'description',
    'host_url',
    'neighborhood_overview',
    'picture_url',
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    'calendar_updated',
    'calendar_last_scraped',
    'license',
    'instant_bookable',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'neighbourhood_group_cleansed',
    'neighbourhood',
    'property_type',
    'minimum_nights',
    'maximum_nights',
    'minimum_minimum_nights',
    'number_of_reviews_l30d',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'number_of_reviews_ltm',
    'maximum_nights_avg_ntm',
    'has_availability',
    'availability_30',
    'availability_60',
    'availability_90',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'bathrooms_text',
    'host_since',
    'host_is_superhost',
]

# Raises KeyError if any listed column is absent from the frame.
df = df.drop(columns=columns_to_drop)

# Write the remaining columns to disk without the index column.
df.to_csv('data/cleaned.csv', index=False)