In [1]:
import pandas as pd

In [2]:
# Read the raw listings CSV (path is relative to the notebook's working directory)
# into a DataFrame, then preview the first five rows.
file = "listings.csv"
df = pd.read_csv(file)
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,40824219,https://www.airbnb.com/rooms/40824219,20251001171547,2025-10-02,city scrape,Room close to Manhattan for FEMALE guests,This cozy spacious room includes a twin size b...,Sunnyside is a safe residental area. <br />The...,https://a0.muscache.com/pictures/hosting/Hosti...,317540555,...,4.88,4.94,4.69,,f,3,0,3,0,0.23
1,40833186,https://www.airbnb.com/rooms/40833186,20251001171547,2025-10-02,previous scrape,Soho LES East village private room downtown,,,https://a0.muscache.com/pictures/1f093bbc-936c...,68718914,...,,,,,t,1,0,1,0,
2,40837137,https://www.airbnb.com/rooms/40837137,20251001171547,2025-10-02,previous scrape,Sunset Park - Quiet and close to subway!,"Cozy, lovely bedroom with a comfortable full s...",the sunset park of Brooklyn,https://a0.muscache.com/pictures/01c4e91e-4012...,317770098,...,5.0,5.0,5.0,,f,1,0,1,0,0.01
3,40838018,https://www.airbnb.com/rooms/40838018,20251001171547,2025-10-02,previous scrape,Cozy One Bedroom in Clinton Hill,This sunny one-bedroom apartment is located in...,Clinton Hill is one of the best neighborhoods ...,https://a0.muscache.com/pictures/9322d54a-6eb7...,17211451,...,5.0,5.0,5.0,,t,1,1,0,0,0.01
4,40839416,https://www.airbnb.com/rooms/40839416,20251001171547,2025-10-02,city scrape,🪴XL dojo 🌾 shared green yogi palace apt 🌿,"New York City living at its best. A massive, c...",Live like the Ramones > The East Village is st...,https://a0.muscache.com/pictures/hosting/Hosti...,4765305,...,5.0,5.0,4.95,,f,8,0,8,0,0.4


In [3]:
# Show every column label in the raw frame so we can pick the ones to keep.
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [4]:
# Columns retained for the project, in the order they should appear in df_clean.
# Every other column of the raw frame is left out.
col_to_keep = [
    # listing identity and text
    "id",
    "listing_url",
    "name",
    "description",
    # host
    "host_id",
    "host_name",
    "host_listings_count",
    "host_total_listings_count",
    # location
    "neighbourhood_cleansed",
    "neighbourhood_group_cleansed",
    "latitude",
    "longitude",
    # unit details
    "room_type",
    "bathrooms",
    "bedrooms",
    "beds",
    # pricing and stay limits
    "price",
    "minimum_nights",
    "maximum_nights",
    # activity and review metrics
    "number_of_reviews",
    "estimated_occupancy_l365d",
    "estimated_revenue_l365d",
    "review_scores_rating",
    # remaining fields
    "license",
    "calculated_host_listings_count",
]

# Label-based selection: all rows, only the listed columns (KeyError if one is absent).
df_clean = df.loc[:, col_to_keep]

# Number of columns kept.
df_clean.shape[1]

25

In [5]:
# Count the missing (NaN) entries in each retained column.
df_clean.isna().sum()

id                                    0
listing_url                           0
name                                  2
description                         958
host_id                               0
host_name                          1166
host_listings_count                1167
host_total_listings_count          1167
neighbourhood_cleansed                0
neighbourhood_group_cleansed          0
latitude                              0
longitude                             0
room_type                             0
bathrooms                         14411
bedrooms                           5949
beds                              14468
price                             14783
minimum_nights                        0
maximum_nights                        0
number_of_reviews                     0
estimated_occupancy_l365d             0
estimated_revenue_l365d           14783
review_scores_rating              11188
license                           30735
calculated_host_listings_count        0


Several columns have missing values that will need to be handled before analysis and visualization.

I considered filling the missing values with 0, but that would distort statistics such as the mean and median. Instead, we will leave them as missing and handle them only where a specific analysis requires it.

In [6]:
# Columns removed from the simplified table.
unused_columns = [
    "host_listings_count",
    "host_total_listings_count",
    "bathrooms",
    "bedrooms",
    "beds",
    "number_of_reviews",
]

# fix spelling of neighborhood
american_spelling = {
    "neighbourhood_cleansed": "neighborhood_cleansed",
    "neighbourhood_group_cleansed": "neighborhood_group_cleansed",
}

# Both steps return new frames, so df_clean itself is left untouched.
simple_listings = (
    df_clean
    .drop(columns=unused_columns)
    .rename(columns=american_spelling)
)

simple_listings.columns

Index(['id', 'listing_url', 'name', 'description', 'host_id', 'host_name',
       'neighborhood_cleansed', 'neighborhood_group_cleansed', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights', 'maximum_nights',
       'estimated_occupancy_l365d', 'estimated_revenue_l365d',
       'review_scores_rating', 'license', 'calculated_host_listings_count'],
      dtype='object')

I think the remaining columns will be the most useful for our analysis and for building the project.

Name and description could be useful in our visualization. We could also include other fields such as price, minimum nights, estimated occupancy, estimated revenue, or review scores rating.

In [7]:
# save to .csv so we don't need to run this code again
# index=False keeps the row index out of the file. Without it, to_csv writes the
# index as an unnamed first column, which shows up as "Unnamed: 0" when the file
# is loaded again with pd.read_csv.
# NOTE(review): any existing reader that loads this file with index_col=0 should
# drop that argument once the file is regenerated.
simple_listings.to_csv("airbnb_clean.csv", index=False)