In [None]:
# Dependencies
import pandas as pd
from pathlib import Path

## Importing and cleaning up data

In [None]:
# Load the raw 2020 listings export into a dataframe.
# low_memory=False is passed to read_csv (presumably to avoid mixed-dtype
# column inference on this file; confirm against the raw data).
data_2020 = Path("resources/AB_US_2020.csv")
df_2020 = pd.read_csv(data_2020, low_memory=False)

# Load the raw 2023 listings export the same way.
data_2023 = Path("resources/AB_US_2023.csv")
df_2023 = pd.read_csv(data_2023, low_memory=False)

In [None]:
# Preview the first rows of the 2020 data as loaded from the csv
df_2020.head()

In [None]:
# Preview the first rows of the 2023 data as loaded from the csv
df_2023.head()

In [None]:
# List the distinct values in the 2020 "city" column
# (the city filter further down matches against these exact labels)
df_2020["city"].unique()

In [None]:
# List the column labels of the 2020 data
df_2020.columns

In [None]:
# List the distinct values in the 2023 "city" column
# (the city filter further down matches against these exact labels)
df_2023["city"].unique()

In [None]:
# List the column labels of the 2023 data
df_2023.columns

## Narrow the data to the target cities

In [None]:
# Keep only the 2020 rows whose city is San Francisco, New York City,
# or Twin Cities MSA
df_2020 = df_2020.loc[
    df_2020["city"].isin(["San Francisco", "New York City", "Twin Cities MSA"])
]

# Preview the narrowed 2020 data
df_2020.head()

In [None]:
# Keep only the 2023 rows whose city is San Francisco, New York City,
# or Twin Cities MSA
df_2023 = df_2023.loc[
    df_2023["city"].isin(["San Francisco", "New York City", "Twin Cities MSA"])
]

# Preview the narrowed 2023 data
df_2023.head()

## Narrow the data to the target columns

In [None]:
# Keep only the columns needed for the review (2020)
df_2020 = df_2020[
    [
        "id",
        "name",
        "host_id",
        "latitude",
        "longitude",
        "room_type",
        "price",
        "minimum_nights",
        "number_of_reviews",
        "calculated_host_listings_count",
        "availability_365",
        "city",
    ]
]


In [None]:
# Replace the raw csv column labels with display-friendly names (2020)
df_2020 = df_2020.rename(
    columns={
        "id": "Listing ID",
        "name": "Description",
        "host_id": "Host ID",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "room_type": "Listing Type",
        "price": "Price",
        "minimum_nights": "Minimum Nights",
        "number_of_reviews": "Number of Reviews",
        "calculated_host_listings_count": "Number of Listings Per Host",
        "availability_365": "Availability",
        "city": "City",
    }
)

In [None]:
# Put the 2020 columns in their final order.
# "Description" is not in this list, so it is dropped at this step.
df_2020 = df_2020[
    [
        "Listing ID",
        "Host ID",
        "Number of Listings Per Host",
        "City",
        "Latitude",
        "Longitude",
        "Listing Type",
        "Price",
        "Minimum Nights",
        "Availability",
        "Number of Reviews",
    ]
]



In [None]:
# Drop the 2020 listings whose Availability value is 0.
# NOTE(review): presumably 0 means the listing had no bookable days - confirm.
df_2020 = df_2020[df_2020["Availability"] != 0]

In [None]:
# Use the listing id as the row index of the 2020 data
df_2020 = df_2020.set_index(keys="Listing ID")

# Preview the cleaned 2020 data
df_2020.head()

In [None]:

# Keep only the columns needed for the review (2023)
df_2023 = df_2023[
    [
        "id",
        "name",
        "host_id",
        "latitude",
        "longitude",
        "room_type",
        "price",
        "minimum_nights",
        "number_of_reviews",
        "calculated_host_listings_count",
        "availability_365",
        "city",
    ]
]


In [None]:
# Replace the raw csv column labels with display-friendly names (2023)
df_2023 = df_2023.rename(
    columns={
        "id": "Listing ID",
        "name": "Description",
        "host_id": "Host ID",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "room_type": "Listing Type",
        "price": "Price",
        "minimum_nights": "Minimum Nights",
        "number_of_reviews": "Number of Reviews",
        "calculated_host_listings_count": "Number of Listings Per Host",
        "availability_365": "Availability",
        "city": "City",
    }
)

In [None]:
# Put the 2023 columns in their final order.
# "Description" is not in this list, so it is dropped at this step.
df_2023 = df_2023[
    [
        "Listing ID",
        "Host ID",
        "Number of Listings Per Host",
        "City",
        "Latitude",
        "Longitude",
        "Listing Type",
        "Price",
        "Minimum Nights",
        "Availability",
        "Number of Reviews",
    ]
]

In [None]:
# Drop the 2023 listings whose Availability value is 0.
# NOTE(review): presumably 0 means the listing had no bookable days - confirm.
df_2023 = df_2023[df_2023["Availability"] != 0]

In [None]:
# Use the listing id as the row index of the 2023 data
df_2023 = df_2023.set_index(keys="Listing ID")

# Preview the cleaned 2023 data
df_2023.head()

## Exporting finished dataframes to csv files

In [None]:
# Save finished dataframes as csv files.
# to_csv does not create missing folders, so make sure "output" exists first;
# exist_ok=True keeps this cell safe to re-run.
Path("output").mkdir(parents=True, exist_ok=True)

# index=True writes the "Listing ID" index as the first column of each file
df_2020.to_csv("output/clean_2020.csv", index=True)
df_2023.to_csv("output/clean_2023.csv", index=True)
