Load the Dataset (alternatively, comment out the paths below and load the dataset from your own computer)

In [None]:
from pathlib import Path

import pandas as pd

# Locations of the three raw CSV exports
calendar_path = './data/raw/calendar.csv'
listings_path = './data/raw/listings.csv'
reviews_path = './data/raw/reviews.csv'

# Read each CSV into its own DataFrame (calendar, listings, reviews — in that order)
calendar_df, listings_df, reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in (calendar_path, listings_path, reviews_path)
)

Explore the Dataset

In [None]:
# Display the first few rows of each DataFrame to understand their structure
print("Calendar DataFrame:")
print(calendar_df.head())

print("\nListings DataFrame:")
print(listings_df.head())

print("\nReviews DataFrame:")
print(reviews_df.head())

# Display summary info for each DataFrame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
print("\nCalendar DataFrame Info:")
calendar_df.info()

print("\nListings DataFrame Info:")
listings_df.info()

print("\nReviews DataFrame Info:")
reviews_df.info()


calendar dataframe cleaning

1. price to numeric after removing the dollar sign
2. date to datetime
3. available to boolean
4. handle missing values

In [None]:
# Inspect the initial values in the 'available' column
print("Initial unique values in 'available':", calendar_df['available'].unique())

# Convert the 't'/'f' flags to booleans. Values that are already boolean map to
# themselves, so re-running this cell does not turn the whole column into NaN.
calendar_df['available'] = calendar_df['available'].map(
    {'t': True, 'f': False, True: True, False: False}
)

# Check the unique values in 'available' after mapping
print("Unique values in 'available' after mapping:", calendar_df['available'].unique())

# Remove dollar signs and thousands separators, then convert price to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid escape sequence.
calendar_df['price'] = calendar_df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Convert date to datetime
calendar_df['date'] = pd.to_datetime(calendar_df['date'])

# Handle missing values by dropping rows where price or available is NaN.
# NOTE(review): if price is only populated for available dates, this removes every
# unavailable row — confirm that is the intended outcome.
calendar_df.dropna(subset=['price', 'available'], inplace=True)

# Display cleaned Calendar DataFrame (info() prints its own report and returns None)
print(calendar_df.head())
calendar_df.info()

# Save the cleaned calendar to the location the EDA section loads it from
cleaned_calendar_path = Path('./data/processed/cleaned_calendar.csv')
cleaned_calendar_path.parent.mkdir(parents=True, exist_ok=True)
calendar_df.to_csv(cleaned_calendar_path, index=False)


Listings dataframe cleaning

Remove Unwanted Columns

In [None]:
# Columns that are not used by the analysis
columns_to_remove = [
    'scrape_id', 'last_scraped', 'space', 'experiences_offered', 'thumbnail_url',
    'medium_url', 'picture_url', 'xl_picture_url', 'host_url', 'host_location', 'host_about',
    'host_thumbnail_url', 'host_picture_url', 'neighbourhood', 'calendar_updated',
    'calendar_last_scraped', 'requires_license', 'license', 'availability_30',
    'availability_60', 'availability_90', 'availability_365'
]

# Remove the specified columns. errors='ignore' skips columns that are already gone,
# so re-running this cell does not raise a KeyError.
listings_df.drop(columns=columns_to_remove, inplace=True, errors='ignore')

# Display the updated Listings DataFrame (info() prints its own report and returns None)
print(listings_df.head())
listings_df.info()



Price-Related Columns to Numeric and Handle Missing Values

This cell converts the price-related columns to numeric values and the date columns to datetime, then fills missing values in specific text columns with empty strings. Missing values in those text columns are replaced with an empty string rather than left as NaN (Not a Number), which can cause issues in data processing and analysis.

In [None]:
# Remove dollar signs and thousands separators, then convert price-related columns to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid escape sequence.
price_columns = ['price', 'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee', 'extra_people']
for col in price_columns:
    listings_df[col] = listings_df[col].replace(r'[\$,]', '', regex=True).astype(float)

# Convert date columns to datetime
date_columns = ['host_since', 'first_review', 'last_review']
for col in date_columns:
    listings_df[col] = pd.to_datetime(listings_df[col])

# Fill missing values with empty strings for text columns
text_columns = ['summary', 'neighborhood_overview', 'notes', 'transit']
listings_df[text_columns] = listings_df[text_columns].fillna('')

# Display the cleaned Listings DataFrame (info() prints its own report and returns None)
print(listings_df.head())
listings_df.info()


Feature Engineering for Pricing Strategy

In [None]:
# Create per-bedroom and per-guest price features.
# A divisor that is not strictly positive is masked to NaN first, so the ratio is
# reported as missing instead of +/-inf when a listing has 0 bedrooms or 0 guests.
bedroom_count = listings_df['bedrooms'].where(listings_df['bedrooms'] > 0)
guest_capacity = listings_df['accommodates'].where(listings_df['accommodates'] > 0)
listings_df['price_per_bedroom'] = listings_df['price'] / bedroom_count
listings_df['price_per_guest'] = listings_df['price'] / guest_capacity

# Display the updated Listings DataFrame with new features
print(listings_df.head())


Final Checks

In [None]:
# Final check for any remaining missing values (only columns that still contain NaN are listed)
missing_values = listings_df.isnull().sum()
print(missing_values[missing_values > 0])

# Display the final cleaned Listings DataFrame (info() prints its own report and returns None)
print(listings_df.head())
listings_df.info()

# Save the cleaned Listings DataFrame to the location the EDA section loads it from,
# using a project-relative path instead of a Colab-only absolute path.
cleaned_listings_path = Path('./data/processed/cleaned_listings.csv')
cleaned_listings_path.parent.mkdir(parents=True, exist_ok=True)
listings_df.to_csv(cleaned_listings_path, index=False)



Reviews dataframe cleaning

date to Datetime format

In [None]:
# Parse the review dates into datetime values
reviews_df = reviews_df.assign(date=pd.to_datetime(reviews_df['date']))

# Show the parsed column to verify the conversion
date_preview = reviews_df[['date']].head()
print(date_preview)


Handle Missing Values


We will fill missing values in the comments column with empty strings. This ensures consistency and prevents issues during text analysis.

In [None]:
# Handle missing values in comments by filling with empty strings.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# selected column is chained assignment, which is deprecated and does not update
# the frame under pandas Copy-on-Write.
reviews_df['comments'] = reviews_df['comments'].fillna('')

# Display the cleaned Reviews DataFrame (info() prints its own report and returns None)
print(reviews_df.head())
reviews_df.info()


Final Checks for Missing Values

In [None]:
# Count NaNs per column and report only the columns that still have gaps
missing_values = reviews_df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])


Save the Cleaned Reviews DataFrame

In [None]:
# Save the cleaned Reviews DataFrame to the location the EDA section loads it from,
# using a project-relative path instead of a Colab-only absolute path.
cleaned_reviews_path = Path('./data/processed/cleaned_reviews.csv')
cleaned_reviews_path.parent.mkdir(parents=True, exist_ok=True)
reviews_df.to_csv(cleaned_reviews_path, index=False)

# Display the final cleaned Reviews DataFrame (info() prints its own report and returns None)
print(reviews_df.head())
reviews_df.info()


EDA

Import libraries and load cleaned dataframes

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

cleaned_calendar_path = './data/processed/cleaned_calendar.csv'
cleaned_listings_path = './data/processed/cleaned_listings.csv'
cleaned_reviews_path = './data/processed/cleaned_reviews.csv'


# CSV stores dates as plain text, so the columns that the cleaning section converted
# to datetime are parsed again on load instead of silently reverting to strings.
calendar_df = pd.read_csv(cleaned_calendar_path, parse_dates=['date'])
listings_df = pd.read_csv(
    cleaned_listings_path,
    parse_dates=['host_since', 'first_review', 'last_review'],
)
reviews_df = pd.read_csv(cleaned_reviews_path, parse_dates=['date'])


Distribution of Prices

In [None]:
# Histogram of listing prices (50 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(listings_df['price'], bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Listing Prices',
    xlabel='Price',
    ylabel='Frequency',
)
plt.show()


Calculate and Visualize Average Prices by Neighborhood Group

In [None]:
# Group by neighbourhood group and calculate the average price
average_prices = listings_df.groupby('neighbourhood_group_cleansed')['price'].mean()

# Print the average prices
print(average_prices)

# Plotting. The title names the grouping that is actually plotted (neighbourhood
# group); it previously said "Zipcode", which this chart does not use.
plt.figure(figsize=(10, 6))
average_prices.plot(kind='bar')
plt.title('Average Airbnb Listing Prices by Neighbourhood Group')
plt.xlabel('neighbourhood_group_cleansed')
plt.ylabel('Average Price')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Clean Up Data for Regression Analysis

In [None]:
# Coerce the model columns to numeric; values that cannot be parsed become NaN
# (errors='coerce' converts them, it does not ignore them).
for numeric_col in ('zipcode', 'bedrooms', 'bathrooms'):
    listings_df[numeric_col] = pd.to_numeric(listings_df[numeric_col], errors='coerce')

# Drop only the rows that are missing a value the following cells need:
# the regression inputs/target, plus 'amenities', which the keyword search filters on.
# A bare dropna() would discard every listing that has a NaN in ANY column
# (including unrelated, sparsely populated ones), leaving a small, biased sample.
required_columns = ['zipcode', 'bedrooms', 'bathrooms', 'price', 'amenities']
listings_df.dropna(subset=required_columns, inplace=True)

# Check the data types of all columns
print(listings_df.dtypes)

Predict House Listing Prices in Different Areas of Seattle

In [None]:
# Write a program to predict what a house listing price should be in different areas of Seattle.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Features: bedrooms, bathrooms and zipcode; target: price
# NOTE(review): zipcode enters the linear model as a plain number — confirm that is intended
X, y = listings_df[['bedrooms', 'bathrooms', 'zipcode']], listings_df['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit ordinary least squares on the training rows
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows with mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Function to predict price based on input features
def predict_price(bedrooms, bathrooms, zipcode):
    """Return the fitted model's price prediction for a single listing.

    The three values are placed in a one-row DataFrame whose columns are
    'bedrooms', 'bathrooms' and 'zipcode' (in that order) and passed to the
    notebook-level ``model``; the first (only) prediction is returned.
    """
    features = pd.DataFrame(
        {'bedrooms': [bedrooms], 'bathrooms': [bathrooms], 'zipcode': [zipcode]}
    )
    return model.predict(features)[0]

# Ask for the three model inputs in order; each answer is parsed as a float
prompts = (
    "Enter the number of bedrooms in the listing: ",
    "Enter the number of bathrooms in the listing: ",
    "Enter the zipcode of the listing: ",
)
bedrooms, bathrooms, zipcode = (float(input(prompt)) for prompt in prompts)

# Run the prediction and report it
predicted_price = predict_price(bedrooms, bathrooms, zipcode)
print("Predicted price:", predicted_price)

Keyword Search for Amenities at Listings

In [None]:
# Function to perform keyword search on amenities
def search_listings(keyword, listings=None):
    """Print details of every listing whose amenities mention ``keyword``.

    Parameters
    ----------
    keyword : str
        Text to look for in the 'amenities' column. It is matched
        case-insensitively as a literal substring, so characters such as
        '(' or '+' are not interpreted as regular-expression syntax.
    listings : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``listings_df`` is used.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if listings is None:
        listings = listings_df

    # regex=False: take the user's text literally (unbalanced brackets would otherwise raise).
    # na=False: rows with no amenities value simply do not match instead of
    # producing NaN in the mask, which cannot be used for boolean indexing.
    mask = listings['amenities'].str.contains(keyword, case=False, regex=False, na=False)
    filtered_data = listings[mask]

    if filtered_data.empty:
        print("No listings found containing the keyword '{}'.".format(keyword))
        return

    # (label, column) pairs, printed in this order for every match
    detail_fields = (
        ("Name:", 'name'),
        ("Host Response Time:", 'host_response_time'),
        ("Bedrooms:", 'bedrooms'),
        ("Bathrooms:", 'bathrooms'),
        ("Beds:", 'beds'),
        ("Price:", 'price'),
        ("Neighbourhood Cleansed:", 'neighbourhood_cleansed'),
        ("Minimum Nights:", 'minimum_nights'),
        ("Guests Included:", 'guests_included'),
        ("Number of Reviews:", 'number_of_reviews'),
        ("Review Scores Rating:", 'review_scores_rating'),
        ("Listing Link:", 'listing_url'),
    )

    print("Listings containing the keyword '{}':".format(keyword))
    for _, row in filtered_data.iterrows():
        print("\nListing ID:", row['id'])
        for label, column in detail_fields:
            print(label, row[column])
        print("-" * 50)

# Main function
def main():
    """Ask the user for an amenity keyword and print the matching listings."""
    search_listings(input("Enter a keyword to search for listings: "))


# Run the search when this code is executed directly
if __name__ == "__main__":
    main()

Correlation Analysis between Host Response Rate and Review Type Categories

In [None]:
# Correlation matrix between host response rate and the review-related columns.
# .copy() makes selected_data an independent frame, so the column assignment below
# modifies it directly rather than a slice of listings_df (no SettingWithCopyWarning).
selected_data = listings_df[
    ['host_response_rate', 'number_of_reviews', 'host_total_listings_count',
     'review_scores_rating', 'reviews_per_month']
].copy()

# Convert 'host_response_rate' to numeric (remove '%' sign)
selected_data['host_response_rate'] = selected_data['host_response_rate'].str.rstrip('%').astype('float')

# Drop rows with missing values (rebinding instead of mutating in place)
selected_data = selected_data.dropna()

# Calculate correlation matrix
correlation_matrix = selected_data.corr()

# Print correlation matrix
print(correlation_matrix)

# Plot correlation matrix
plt.figure(figsize=(7, 5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

Import Necessary Libraries for Modeling and Visualization

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.datasets import load_iris
from sklearn import tree
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint


# Coerce zipcode, bedrooms and bathrooms to numeric dtypes;
# values that cannot be parsed become NaN (errors='coerce')
for numeric_col in ('zipcode', 'bedrooms', 'bathrooms'):
    listings_df[numeric_col] = pd.to_numeric(listings_df[numeric_col], errors='coerce')



Top 10 Airbnb Listings by Review Rating

In [None]:

# Ten listings with the highest review score, shown with a readable subset of columns
top_rated_columns = [
    'id', 'name', 'summary', 'neighbourhood_cleansed', 'room_type', 'zipcode',
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'review_scores_rating',
]
sorted_listings_df = (
    listings_df
    .sort_values(by=['review_scores_rating'], ascending=False)
    .head(10)
)
sorted_listings_df[top_rated_columns]

Data Cleaning for Analyzing Impact of Reviews on Price

In [None]:
# How do the number of reviews and review scores impact the price of a listing?

# Keep only the columns used by the review/price models and replace every NaN with 0
review_columns = [
    'id', 'name', 'summary', 'neighbourhood_cleansed', 'zipcode', 'property_type',
    'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
    'review_scores_rating', 'number_of_reviews',
]
review = listings_df[review_columns].fillna(0)

review

Split Dataset into Features and Target

In [None]:
# Features: the two review signals; target: listing price
feature_cols = ['review_scores_rating', 'number_of_reviews']
x, y = review[feature_cols], review['price']

# Hold out 30% of the rows for testing (70% train); the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

Train Random Forest Regressor and Evaluate Error (MSE, RMSE, R²)

In [None]:
# Fit a random forest on the training split (fixed seed for reproducible trees)
rf = RandomForestRegressor(
    n_estimators=300,
    max_features='sqrt',
    max_depth=7,
    random_state=18,
).fit(x_train, y_train)

# Score the held-out rows
prediction = rf.predict(x_test)
mse = mean_squared_error(y_test, prediction)
rmse = mse ** 0.5  # square root of the MSE
r2 = r2_score(y_test, prediction, force_finite=False)

# Printed one per line, in order: MSE, RMSE, R^2
print(mse, rmse, r2, sep='\n')

Predict listing prices from listing attributes (zipcode, accommodates, bathrooms, bedrooms, beds)

In [None]:
# Features: location and size attributes of the listing; target: listing price
feature_cols = ['zipcode', 'accommodates', 'bathrooms', 'bedrooms', 'beds']
x, y = review[feature_cols], review['price']

# Hold out 30% of the rows for testing (70% train); the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

 Train and Evaluate Random Forest Regressor

In [None]:
# Fit a random forest on the training split (fixed seed for reproducible trees)
rf = RandomForestRegressor(
    n_estimators=500,
    max_features='sqrt',
    max_depth=10,
    random_state=20,
).fit(x_train, y_train)

# Score the held-out rows
prediction = rf.predict(x_test)
mse = mean_squared_error(y_test, prediction)
rmse = mse ** 0.5  # square root of the MSE
r2 = r2_score(y_test, prediction, force_finite=False)

# Printed one per line, in order: MSE, RMSE, R^2
print(mse, rmse, r2, sep='\n')