In [1]:
# Set up
import numpy as np
import pandas as pd
import seaborn as sns # for visualization
import altair as alt
alt.renderers.enable('notebook') # enable altair rendering
from scipy.stats import ttest_ind # t-tests
import statsmodels.formula.api as smf # linear modeling
import statsmodels.api as sm
import matplotlib.pyplot as plt # plotting
import matplotlib
from sklearn import metrics
matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
# Load the listings (house details) table and the calendar (bookings) table.
# The reads are kept as two separate statements so that listings_df is still
# bound if the second file is missing.
listings_df = pd.read_csv(filepath_or_buffer='./data/detailed_listings.csv')
bookings_df = pd.read_csv(filepath_or_buffer='./data/calendar.csv')

  interactivity=interactivity, compiler=compiler, result=result)


FileNotFoundError: File b'./data/calendar.csv' does not exist

In [None]:
# List the columns we have to work with in the listings table (house details)
listings_df.columns

In [None]:
# Pull out the listing id together with the three price-related columns
price_columns = ['id', 'price', 'weekly_price', 'monthly_price']
pricing_df = listings_df.loc[:, price_columns]
pricing_df

## Observations:

Some houses don't have a weekly or monthly rate. Further research shows that `price` is the _price per night_. Multiplying it by how often the house is booked gives an estimate of the monthly income the owners make from the home.

In [None]:
# List the columns available in the bookings (calendar) table
bookings_df.columns

In [None]:
# Display the bookings table to inspect its contents
bookings_df

## Observations: 

A true value in the `available` column indicates that the house can be rented out. We want to know monthly income, so we may want to look only at rows where `available` is _false_ (stored as `'f'` in the data).

In [None]:
# Keep only the calendar rows whose 'available' flag is 'f' (i.e. not available)
is_unavailable = bookings_df['available'].eq('f')
booked_houses = bookings_df.loc[is_unavailable]

In [None]:
# Display the rows that were filtered down to available == 'f'
booked_houses

## Observations:

Many of the houses that are booked don't have a listed price. This may indicate that we need to combine the dataframes together in order to get a price for the listings.

This, combined with the fact that not all houses have weekly and monthly prices, may lead us to make estimates based solely off of the `price` column.

## Data preparation

To explore the data further, our columns of interest need to be created. This includes: `estimated_income_per_month`, `estimated_occupancy`, and `bookings_per_month`.


Here is a picture of the equations the website uses to calculate this information (we should also justify in our paper why we chose these equations and why they represent what we're measuring):

![equations](./images/equations.png)

Here are constants (the variables in all caps) within the equations that were found on the page:

![constants](./images/constants.png)

## Equations for calculation:

- `bookings per month`: number of reviews per month divided by the review rate, rounded to one decimal place.
- `estimated occupancy`: `min(bookings_per_month * max(minimum_nights, AVERAGE_NIGHTS) / 30, MAXIMUM_OCCUPANCY)`, rounded to 3 decimal places. In words: bookings per month times the larger of the minimum nights and the average nights, divided by 30, and capped at the maximum occupancy.
- `income per month`: estimated occupancy times 30 times the price per night, rounded to the nearest whole number.

In [None]:
# Declare the constants used by the bookings / occupancy / income equations

REVIEW_RATE = 0.3        # divisor that converts reviews per month into bookings per month
AVERAGE_NIGHTS = 5       # average nights per stay, used when a listing's minimum is lower
MAXIMUM_OCCUPANCY = 0.7  # upper bound applied to the estimated occupancy

## Calculate the bookings per month
# Select the column as a Series (single brackets) so the division below yields
# a Series that can be assigned directly to a single column.
reviews_per_month = listings_df['reviews_per_month']

## Create a copy of the original dataframe to modify
listings_with_income = listings_df.copy()

# Bookings per month: reviews per month divided by the review rate, rounded to
# one decimal place as stated in the "Equations for calculation" section.
# Listings with no reviews_per_month value stay NaN.
listings_with_income['bookings_per_month'] = (reviews_per_month / REVIEW_RATE).round(1)

In [None]:
# Estimated occupancy:
#   min(bookings_per_month * max(minimum_nights, AVERAGE_NIGHTS) / 30, MAXIMUM_OCCUPANCY)
# rounded to 3 decimal places.
# Everything is computed with vectorized Series operations instead of row
# loops; this also avoids calling float() on the one-element Series that
# iterrows() yields, which newer pandas versions deprecate.

# Get minimum number of nights as a float Series
minimum_nights = listings_df['minimum_nights'].astype(float)

# For each row, choose the max between the minimum number of nights and the average number of nights
max_nights_stayed = minimum_nights.clip(lower=AVERAGE_NIGHTS)

# Fraction of a 30-day month that a single stay occupies
average_stays_per_day = max_nights_stayed / 30

# Fraction of the month that is booked: bookings per month times the per-stay fraction
average_stays_per_month = listings_with_income['bookings_per_month'] * average_stays_per_day

# Final value for estimated occupancy: cap at the maximum occupancy and round
# to 3 decimal places. NaN inputs (e.g. no reviews_per_month) stay NaN.
estimated_occupancy = average_stays_per_month.clip(upper=MAXIMUM_OCCUPANCY).round(3)

# Add the values to the dataframe
listings_with_income['estimated_occupancy'] = estimated_occupancy

In [None]:
# Estimated income per month

# Normalise the price column to float by stripping the "$" sign and the
# thousands separators. Casting to str first keeps this cell re-runnable (on a
# second run the column is already numeric) and lets missing prices pass
# through as NaN instead of raising on a float that has no .replace method.
prices = listings_with_income['price']
formatted_prices = (
    prices.astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

listings_with_income['price'] = formatted_prices

# Income per month: estimated occupancy * 30 * price per night,
# rounded to the nearest whole number.
listings_with_income['income_per_month'] = (
    listings_with_income['estimated_occupancy'] * 30 * listings_with_income['price']
).round()

In [None]:
# test = listings_with_income[listings_with_income['income_per_month'] > 2000]
# test['income_per_month']