In [5]:
# This file explores any correlations between the three factors we are exploring: price, rating, and host.  
# While we are comparing price, rating, and host to other data points, we wanted to also discover if those three factors have any correlation with each other.
# We found that price, rating, and whether or not a host is a super host did not have any correlation.

In [6]:
import pandas as pd 
import matplotlib.pyplot as plt  
import scipy.stats as st 
import numpy as np 
import datetime as dt 

In [7]:
# increase max number of columns and rows visible in order to see more data in VS code.
# Read in master data file
pd.set_option('display.max_columns', 45)
pd.set_option('display.max_rows', 45)
price_df = pd.read_csv('../BM_files/all_listings_df')
price_df.head()

Unnamed: 0,id,last_scraped,description,neighborhood_overview,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,city
0,360,2023-09-24,Enjoy the famous Colorado weather and unplug i...,The cottage is located in the center of Lower ...,666,2008-07-08,"Denver, CO",within an hour,100%,97%,t,3,4,t,t,Highland,39.766415,-105.002098,Entire guesthouse,Entire home/apt,3,,1 bath,2.0,2.0,90.0,t,4,27,57,147,179,7,0,4.99,4.99,4.96,5.0,5.0,5.0,4.91,f,2,2.87,Denver
1,364,2023-09-24,"Modern 1,000 square foot loft in the heart of ...","Ten brewpubs within walking distance, two grea...",783,2008-07-11,"Denver, CO",,,,f,1,1,t,t,Five Points,39.76672,-104.97906,Entire loft,Entire home/apt,3,,1.5 baths,1.0,1.0,179.0,t,23,53,83,358,87,0,0,4.85,4.78,4.81,4.95,4.96,4.65,4.71,f,1,0.5,Denver
2,590,2023-09-24,"Large guest room in my home, where I also live...",I love the diversity of my neighborhood and it...,933,2008-07-21,"Denver, CO",within an hour,100%,95%,t,2,2,t,t,North Park Hill,39.75511,-104.91109,Private room in home,Private room,3,,1 shared bath,,1.0,64.0,t,5,27,53,233,712,43,2,4.85,4.78,4.58,4.93,4.95,4.76,4.85,f,2,4.04,Denver
3,592,2023-09-24,This room is in the basement. It does not hav...,,933,2008-07-21,"Denver, CO",within an hour,100%,95%,t,2,2,t,t,North Park Hill,39.75481,-104.91106,Private room in home,Private room,2,,1 shared bath,,1.0,57.0,t,0,0,0,158,168,1,0,4.87,4.75,4.55,4.94,4.94,4.81,4.86,f,2,0.95,Denver
4,686,2023-09-24,Thanks for visiting my Queen Bed Room site for...,"I love my Uptown neighborhood, which is within...",990,2008-07-23,"Denver, CO",within a few hours,100%,100%,t,2,4,t,t,North Capitol Hill,39.74695,-104.97838,Private room in home,Private room,2,,1 private bath,,2.0,33.0,t,0,0,13,288,256,0,0,4.76,4.75,4.8,4.86,4.91,4.87,4.81,f,2,1.39,Denver


In [8]:
# row count
num_rows = len(price_df)
print(f"Number of rows in price_df is: {num_rows}")

Number of rows in price_df is: 22851


In [9]:
# completeness check, count for cities should be: 
# Boston 4033
# Denver 5388
# Nashville 8584
# Portland 4846

grouped_df = price_df.groupby(['city'])

grouped_df['city'].value_counts()

city
Boston       4033
Denver       5388
Nashville    8584
Portland     4846
Name: count, dtype: int64

In [10]:
# check for null values in 'host_is_superhost'
# need to filter these out before using superhost as comparison
null_counts_host = price_df['host_is_superhost'].isna().sum()
print(null_counts_host)

558


In [11]:
# check for null values in 'review_scores_rating'
# need to filter these out before using review score rating as a comparison, null because no reviews yet
null_counts_reviews = price_df['review_scores_rating'].isna().sum()
print(null_counts_reviews)

3530


In [12]:
# check for null values in 'price'
# need to filter these out before using price as a comparison. Would not expect there to be any null values, but worth checking.
null_counts_price = price_df['price'].isna().sum()
print(null_counts_price)

0


In [13]:
# Look at all 3 hypotheses indicators together: price, rating, and if host is super host.
# Set index to listing id

all_3 = pd.DataFrame(price_df[["id", "price", "number_of_reviews", "review_scores_rating", "host_is_superhost", "city"]])

all_3 = all_3.set_index("id")
all_3

Unnamed: 0_level_0,price,number_of_reviews,review_scores_rating,host_is_superhost,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
360,90.0,179,4.99,t,Denver
364,179.0,87,4.85,f,Denver
590,64.0,712,4.85,t,Denver
592,57.0,168,4.87,t,Denver
686,33.0,256,4.76,t,Denver
...,...,...,...,...,...
979474665987354279,258.0,0,,f,Portland
980422903985425172,99.0,0,,f,Portland
980439098128842446,165.0,0,,f,Portland
980549321160587970,195.0,0,,f,Portland


In [14]:
# Above we found that there are some nulls in review columns due to 0 reviews, and some nulls in host_is_superhost column
# Filter out records with 0 reviews and records with null value in host_is_superhost column

filtered_all_3 = all_3.loc[(all_3["number_of_reviews"] > 0) & (all_3["host_is_superhost"].notnull())]
filtered_all_3.head()

Unnamed: 0_level_0,price,number_of_reviews,review_scores_rating,host_is_superhost,city
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
360,90.0,179,4.99,t,Denver
364,179.0,87,4.85,f,Denver
590,64.0,712,4.85,t,Denver
592,57.0,168,4.87,t,Denver
686,33.0,256,4.76,t,Denver


In [15]:
# replace t / f in superhost as true = 1, false = 0, then convert to integer data types
# this is necessary to have numerical data to be able to use the .corr() function

filtered_all_3['host_is_superhost'] = filtered_all_3['host_is_superhost'].replace({'t': '1', 'f': '0'})
filtered_all_3['host_is_superhost'] = filtered_all_3['host_is_superhost'].astype(int)
filtered_all_3.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_all_3['host_is_superhost'] = filtered_all_3['host_is_superhost'].replace({'t': '1', 'f': '0'})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_all_3['host_is_superhost'] = filtered_all_3['host_is_superhost'].astype(int)


price                   float64
number_of_reviews         int64
review_scores_rating    float64
host_is_superhost         int32
city                     object
dtype: object

In [16]:
# check to make sure that filtered dataframe above removed all null values
null_counts = filtered_all_3.isna().sum()
print(null_counts)

price                   0
number_of_reviews       0
review_scores_rating    0
host_is_superhost       0
city                    0
dtype: int64


In [17]:
# check to see what correlation (if any) between price, number of reviews, review score rating, and superhost
# appears that there is not much correlation, strongest being .27
filtered_all_3_no_city = filtered_all_3.drop(columns='city')
all_3_corr = filtered_all_3_no_city.corr()
all_3_corr.unstack().sort_values()

price                 number_of_reviews      -0.026635
number_of_reviews     price                  -0.026635
price                 review_scores_rating    0.010384
review_scores_rating  price                   0.010384
price                 host_is_superhost       0.017632
host_is_superhost     price                   0.017632
number_of_reviews     review_scores_rating    0.101734
review_scores_rating  number_of_reviews       0.101734
number_of_reviews     host_is_superhost       0.251396
host_is_superhost     number_of_reviews       0.251396
review_scores_rating  host_is_superhost       0.270180
host_is_superhost     review_scores_rating    0.270180
price                 price                   1.000000
number_of_reviews     number_of_reviews       1.000000
review_scores_rating  review_scores_rating    1.000000
host_is_superhost     host_is_superhost       1.000000
dtype: float64

In [18]:
######################################################################################################

In [None]:
# based on results of the .corr() function, the strongest correlation is .27 between the review score rating and if the host is a super host.  
# This is not a strong correlation, and we declined to investigate it further in this file.