In [15]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

#Load the entire csv file into a dataframe
df_loadcsv = pd.read_csv("Seattle Listings Airbnb.csv")

#for easier previews
pd.options.display.max_rows=100

#listing all columns we have for analysis
df_loadcsv.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since',
       'host_location', 'host_about', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'street', 'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', '

In [16]:
#Checking for classifiers
df_loadcsv['room_type'].unique()

array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)

In [17]:
#Checking for classifiers for Q1
df_loadcsv['host_is_superhost'].unique()

array(['f', 't', nan], dtype=object)

In [18]:
#Checking for classifiers for Q1
df_loadcsv['neighbourhood_cleansed'].unique()

array(['West Queen Anne', 'Adams', 'West Woodland', 'East Queen Anne',
       'Wallingford', 'North Queen Anne', 'Green Lake', 'Westlake',
       'Mann', 'Madrona', 'University District', 'Harrison/Denny-Blaine',
       'Minor', 'Leschi', 'Atlantic', 'Pike-Market', 'Eastlake',
       'South Lake Union', 'Lawton Park', 'Briarcliff', 'Belltown',
       'International District', 'Central Business District',
       'First Hill', 'Yesler Terrace', 'Pioneer Square', 'Gatewood',
       'Arbor Heights', 'Alki', 'North Admiral', 'Crown Hill',
       'Fairmount Park', 'Genesee', 'Interbay', 'Industrial District',
       'Mid-Beacon Hill', 'South Beacon Hill', 'Greenwood', 'Holly Park',
       'Fauntleroy', 'North Beacon Hill', 'Mount Baker', 'Brighton',
       'South Delridge', 'View Ridge', 'Dunlap', 'Rainier Beach',
       'Columbia City', 'Seward Park', 'North Delridge', 'Maple Leaf',
       'Ravenna', 'Riverview', 'Portage Bay', 'Bryant', 'Montlake',
       'Broadway', 'Loyal Heights', 'Vict

In [19]:
#checking for null values
df_loadcsv.isnull().sum()

id                                     0
listing_url                            0
scrape_id                              0
last_scraped                           0
name                                   0
summary                              177
space                                569
description                            0
experiences_offered                    0
neighborhood_overview               1032
notes                               1606
transit                              934
thumbnail_url                        320
medium_url                           320
picture_url                            0
xl_picture_url                       320
host_id                                0
host_url                               0
host_name                              2
host_since                             2
host_location                          8
host_about                           859
host_response_time                   523
host_response_rate                   523
host_acceptance_

In [22]:
#pulling aside interestiong columns, removing nulls for SuperHost as there are only two in the dataset
df_target = df_loadcsv.filter(['host_is_superhost', 'room_type','neighbourhood_cleansed','price']).dropna()
df_target.dtypes


host_is_superhost         object
room_type                 object
neighbourhood_cleansed    object
price                     object
dtype: object

In [23]:
# Cleaning data, as the price object is un USD, string 
df_target['price'] = df_target['price'].str.replace('$','')
df_target['price'] = df_target['price'].str.replace(',','')
df_target['price'] = df_target['price'].apply(pd.to_numeric)
df_target.dtypes

host_is_superhost          object
room_type                  object
neighbourhood_cleansed     object
price                     float64
dtype: object

In [24]:
# # Q1 : Is being a superhost Quantifyably Valuable ?

#Q1 selecting dataframes with Superhost and not superhost status
df_question1_s = df_target[df_target['host_is_superhost']=='t'].groupby(['room_type']).mean().reset_index()
df_question1_ns = df_target[df_target['host_is_superhost']=='f'].groupby(['room_type']).mean().reset_index()
df_question1_s  # checking data 

Unnamed: 0,room_type,price
0,Entire home/apt,157.001927
1,Private room,77.108871
2,Shared room,58.363636


In [25]:
# Q1 Plotting the graph
x1 = df_question1_s['room_type']
y1 = df_question1_s['price']
y2 = df_question1_ns['price']


fig = go.Figure()

fig.add_trace(go.Scatter(
    x= x1,
    y=y1,
    name="Superhost"
))

fig.add_trace(go.Scatter(
    x= x1,
    y=y2,
    name="Not Superhost"
))


fig.update_layout(
    title="Is being a superhost Valuable?",
    xaxis_title="Apartment Types",
    yaxis_title="Average Price",
    legend_title="Status",
    )


fig.show()

In [26]:
#Q2 : What about the neighbourhoods? Does this relation hold true for all ? Lets check with One : West Queen Anne Neighbourhood

df_question2_s = df_target[(df_target['host_is_superhost']=='t')&(df_target['neighbourhood_cleansed'] =='West Queen Anne')].groupby(['room_type']).mean().reset_index()
df_question2_ns = df_target[(df_target['host_is_superhost']=='f')&(df_target['neighbourhood_cleansed'] =='West Queen Anne')].groupby(['room_type']).mean().reset_index()
df_question2_s  # checking data 
#almost immediately we see that there are less options

Unnamed: 0,room_type,price
0,Entire home/apt,136.666667
1,Private room,68.8


In [27]:
# Plot for Q2 
# Todo : This is really easily passable as a function
x1 = df_question2_s['room_type']
y1 = df_question2_s['price']
y2 = df_question2_ns['price']


fig = go.Figure()

fig.add_trace(go.Scatter(
    x= x1,
    y=y1,
    name="Superhost"
))

fig.add_trace(go.Scatter(
    x= x1,
    y=y2,
    name="Not Superhost"
))


fig.update_layout(
    title="Is being a superhost Valuable in West Queen Anne District?",
    xaxis_title="Apartment Types",
    yaxis_title="Average Price",
    legend_title="Status",
    )


fig.show()

In [None]:
#Q3 : What about reviews ? Do ratings enable someone to charge a higher price ? 

In [28]:
df_target2 = df_loadcsv.filter(['room_type','neighbourhood_cleansed','price', 'review_scores_rating'])
df_target2.head()



Unnamed: 0,room_type,neighbourhood_cleansed,price,review_scores_rating
0,Entire home/apt,West Queen Anne,$85.00,95.0
1,Entire home/apt,West Queen Anne,$150.00,96.0
2,Entire home/apt,West Queen Anne,$975.00,97.0
3,Entire home/apt,West Queen Anne,$100.00,
4,Entire home/apt,West Queen Anne,$450.00,92.0


In [29]:
#count the nulls
df_target2.isnull().sum()

room_type                   0
neighbourhood_cleansed      0
price                       0
review_scores_rating      647
dtype: int64

In [30]:
#drop the nulls as imputing them would not be logical, and we have enough entries
df_target2.dropna()

Unnamed: 0,room_type,neighbourhood_cleansed,price,review_scores_rating
0,Entire home/apt,West Queen Anne,$85.00,95.0
1,Entire home/apt,West Queen Anne,$150.00,96.0
2,Entire home/apt,West Queen Anne,$975.00,97.0
4,Entire home/apt,West Queen Anne,$450.00,92.0
5,Private room,West Queen Anne,$120.00,95.0
...,...,...,...,...
3810,Entire home/apt,Fremont,$154.00,92.0
3811,Entire home/apt,Fremont,$65.00,100.0
3812,Entire home/apt,Fremont,$95.00,96.0
3813,Entire home/apt,Fremont,$359.00,80.0


In [31]:
#we clean price again. ToDo: do for main data while refactoring

df_target2['price'] = df_target2['price'].str.replace('$','')
df_target2['price'] = df_target2['price'].str.replace(',','')
df_target2['price'] = df_target2['price'].apply(pd.to_numeric)
df_target2.dtypes

room_type                  object
neighbourhood_cleansed     object
price                     float64
review_scores_rating      float64
dtype: object

In [35]:
df_question3_m = df_target2[df_target2['room_type']=='Entire home/apt']
df_question3_p = df_target2[df_target2['room_type']=='Private room']
df_question3_s = df_target2[df_target2['room_type']=='Shared room']
df_question3_m 

Unnamed: 0,room_type,neighbourhood_cleansed,price,review_scores_rating
0,Entire home/apt,West Queen Anne,85.0,95.0
1,Entire home/apt,West Queen Anne,150.0,96.0
2,Entire home/apt,West Queen Anne,975.0,97.0
3,Entire home/apt,West Queen Anne,100.0,
4,Entire home/apt,West Queen Anne,450.0,92.0
...,...,...,...,...
3813,Entire home/apt,Fremont,359.0,80.0
3814,Entire home/apt,Portage Bay,79.0,100.0
3815,Entire home/apt,Rainier Beach,93.0,
3816,Entire home/apt,Madison Park,99.0,


In [47]:
x1 = df_question3_m['review_scores_rating']
y1 = df_question3_m['price']
y2 = df_question3_p['price']
y3 = df_question3_s['price']

fig = go.Figure()

fig.add_trace(go.Scatter(
    x= x1,
    y=y1,
    name="Apartments",
     mode="markers",
))

fig.add_trace(go.Scatter(
    x= x1,
    y=y2,
    name="Private rooms",
    mode="markers",
))

fig.add_trace(go.Scatter(
    x= x1,
    y=y2,
    name="Shared rooms",
    mode="markers",
))


fig.update_layout(
    title="Is there any correlation between High Ratings and the demanded price?",
    xaxis_title="Ratings",
    yaxis_title="Price",
    legend_title="Status",
    )


fig.show()

In [43]:
# clearly, some outliers are skewing the result. Dropping some ratings are not advisable, but there seem to be only a few

# using the tukey method

def outlier_trim(df_large):
    rat_lw = df_large['review_scores_rating'].quantile(0.25)
    rat_hi = df_large['review_scores_rating'].quantile(0.75)
    
    IQR = rat_hi - rat_lw
    rat_max_value = rat_hi + 1.5 * IQR
    rat_min_value = rat_lw - 1.5 * IQR

    #filter dataset 

    df_large = df_large[(df_large['review_scores_rating'] > rat_max_value) | (df_large['review_scores_rating']< rat_min_value )]
    return df_large



In [45]:
#trim the ratings outliers

df_question3_mr = outlier_trim(df_question3_m)
df_question3_pr = outlier_trim(df_question3_p)
df_question3_sr = outlier_trim(df_question3_s)



In [48]:
# repeat plot

x1 = df_question3_mr['review_scores_rating']
y1 = df_question3_mr['price']
y2 = df_question3_pr['price']
y3 = df_question3_sr['price']

fig = go.Figure()

fig.add_trace(go.Scatter(
    x= x1,
    y=y1,
    name="Apartments",
     mode="markers",
))

fig.add_trace(go.Scatter(
    x= x1,
    y=y2,
    name="Private rooms",
    mode="markers",
))

fig.add_trace(go.Scatter(
    x= x1,
    y=y2,
    name="Shared rooms",
    mode="markers",
))


fig.update_layout(
    title="Is there any correlation between High Ratings and the demanded price?",
    xaxis_title="Ratings",
    yaxis_title="Price",
    legend_title="Status",
    )


fig.show()