In [336]:
import pandas as pd 
import datetime

cols = ['id', 'host_id', 'host_since', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 
        'neighbourhood_cleansed', 'latitude', 'longitude', 'host_listings_count',
        'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 
        'beds', 'bed_type', 'amenities', 'square_feet', 'price', 'weekly_price', 
        'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 
        'extra_people', 'minimum_nights', 'maximum_nights', 'number_of_reviews',
        'first_review', 'last_review', 'review_scores_rating',
        'review_scores_accuracy', 'review_scores_cleanliness',
        'review_scores_checkin', 'review_scores_communication',
        'review_scores_location', 'review_scores_value', 'reviews_per_month']

df = pd.read_csv('paris_airbnb.csv', usecols=cols)

def excel_dt_to_dt(x):
    """Method to be applied to df column of excel dates in day format. 
    Args:
        x - value to be processed

    Returns:
        processed value of x
    """
    try:
        x = datetime.date(1899,12,30) + datetime.timedelta(days=int(x))
    except ValueError:
        x = datetime.date(1899,12,30)
    return x

df['host_since'] = df['host_since'].apply(excel_dt_to_dt)
df['first_review'] = df['first_review'].apply(excel_dt_to_dt)
df['last_review'] = df['last_review'].apply(excel_dt_to_dt)


In [None]:
def mapping_data(df: "Dataframe", sh: int = 0, min_cd: tuple = (-200, -200), max_cd: tuple = (200, 200)) -> list:
    """Function to create lists of coordinates from a dataframe of Airbnb listing data.
    Args:
        df - pandas Dataframe containing latitude and longitude
        sh (Optional) - value of 1 will turn on superhost filter
        min_cd (Optional) - minimum coordiante vals to accept in form (x, y)
        max_cd (Optional) - maximum coordiante vals to accept in form (x, y)

    Returns:
        x - list of longitudes
        y - list of latitudes
    """

    x , y = [], []
    for i in range(len(df)):
        if sh == 1:
            if df['host_is_superhost'][i] == 't':
                if max_cd[0] > df['longitude'][i] > min_cd[0] and max_cd[1] > df['latitude'][i] > min_cd[1]:
                    x.append(df['longitude'][i])
                    y.append(df['latitude'][i])                
        else:
            if max_cd[0] > df['longitude'][i] > min_cd[0] and max_cd[1] > df['latitude'][i] > min_cd[1]:
                x.append(df['longitude'][i])
                y.append(df['latitude'][i]) 
    return x, y

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

fig, ax = plt.subplots(figsize=(20, 20))

x, y = mapping_data(df)

ax.scatter(x, y, edgecolor='red', linewidths=0.5, zorder=2, s=0.1)
ax.imshow(mpimg.imread('paris_map.png'), extent=(2.2164, 2.4664, 48.799, 48.917), zorder=1)

plt.show()


In [None]:
fig, ax = plt.subplots(figsize=(20, 20))

x1, y1 = mapping_data(df)
x2, y2 = mapping_data(df, 1)

ax.scatter(x1, y1, edgecolor='red', linewidths=0.5, zorder=2, s=0.1)
ax.scatter(x2, y2, edgecolor='yellow', linewidths=0.5, zorder=2, s=0.3)
ax.imshow(mpimg.imread('paris_map.png'), extent=(2.2164, 2.4664, 48.799, 48.917), zorder=1)

plt.show()


In [None]:
# We can see that the number of superhosts is most concentrated within the coordinates 48D53'N, 2D20'E and 48D51'N, 2D23'E; 
# and the lone grid square 48D54'N, 2D20'E and 48D53'N, 2D21'E
# These need to be converted to decimal before we can plot
import re

def min_to_dec(cd: list) -> list:
    """Function to convert minute coords of the form xxDyy' into decimal coords.
    Args:
        cd - coords to be converted
        
    Returns:
        cd - converted coords in decimal form
    """
    dec_cds = []
    for coord in cd:
        split = re.findall(r'(\d*)D(\d\d)\'.*', coord)[0]
        dec_min = int(split[1])/60
        dec_cd = int(split[0]) + dec_min
        dec_cds.append(dec_cd)

    return dec_cds

# The first four coords relate to the top left corner and bottom right corner of 6 densely packed grid squares
# The last four coords relate to the top left and bottom right of a single densely packed square above this
coords = ['48D53\'N', '2D20\'E', '48D51\'N', '2D23\'E', '48D54\'N', '2D21\'E']
decimal_coords = min_to_dec(coords)

# Our first block of data will be between 48D53'N, 48D51'N and 2D20'E, 2D23'E
min_1 = (decimal_coords[1], decimal_coords[2])
max_1 = (decimal_coords[3], decimal_coords[0])
# Our second block will be between 48D54'N, 48D53'N and 2D20'E and 2D21'E
min_2 = (decimal_coords[1], decimal_coords[0])
max_2 = (decimal_coords[5], decimal_coords[4])


In [None]:
fig, ax = plt.subplots(figsize=(20, 20))

x1, y1 = mapping_data(df, min_cd=min_1, max_cd=max_1)
x2, y2 = mapping_data(df, 1, min_1, max_1)
x3, y3 = mapping_data(df, min_cd=min_2, max_cd=max_2)
x4, y4 = mapping_data(df, 1, min_2, max_2)

ax.scatter(x1, y1, edgecolor='red', linewidths=0.5, zorder=2, s=0.1)
ax.scatter(x2, y2, edgecolor='yellow', linewidths=0.5, zorder=2, s=0.3)
ax.scatter(x3, y3, edgecolor='red', linewidths=0.5, zorder=2, s=0.1)
ax.scatter(x4, y4, edgecolor='yellow', linewidths=0.5, zorder=2, s=0.3)
ax.imshow(mpimg.imread('paris_map.png'), extent=(2.2164, 2.4664, 48.799, 48.917), zorder=1)

plt.show()

In [None]:
# Find number of listings and superhosts in dense region
num_list_dense = len(x3)
num_list_dense_sh = len(x4)
ratio_dense = num_list_dense_sh/num_list_dense

# Find number of listings and superhosts across all regions
num_list_tot = len(x1)
num_list_tot_sh = len(x2)
ratio_tot = num_list_tot_sh/num_list_tot

print(ratio_dense, ratio_tot)
# Superhosts account for a smaller number of total listings in the more densely packed region than across the entirety of Paris -> does this indicate oversaturation?

# Find number of listings and superhosts per neighbourhood
import pprint
def get_neighbourhood_vals(df: "Dataframe") -> dict:
    """Get the number of listings and superhosts per region.
    Args:
        df - pandas dataframe of AirBnB listings

    Returns:
        nbhd_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews, 
            std_nobook, superhost_nobook]}
    """
    nbhd_vals = {}
    for i in range(len(df)):
        nbhd_vals.setdefault(df['neighbourhood_cleansed'][i], [0, 0, 0, 0, 0, 0])
        if df['host_is_superhost'][i] == 't':
            nbhd_vals[df['neighbourhood_cleansed'][i]][2] += 1
            if df['number_of_reviews'][i] == 0:
                nbhd_vals[df['neighbourhood_cleansed'][i]][5] += 1
            else:
                nbhd_vals[df['neighbourhood_cleansed'][i]][3] += df['number_of_reviews'][i]
        else:
            nbhd_vals[df['neighbourhood_cleansed'][i]][0] += 1
            if df['number_of_reviews'][i] == 0:
                nbhd_vals[df['neighbourhood_cleansed'][i]][4] += 1
            else:
                nbhd_vals[df['neighbourhood_cleansed'][i]][1] += df['number_of_reviews'][i]

    return nbhd_vals

# Then find average number of listings for all of the above
nbhd_vals = get_neighbourhood_vals(df)



In [None]:
# Find ratio of average bookings to total listings -> identify oversaturation
from copy import deepcopy
def neighourhood_ratios(nbhd_vals: dict) -> dict:
    """Generate ratio of superhost listings to total listings and superhost bookings(review proxy) to total bookings.
    Args:
        nbhd_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews]}

    Returns:
        ratio_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews,
                listing_ratio, review_ratio]}
    """
    ratio_vals = deepcopy(nbhd_vals)
    for k,v in ratio_vals.items():
        ratio_vals[k].append(v[2]/v[0])
        ratio_vals[k].append(v[3]/v[1])
    return ratio_vals

nbhd_vals_ratio = neighourhood_ratios(nbhd_vals)


In [None]:
# Find number of inactive properties that have at least 1 review (i.e. no reviews within last 365 days)
# Find most recent date in dataset

# Find properties with at least one review but non within 365 days

def find_inactive_listings(df: "Dataframe", nbhd_vals: dict) -> dict:
    """Get the number of inactive listings per region.
    Args:
        df - pandas dataframe of AirBnB listings
        nbhd_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews, 
            std_nobook, superhost_nobook, listing_ratio, review_ratio]}

    Returns:
        new_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews, 
            std_nobook, superhost_nobook, listing_ratio, review_ratio, std_inactive, superhost_inactive]}
    """
    inactive_threshold = df['last_review'].max() - datetime.timedelta(days=365)
    # Extend our placeholders
    new_vals = deepcopy(nbhd_vals)
    for k in new_vals.keys():
        new_vals[k].append(0)
        new_vals[k].append(0)

    # Count inactive properties
    for i in range(len(df)):
        if df['host_is_superhost'][i] == 't':
            if df['last_review'][i] < inactive_threshold:
                new_vals[df['neighbourhood_cleansed'][i]][9] += 1
        else:
            if df['last_review'][i] < inactive_threshold:
                new_vals[df['neighbourhood_cleansed'][i]][8] += 1

    return new_vals

nbhd_vals_inact = find_inactive_listings(df, nbhd_vals_ratio)


In [None]:
# Find ratio of inactive bookings
def inactive_ratios(nbhd_vals: dict) -> dict:
    """Generate ratio of inactive superhost and total listings.
    Args:
        nbhd_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews, 
            std_nobook, superhost_nobook, listing_ratio, review_ratio, std_inactive, superhost_inactive]}
    Returns:
        inactive_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [std_listing, std_reviews, superhost_listing, superhost_reviews, 
            std_nobook, superhost_nobook, listing_ratio, review_ratio, std_inactive, superhost_inactive,
            total_inact_rat, superhost_inact_rat]}
    """
    inactive_vals = deepcopy(nbhd_vals)
    for k,v in inactive_vals.items():
        inactive_vals[k].append(v[8]/v[0])
        inactive_vals[k].append(v[9]/v[2])
    return inactive_vals

nbhd_inact_ratio = inactive_ratios(nbhd_vals_inact)

In [310]:

# Create new dataframe of neighbourhood level data
df2 = pd.DataFrame.from_dict(nbhd_inact_ratio, orient='index',
                            columns = ['Total Listings', 'Total Reviews', 'Superhost Listings', 
                            'Superhost Reviews', 'No Reviews (Total)', 'No Reviews (Superhost)', 
                            'Listing Ratio', 'Review Ratio', 'Total Inactive (365 days)', 
                            'Superhost Inactive (365 days)', 'Total Inactive Ratio', 'Superhost Inactive Ratio'])
df2.head()
print(df2.sum())

Total Listings                    59942.000000
Total Reviews                    818361.000000
Superhost Listings                 6091.000000
Superhost Reviews                289651.000000
No Reviews (Total)                15094.000000
No Reviews (Superhost)              158.000000
Listing Ratio                         2.169724
Review Ratio                          7.075490
Total Inactive (365 days)         26412.000000
Superhost Inactive (365 days)       277.000000
Total Inactive Ratio                  8.594592
Superhost Inactive Ratio              0.925122
dtype: float64


In [None]:
import wikipedia 
from decimal import getcontext
# wikpedia API returns decimal objects, set precision to 3
getcontext().prec = 3
# Popular destinations taken from: https://upload.wikimedia.org/wikipedia/commons/a/a8/Paris_printable_tourist_attractions_map.jpg
pages = ['Arc de Triomphe', 'Eiffel Tower', 'Champ de Mars', 'Champs-Élysées',
    'Grand Palais', 'Pont Alexandre III', 'Les Invalides', 'Place de la Concorde',
    'Tuileries Garden', 'Musée d\'Orsay', 'Sacré-Cœur, Paris', 'Moulin Rouge', 
    'Galeries Lafayette', 'Palais Garnier', 'Louvre', 'Pont Neuf', 'Sainte-Chapelle',
    'Notre-Dame de Paris', 'University of Paris', 'Panthéon', 'Centre Pompidou',
    'Hôtel de Ville, Paris', 'Place de la Bastille', 'Les Halles']

def parse_page_coords(pages: list) -> dict:
    """Function to parse coordinates from wiki pages.
    Args:
        pages - list of wiki pages to parse

    Returns:
        df-friendly dict format of page coordinates [lat, long]
    """
    attraction_coords = {}
    for page in pages:
        wiki_page = wikipedia.page(title=page)
        attraction_coords.setdefault(page, [])
        try:
            coords = wiki_page.coordinates
            coords = [float(coords[0]), float(coords[1])]
            attraction_coords[page] = coords
        except KeyError:
            # If we can't find the coordinates, we aren't interested
            attraction_coords.pop(page)
    return attraction_coords

attraction_coords = parse_page_coords(pages)


In [None]:
# We will then use the Google Maps API to determine the time between the (listing) and 
# each of the popular attractions. We will write this to a df friendly dict in the form
# {listing_id: [time_to_1, time_to_2, time_to_3...]}

import importlib
import helpers
importlib.reload(helpers)
def get_attraction_times(origin: dict, dest: dict) -> dict:
    """Get time between Paris neighbourhood and attractions.
    Args:
        origin - df-friendly dict format of neighbourhoods
        dest - df-friendly dict format of page coordinates [lat, long]

    Returns:
        times - df-friendly dict format of listing ids and travel times to attractions
    """
    times = {}
    for loc in neighorigin     try:
            times.setdefault(loc, [])
            origin_ = helpers.geocode_location(loc)
            for j in dest.keys():
                dest_ = (dest[j][0], dest[j][1])
                distance = helpers.get_distance(origin_, dest_)
                time = helpers.parse_request_data(distance)
                times[loc].append(time)
        except (IndexError, KeyError) as e:
            print(loc)
            print(e)
    return times

timeneighbourhoods = ['Montparnasse', 'Hotel-de-Ville', 'Menilmontant', 'Rue de Vaugirard', '11th arrondissement',                              'Montmartre', 'Elysee', 'Place du Panthéon', '10th arrondissement', '13th arrondissement', 
                '9th arrondissement', '6th arrondissement', '19th arrondissement', '3rd arrondissement', 
                '1st arrondissement', '7th arrondissement', '12th arrondissement', '2nd arrondissement', 
                '17th arrondissement', '16th arrondissement']s = get_attraction_times(nbhd_neighbourhoodsraction_coords)

In [None]:
# Now we can generate a new dataframe of distance data and regions
df3 = pd.DataFrame.from_dict(times, orient='index',
                            columns=attraction_coords.keys())

In [308]:
# Determine which hosts have multiple properties
def find_multi_hosts(df: "DataFrame") -> list:
    """Determine which host IDs appear multiple times.
    Args:
        df - pandas dataframe of AirBnB listings
    
    Returns:
        multi_hosts - list of host IDs
    """
    multi_hosts = df[df.duplicated('host_id')]['host_id'].tolist()
    # Number of unique multi_hosts
    print(df[df.duplicated('host_id')]['host_id'].nunique())
    return multi_hosts

multi_hosts = find_multi_hosts(df)

# Total number of hosts
num_hosts = df['host_id'].nunique()
print(num_hosts)

3858
9820
56213


In [337]:
# Generate multi-host df
df1 = df[df['host_listings_count'] > 1]

In [342]:
# Find number of multi-hosts
num_mh = df1[df1.duplicated('host_id')]['host_id'].nunique()
# Find number of hosts
num_hosts = df['host_id'].nunique()
# Number of listings
num_list_mh = len(df1)
num_list_tot = len(df)

In [343]:
# Generate df of active listings
date_range = [df['last_review'].max()-datetime.timedelta(days=x) for x in range(366)]
df_act = df[df['last_review'].isin(date_range)]
df1_act = df1[df1['last_review'].isin(date_range)]


In [344]:
# Find average price
avg_prc_tot = df_act['price'].sum()/len(df_act)
avg_prc_mh = df1_act['price'].sum()/len(df1_act)
# Find average price per room
av_rm_tot = avg_prc_tot/df_act['accommodates'].mean()
av_rm_mh = avg_prc_mh/df1_act['accommodates'].mean()
# Find average bookings
avg_book_tot = df_act['number_of_reviews'].sum()/len(df_act)
avg_book_mh = df1_act['number_of_reviews'].sum()/len(df1_act)
# Estimated cash flow
avg_cf_tot = avg_prc_tot*avg_book_tot*len(df_act)
avg_cf_mh = avg_prc_mh*avg_book_mh*len(df1_act)
# Estimated AirBnB revenue, @3% of host and 6% guest
avg_rev_tot = avg_cf_tot*0.03 + avg_cf_tot*0.06
avg_rev_mh = avg_cf_mh*0.03 + avg_cf_mh*0.06

In [328]:
# Generate df of booked and unbooked listings
df_b = df[df['number_of_reviews'].isin(range(1, df['number_of_reviews'].max()))]
df_ub = df[df['number_of_reviews'].isin([0])]

In [334]:
# Generate subset dfs of people with and without profile pictures & verfication
df_b_pp = df_b[(df_b['host_has_profile_pic'] == 't') & (df_b['host_identity_verified'] == 't')]
df_b_npp = df_b[~((df_b['host_has_profile_pic'] == 't') & (df_b['host_identity_verified'] == 't'))]

df_ub_pp = df_ub[(df_ub['host_has_profile_pic'] == 't') & (df_ub['host_identity_verified'] == 't')]
df_ub_npp = df_ub[~((df_ub['host_has_profile_pic'] == 't') & (df_ub['host_identity_verified'] == 't'))]

In [347]:
# % with profile pic and verification
ppv_b = len(df_b_pp)/len(df_b)
ppv_ub = len(df_ub_pp)/len(df_ub)

In [354]:
# Find average price
avg_prc_ub = df_ub['price'].sum()/len(df_ub)
# Find average bookings
avg_book_ub = 7
# Estimated cash flow
avg_cf_ub = avg_prc_ub*avg_book_ub*len(df_ub)
# Estimated AirBnB revenue, @3% of host and 6% guest
avg_rev_ub = avg_cf_ub*0.03 + avg_cf_ub*0.06

In [357]:
# Find number of active properties in each neighbourhood
def get_neighbourhood_vals(df: "Dataframe") -> dict:
    """Get the number of active listings per region.
    Args:
        df - pandas dataframe of AirBnB listings

    Returns:
        nbhd_vals - dict of reviews and listings for superhosts and total per neighbourhood
            in form {neighbourhood: [active_listings]}
    """
    nbhd_vals = {}
    for i in range(len(df)):
        nbhd_vals.setdefault(df['neighbourhood_cleansed'][i], 0)
        nbhd_vals[df['neighbourhood_cleansed'][i]] += 1
    return nbhd_vals

nbhd_act = get_neighbourhood_vals(df)