In [None]:
# Turn off the Jedi completer so that tab autocomplete works in this notebook
%config Completer.use_jedi = False

**Reminder: do this every time you want to commit changes**

Go to `Edit > Clear all outputs` to clear all Notebook outputs before committing changes to the repository.

# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd

In [None]:
import re
import json

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

In [None]:
%matplotlib inline

plt.style.use('seaborn')

mpl.rcParams['figure.dpi'] = 100
mpl.rcParams['figure.figsize'] = [15, 10]
mpl.rcParams['axes.titlesize'] = 24
mpl.rcParams['axes.labelsize'] = 20
mpl.rcParams['lines.linewidth'] = 2
mpl.rcParams['lines.markersize'] = 2
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

# Listings

Exploratory data analysis (EDA) of the `listings` data.

## Loading the data

In [None]:
# When running the notebook in Google Colab, mount Google Drive and load the
# CSV from there instead (uncomment the three lines below):

#from google.colab import drive 
#drive.mount('/content/drive/')
#listings = pd.read_csv('drive/MyDrive/com-480-cam/data/vaud/listings-detailed.csv')

# Otherwise (not in Colab), load the CSV through a relative path
listings = pd.read_csv('../data/vaud/listings-detailed.csv')

In [None]:
# List the available columns before choosing which ones to format and plot
print(listings.columns)

### Some routine formatting

In [None]:
def format_dates(df: pd.DataFrame, feature: str, format: str = '%Y-%m-%d'):
    """Parse one column of ``df`` into pandas datetimes, modifying ``df`` in place.

    Args:
        df: frame that holds the column; it is mutated, nothing is returned.
        feature: name of the column to convert.
        format: ``strftime``-style pattern the raw values follow.
    """
    parsed = pd.to_datetime(df[feature], format=format)
    df[feature] = parsed
    
def format_price(price: str):
    """Convert a price string such as ``'$1,250.00'`` into a float.

    The dollar sign and thousands separators are removed before conversion.
    Values that are not strings (an already-converted float, or NaN for a
    missing price) are simply passed through ``float``, so applying this
    function to a column that has already been formatted does not raise.

    Args:
        price: raw price string, or an already numeric value.

    Returns:
        The price as a float (NaN stays NaN).
    """
    if not isinstance(price, str):
        return float(price)
    # Strip the currency symbol explicitly instead of dropping the first
    # character blindly, which would eat a digit when no '$' is present.
    return float(price.strip().lstrip('$').replace(',', ''))

def find_type(property_type: str):
    """Bucket a free-text property type into ``'place'``, ``'room'`` or ``'other'``.

    Matching is case-insensitive and ordered: a description containing
    "entire" is a ``'place'`` even if it also mentions "room".
    """
    lowered = property_type.lower()
    for keyword, label in (('entire', 'place'), ('room', 'room')):
        if keyword in lowered:
            return label
    return 'other'

In [None]:
# Columns stored as 'YYYY-MM-DD' date strings
dated_features = ['last_scraped', 'host_since', 'calendar_last_scraped',
                  'first_review', 'last_review']

# Columns stored as compact 'YYYYMMDDHHMMSS' timestamps
timestamped_features = ['scrape_id']

for feature in dated_features:
    format_dates(listings, feature)

for feature in timestamped_features:
    format_dates(listings, feature, format='%Y%m%d%H%M%S')

listings['price'] = listings.price.apply(format_price)
# `amenities` is a serialized list, so it has to be decoded before counting:
# len() on the raw string would count characters rather than amenities.
# NOTE(review): assumes each value is a JSON array such as '["Wifi", "Kitchen"]' - confirm against the CSV.
listings['amenities_count'] = listings.amenities.apply(lambda a: len(json.loads(a)))
listings['type'] = listings.property_type.apply(find_type)

## Visualizations

### Pair plot of some variables of interest

In [None]:
# Pairwise relationships between price, beds, amenity count, availability and rating
sns.pairplot(
    data=listings.loc[:, ['price', 'beds', 'amenities_count',
                          'availability_90', 'review_scores_rating']]
)

plt.show()

### What's the most common property type?

In [None]:
# Restrict the histogram to listings whose property type is one of the 10 most frequent
h = sns.histplot(
    data=listings.loc[
        listings['property_type'].isin(listings['property_type'].value_counts().index[:10])
    ],
    x='property_type',
)

plt.xticks(rotation=90)
plt.show()

In [None]:
# Joint distribution of beds vs. guests accommodated, for listings with fewer than 25 beds
sns.displot(
    data=listings.loc[listings['beds'] < 25],
    x="beds",
    y="accommodates",
)

plt.show()

### How does price distribution vary between entire places and private rooms?

In [None]:
# Price/beds density per listing type, for listings under 600 in price and under 10 beds
sns.displot(
    data=listings.loc[
        (listings['price'] < 600)
        & (listings['beds'] < 10)
        & listings['type'].isin(['place', 'room'])
    ],
    x="price",
    y="beds",
    hue="type",
    kind="kde",
    fill=False,
    common_norm=False,
)

plt.show()

In [None]:
f, a = plt.subplots(2, 3)

# One histogram per review sub-score, filling the 2x3 grid row by row
review_score_features = [
    'review_scores_value', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
]
for panel, score in zip(a.flat, review_score_features):
    sns.histplot(listings, x=score, ax=panel)

plt.show()

In [None]:
# Pair grid over the six review sub-scores:
# scatter plots above the diagonal, KDEs on and below it
review_subscores = listings[[
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]]
g = sns.PairGrid(review_subscores, diag_sharey=False)

g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot)

plt.show()

# Reviews

In [None]:
# Load the reviews CSV through a relative path
reviews = pd.read_csv('../data/vaud/reviews.csv')

In [None]:
# Parse the 'YYYY-MM-DD' strings in `date` into pandas datetimes
reviews['date'] = pd.to_datetime(reviews['date'], format='%Y-%m-%d')

In [None]:
# Peek at 10 random rows (no random_state is passed, so the rows differ between runs)
reviews.sample(10)

In [None]:
def find_top_reviews(n=10, data=reviews):
    """Return the ids of the ``n`` listings with the most reviews, as a list.

    ``data`` must have a ``listing_id`` column; ids come back ordered from
    the most reviewed listing downwards.
    """
    # value_counts() orders by descending frequency, so the first n rows are the top n
    review_counts = data['listing_id'].value_counts()
    return review_counts.iloc[:n].index.tolist()

In [None]:
def plot_timeperiod_reviews(start_date="2018-01", end_date="2021-01", n=1 , data=reviews):
    """Plot the monthly number of reviews received by the top ``n`` listings combined.

    Args:
        start_date: first month shown (partial date string, inclusive).
        end_date: last month shown (partial date string, inclusive).
        n: how many of the most-reviewed listings to include.
        data: reviews frame with a datetime ``date`` column and a
            ``listing_id`` column.
    """
    # Rank listings inside the frame that was passed in; calling
    # find_top_reviews(n) alone would always rank the global default frame.
    top_ids = find_top_reviews(n=n, data=data)
    top_reviews = data[data['listing_id'].isin(top_ids)].set_index('date')

    # size() yields a single count per month (0 for empty months); count()
    # would yield one column per input column and draw a line for each.
    monthly_counts = top_reviews.groupby(pd.Grouper(freq='M')).size()

    title_ = "Number of Reviews for the top %s Listings" % n
    monthly_counts.loc[start_date:end_date].plot(title=title_, ylabel="number of reviews")
    

## How does the overall number of reviews evolve over time?

In [None]:
# Combined monthly reviews of the 40 most-reviewed listings, 2015-01 through 2021-01
plot_timeperiod_reviews(n=40, start_date="2015-01", end_date="2021-01")

In [None]:
# Combined monthly reviews of the 30 most-reviewed listings, 2017-01 through 2020-01
plot_timeperiod_reviews(n=30, start_date="2017-01", end_date="2020-01")

## How do the top 5 most-reviewed listings compare over time?

In [None]:
from scipy.signal import savgol_filter

In [None]:
def plot_n_individual_reviews(start_date="2018-01", end_date="2021-01", n=2 , data=reviews, smoothed=True):
    """Plot one monthly review-count curve per listing for the top ``n`` listings.

    Args:
        start_date: first month shown (partial date string, inclusive).
        end_date: last month shown (partial date string, inclusive).
        n: how many of the most-reviewed listings to draw.
        data: reviews frame with a datetime ``date`` column and a
            ``listing_id`` column.
        smoothed: when True, smooth each curve with a Savitzky-Golay filter
            (window 11, polynomial order 3) and clip negative values to 0.
            Smoothing is skipped when fewer months than the window are available.
    """
    # Use the frame that was passed in, both for ranking and for counting
    top_ids = find_top_reviews(n=n, data=data)
    top_reviews = data[data['listing_id'].isin(top_ids)].set_index('date')

    # Rows: months, columns: listing ids, values: reviews in that month
    monthly_counts = (
        top_reviews
        .groupby(['listing_id', pd.Grouper(freq='M')])
        .size()
        .unstack('listing_id', fill_value=0)
        .sort_index()
    )
    title_ = "Number of Reviews of top %s Listings" % n

    window_length, polyorder = 11, 3
    # savgol_filter raises when the series is shorter than its window
    if smoothed and len(monthly_counts) >= window_length:
        smoothed_values = savgol_filter(
            monthly_counts.to_numpy(dtype=float), window_length, polyorder, axis=0
        )
        # The polynomial fit can undershoot below zero; counts cannot be negative
        monthly_counts = pd.DataFrame(
            smoothed_values, index=monthly_counts.index, columns=monthly_counts.columns
        ).clip(lower=0)

    # Smooth on the full history first, then show only the requested period
    monthly_counts.loc[start_date:end_date].plot(title=title_, ylabel="number of reviews")
        

In [None]:
# Per-listing curves for the 5 most-reviewed listings, with the default smoothing
plot_n_individual_reviews(n=5, smoothed=True)

In [None]:
# Same 5 listings, raw monthly counts without smoothing
plot_n_individual_reviews(smoothed=False, n=5)

# Calendar