In [19]:
# Statistics
import pandas as pd
import numpy as np
import math as mt

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Data Preprocessing - Standardization, Encoding, Imputation
from sklearn.preprocessing import StandardScaler # Standardization
from sklearn.preprocessing import Normalizer # Normalization
from sklearn.preprocessing import OneHotEncoder # One-hot Encoding
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from category_encoders import MEstimateEncoder # Target Encoding
from sklearn.preprocessing import PolynomialFeatures # Create Polynomial Features
from sklearn.impute import SimpleImputer # Imputation

# Exploratory Data Analysis - Feature Engineering
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA

# Modeling - ML Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Modeling - Algorithms
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# ML - Evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# ML - Tuning
import optuna
#from sklearn.model_selection import GridSearchCV

# Settings
# Settings for Seaborn
sns.set_theme(context='notebook', style='ticks', palette="bwr_r", font_scale=0.7, rc={"figure.dpi":240, 'savefig.dpi':240})

In [20]:
import os
kaggle_project = 'settle-airbnb'
# Import dataset from local directory './data' or from Kaggle
data_dir = ('./data' if os.path.exists('data') else f'/kaggle/input/{kaggle_project}')

# print all files in data_dir
for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Import three datasets
review = pd.read_csv(f'{data_dir}/reviews.csv')
listings = pd.read_csv(f'{data_dir}/listings.csv')
calendar = pd.read_csv(f'{data_dir}/calendar.csv')

./data/run-DataSink0-1-part-r-00000
./data/reviews.csv
./data/listings.csv
./data/calendar.csv
./data/listings.json
./data/.ipynb_checkpoints/listings-checkpoint.json
./data/.ipynb_checkpoints/run-DataSink0-1-part-r-00000-checkpoint
./data/.ipynb_checkpoints/listings-checkpoint.csv


In [147]:
# Parsing calendar

# Convert date from object to datetime
calendar.date = pd.to_datetime(calendar.date)
# Convert price from object to float
# Convert '$' and ',' to ''
calendar.price = calendar.price.replace('[\$,]', '', regex=True).astype(float)

# Drop the data in 2017. The number of dropped rows is 7636 (0.54%).
calendar.drop(index = calendar[calendar.date.dt.year == 2017].index, inplace=True)
assert calendar.date.dt.year.mean() == 2016, 'calendar data must in 2016'

# Add month, quarter to calendar
calendar['month'] = calendar.date.dt.month
calendar['quarter'] = calendar.date.dt.quarter

"""
SELECT listing_id, AVG(price) AS avg_price
FROM calendar
GROUP BY listing_id
"""
# calendar.groupby('listing_id').mean('price')['price'].reset_index(name='avg_price')

'\nSELECT listing_id, AVG(price) AS avg_price\nFROM calendar\nGROUP BY listing_id\n'

In [33]:
# Parsing listings

# Replace amenities from {}" to ''
listings.amenities.replace('[{}"]', '', regex=True, inplace=True)
# Split amenities with ,
amenities = listings.amenities.str.split(',', expand=True)

# For each col, extract the unique amenities
amenities_uniques = []
for col in amenities.columns:
    amenities_uniques += list(amenities[col].unique())

# Remove the duplicate values
amenities_uniques = set(amenities_uniques)
amenities_uniques.remove('')
amenities_uniques.remove(None)
# Only two rows have Washer / Dryer, and they both have washer and dryer
amenities_uniques.remove('Washer / Dryer')
# When 'Pets live on this property' is True, one or more from 'Cat(s)', 'Dog(s)', 'Other pet(s)' will appear
amenities_uniques


_df = pd.DataFrame()

for amenity in amenities_uniques:
    _df[amenity] = listings.amenities.str.contains(amenity)

  _df[amenity] = listings.amenities.str.contains(amenity)


In [37]:
listings.amenities.str.contains('TV')

0        True
1        True
2        True
3       False
4        True
        ...  
3813     True
3814     True
3815     True
3816     True
3817     True
Name: amenities, Length: 3818, dtype: bool

In [35]:
listings.amenities.str.extract?

[0;31mSignature:[0m
[0mlistings[0m[0;34m.[0m[0mamenities[0m[0;34m.[0m[0mstr[0m[0;34m.[0m[0mextract[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpat[0m[0;34m:[0m [0;34m'str'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mflags[0m[0;34m:[0m [0;34m'int'[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mexpand[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'FrameOrSeriesUnion | Index'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Extract capture groups in the regex `pat` as columns in a DataFrame.

For each subject string in the Series, extract groups from the
first match of regular expression `pat`.

Parameters
----------
pat : str
    Regular expression pattern with capturing groups.
flags : int, default 0 (no flags)
    Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
    modify regular expression matching for things like case,
    spaces, etc.

In [148]:
calendar_annual = calendar.groupby('listing_id').mean('price')['price'].reset_index(name='avg_price')

In [149]:
"""
SELECT *
FROM listings
LEFT JOIN calendar_annual
    ON listings.id = calendar_annual.listing_id
"""

calendar_annual.set_index(['listing_id'], inplace=True)
listings.set_index(['id'], inplace=True)

df = listings.join(calendar_annual, how='left')

{'24-Hour Check-in',
 'Air Conditioning',
 'Breakfast',
 'Buzzer/Wireless Intercom',
 'Cable TV',
 'Carbon Monoxide Detector',
 'Cat(s)',
 'Dog(s)',
 'Doorman',
 'Dryer',
 'Elevator in Building',
 'Essentials',
 'Family/Kid Friendly',
 'Fire Extinguisher',
 'First Aid Kit',
 'Free Parking on Premises',
 'Gym',
 'Hair Dryer',
 'Hangers',
 'Heating',
 'Hot Tub',
 'Indoor Fireplace',
 'Internet',
 'Iron',
 'Kitchen',
 'Laptop Friendly Workspace',
 'Lock on Bedroom Door',
 'Other pet(s)',
 'Pets Allowed',
 'Pets live on this property',
 'Pool',
 'Safety Card',
 'Shampoo',
 'Smoke Detector',
 'Smoking Allowed',
 'Suitable for Events',
 'TV',
 'Washer',
 'Wheelchair Accessible',
 'Wireless Internet'}

In [527]:
df = pd.DataFrame()

for amenity in amenities_uniques:
    df[amenity] = listings.amenities.str.contains(amenity)

In [29]:
[1] + [1, 2]

[1, 1, 2]

In [529]:
df

Unnamed: 0_level_0,Pets Allowed,Free Parking on Premises,Elevator in Building,Dryer,Pets live on this property,Other pet(s),Hangers,Fire Extinguisher,Wheelchair Accessible,TV,...,Heating,Smoking Allowed,Cable TV,Carbon Monoxide Detector,Air Conditioning,Internet,Lock on Bedroom Door,Cat(s),Suitable for Events,Smoke Detector
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
241032,False,False,False,True,False,False,False,False,False,True,...,True,False,True,False,True,True,False,False,False,False
953595,False,True,False,True,False,False,False,True,False,True,...,True,False,False,True,False,True,False,False,False,True
3308979,True,True,False,True,True,False,False,False,False,True,...,True,False,True,True,True,True,False,False,False,True
7421966,False,False,False,True,False,False,False,True,False,False,...,True,False,False,True,False,True,False,False,False,True
278830,False,False,False,False,False,False,False,True,False,True,...,True,False,True,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8101950,False,True,False,True,False,False,False,True,True,True,...,True,False,True,False,True,True,False,False,False,True
8902327,False,True,False,True,False,False,True,True,False,True,...,True,False,True,False,False,True,False,False,False,True
10267360,False,True,False,True,True,False,True,True,False,True,...,True,False,True,False,False,True,False,False,False,True
9604740,False,True,True,True,False,False,True,False,False,True,...,True,False,False,False,False,True,False,False,False,True


In [21]:
listings.amenities

0       {TV,"Cable TV",Internet,"Wireless Internet","A...
1       {TV,Internet,"Wireless Internet",Kitchen,"Free...
2       {TV,"Cable TV",Internet,"Wireless Internet","A...
3       {Internet,"Wireless Internet",Kitchen,"Indoor ...
4       {TV,"Cable TV",Internet,"Wireless Internet",Ki...
                              ...                        
3813    {TV,"Cable TV",Internet,"Wireless Internet","A...
3814    {TV,"Cable TV",Internet,"Wireless Internet",Ki...
3815    {"Cable TV","Wireless Internet",Kitchen,"Free ...
3816    {TV,"Wireless Internet",Kitchen,"Free Parking ...
3817    {TV,"Cable TV",Internet,"Wireless Internet",Ki...
Name: amenities, Length: 3818, dtype: object

In [22]:
listings.amenities[0]

'{TV,"Cable TV",Internet,"Wireless Internet","Air Conditioning",Kitchen,Heating,"Family/Kid Friendly",Washer,Dryer}'

In [23]:
listings.amenities.replace('[{}"]', '', regex=True, inplace=True)

In [25]:
listings.amenities[0]

'TV,Cable TV,Internet,Wireless Internet,Air Conditioning,Kitchen,Heating,Family/Kid Friendly,Washer,Dryer'

In [32]:
listings.amenities.str.split(',', expand=True).unique()

AttributeError: 'DataFrame' object has no attribute 'unique'