# AirBnB Price Prediction Model

This notebook goes over building a machine learning model to try to predict the price of an Airbnb listing in NYC based on a dataset of approximately 50,000 listings.

This report contains the following sections:
1. [Data Preparation](#Data-Preparation)
2. [Preprocessing and Transformation Pipelines](#Preprocessing-and-Transformation-Pipelines)
3. [Dummy Regressor Model](#Dummy-Regressor)
4. [KNN Regression Model](#KNN-Regressor)
5. [Ridge Regression Model](#Ridge-Regressor)
6. [Decision Tree Regression Model](#Decision-Tree-Regressor)
7. [Alternate Model Using Price Categories](#Binning-Price-and-Using-a-Classification-Model)

The various models were built to try to find the optimal algorithm to use for this dataset.

## Data Preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw listings and discard the identifier / date columns that are not used as features.
airbnb = pd.read_csv("AB_NYC_2019.csv").drop(
    columns=['id', 'host_id', 'host_name', 'last_review']
)
# Missing listing names are filled here rather than in the pipeline: SimpleImputer could not be
# made to work ahead of CountVectorizer, which appears to be an sklearn limitation.
# https://stackoverflow.com/questions/63000388/how-to-include-simpleimputer-before-countvectorizer-in-a-scikit-learn-pipeline
airbnb['name'] = airbnb['name'].fillna('missing')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the listings as the test set; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(airbnb, test_size=0.2, random_state=123)


In [None]:
# Column dtypes and non-null counts for the training split
train_df.info()

In [None]:
# Number of missing values in each column; note this is computed on the full dataset (`airbnb`), not only on the training split
airbnb.isnull().sum()

In [None]:
# Correlation matrix for the numeric columns. Low correlation between the features and the
# target variable likely means it will be difficult to get a high-scoring model.
# Plot code adapted from https://stackoverflow.com/questions/29432629/plot-correlation-matrix-using-pandas
# The numeric columns are selected once and used for both the matrix and the tick labels:
# DataFrame.corr() raises on string columns in pandas >= 2.0, and the labels must describe
# exactly the columns that were correlated.
numeric_train = train_df.select_dtypes(include='number')
tick_positions = range(numeric_train.shape[1])

f = plt.figure(figsize=(19, 15))
plt.matshow(numeric_train.corr(), fignum=f.number)
plt.xticks(tick_positions, numeric_train.columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, numeric_train.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

In [None]:
# Correlation matrix as a table. Restricted to numeric columns because DataFrame.corr()
# raises on string columns in pandas >= 2.0.
train_df.select_dtypes(include='number').corr()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
train_df.describe(include=[np.number])

In [None]:
# Summary (count, unique, most frequent value and its frequency) for the object-dtype columns
train_df.describe(include=['O'])

In [None]:
# Median price and minimum stay per neighbourhood group. The two columns are selected before
# aggregating so the median is never attempted on string columns (a TypeError in pandas >= 2.0)
# and no unused columns are aggregated.
train_df.groupby('neighbourhood_group')[['price', 'minimum_nights']].median()

In [None]:
# Histogram of price over the whole training split, coloured by neighbourhood group
import altair as alt
alt.data_transformers.disable_max_rows()

alt.Chart(train_df).mark_bar(opacity=0.5, binSpacing=0).encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', stack=None),
    color=alt.Color('neighbourhood_group:N'),
)

In [None]:
# Subset of the training listings priced below 1000
lowerprices = train_df.loc[train_df['price'] < 1000]

In [None]:
# Price histogram for listings under 1000, coloured by neighbourhood group
alt.Chart(lowerprices).mark_bar(opacity=0.6, binSpacing=0).encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', stack=None),
    color=alt.Color('neighbourhood_group:N'),
)

In [None]:
# Price histogram for listings under 1000, coloured by room type
alt.Chart(lowerprices).mark_bar(opacity=0.6, binSpacing=0).encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', stack=None),
    color=alt.Color('room_type:N'),
)

In [None]:
# Price histogram for listings under 1000, coloured by binned number of reviews
alt.Chart(lowerprices).mark_bar(opacity=0.6, binSpacing=0).encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', stack=None),
    color=alt.Color('number_of_reviews:Q', bin=True),
)

In [None]:
# Price histogram for listings under 1000, coloured by binned reviews per month
alt.Chart(lowerprices).mark_bar(opacity=0.6, binSpacing=0).encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', stack=None),
    color=alt.Color('reviews_per_month:Q', bin=True),
)

In [None]:
# Bin a listing by price; the cut-offs were chosen according to the price quartiles
def pricerange(price):
    """Return the price category label for a single listing price.

    price <= 69        -> 'low'
    69 < price <= 106  -> 'medium-low'
    106 < price <= 175 -> 'medium-high'
    price > 175        -> 'high'

    A value for which every comparison is false (e.g. NaN) yields None.
    """
    # Guard clauses: each test is only reached when the previous upper bound was exceeded.
    if price <= 69:
        return 'low'
    if price <= 106:
        return 'medium-low'
    if price <= 175:
        return 'medium-high'
    if price > 175:
        return 'high'
    return None

In [None]:
# Create a new price-category column on the training split.
# The category is computed from the training rows' own prices instead of row-applying over the
# whole `airbnb` frame and relying on index alignment, and .assign() returns a new frame rather
# than writing into the split in place (which triggers pandas' SettingWithCopyWarning).
train_df = train_df.assign(price_range=train_df['price'].apply(pricerange))

In [None]:
# Price histogram of the training split, coloured by price category with a fixed colour per category
alt.Chart(train_df).mark_bar(opacity=1, binSpacing=0).encode(
    x=alt.X('price:Q', bin=alt.Bin(maxbins=100)),
    y=alt.Y('count()', stack=None),
    color=alt.Color(
        'price_range:N',
        scale=alt.Scale(
            domain=['high', 'medium-high', 'medium-low', 'low'],
            range=['black', 'blue', 'green', 'red'],
        ),
    ),
)

In [None]:
# REALLY SLOW: the interactive map is slow to build and load because every listing is added
# to the map as its own marker.
# Code is taken from a previous project.
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# create map
map_airbnb = folium.Map(location=[40.73293,-73.99782], zoom_start=11)
# price category -> position in colorlist (low=red, medium-low=green, medium-high=blue, high=black)
pricedict = {'low':0,'medium-low':1,'medium-high':2, 'high':3}
colorlist = ['#FF0000','#00FF00','#0000FF','#000000']

# add one circle marker per training listing
for lat, lon, name, price in zip(train_df['latitude'], train_df['longitude'], train_df['name'], train_df['price_range']):
    label = folium.Popup(str(name) + ' PRICE RANGE: ' + str(price), parse_html=True)
    # Look the colour up once; it is used for both the outline and the fill.
    marker_color = colorlist[pricedict[price]]
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.4).add_to(map_airbnb)

map_airbnb

In [None]:
# Write the interactive map to a standalone HTML file
output_file = "nyccolormap.html"
map_airbnb.save(output_file)


In [None]:
# Split each frame into features (X) and the regression target (y = price).
# `price_range` is derived from the target and is only ever added to the training frame, so it
# is removed from the features too; this keeps X_train and X_test on the same columns.
# errors='ignore' keeps the cell working when the column was never added.
X_train = train_df.drop(columns=['price', 'price_range'], errors='ignore')
y_train = train_df["price"]

X_test = test_df.drop(columns=['price', 'price_range'], errors='ignore')
y_test = test_df["price"]

## Preprocessing and Transformation Pipelines

In [None]:
# Columns fed to the regression models, grouped by the preprocessing they receive.
# 'reviews_per_month' was previously listed twice, which passed the same column to the model
# twice (double-weighting it in distance-based models such as KNN); each feature now appears once.
numeric_features = ['latitude', 'longitude', 'number_of_reviews', 'reviews_per_month']
categorical_features = ['room_type']
#text_features = ['name']

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer





# Numeric columns: fill missing values with the constant 0, then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(fill_value=0, strategy="constant")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: one-hot encode; handle_unknown="ignore" avoids an error when a category
# shows up at transform time that was not present during fit.
categorical_transformer = Pipeline(
    [("onehotencoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Text features are not used by the regression models (kept for reference):
#text_transformer = make_pipeline(CountVectorizer(max_features = 3),StandardScaler(with_mean=False)) 

In [None]:
from sklearn.compose import ColumnTransformer

# Send each feature group through its transformer; columns not listed in a group are dropped.
preprocessor = ColumnTransformer(
    [
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
        #("text", text_transformer, 'name')
    ],
    remainder="drop",
)

## Dummy Regressor

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline pipeline: shared preprocessing followed by a DummyRegressor
pipe1 = make_pipeline(preprocessor, DummyRegressor())
pipe1

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# The baseline has a single candidate setting.
param_grid = {
    "dummyregressor__strategy": ['mean']
}
# n_iter is tied to the number of candidates: requesting more iterations than the grid holds
# (previously n_iter=5 for 1 candidate) makes scikit-learn emit a UserWarning and evaluate
# every candidate anyway, so the scores are unchanged.
random_search = RandomizedSearchCV(
    pipe1,
    param_distributions=param_grid,
    n_iter=len(param_grid["dummyregressor__strategy"]),
    cv=2,
    verbose=2,
    n_jobs=2,
    return_train_score=True,
)
random_search.fit(X_train, y_train);

In [None]:
# Cross-validation summary for each candidate of the most recent search
pd.DataFrame(random_search.cv_results_).loc[
    :, ["params", "mean_test_score", "mean_train_score", "rank_test_score"]
]

## KNN Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Shared preprocessing followed by a k-nearest-neighbours regressor
pipe = make_pipeline(preprocessor, KNeighborsRegressor())
pipe

In [None]:

# Candidate neighbourhood sizes for the KNN regressor.
param_grid = {
    "kneighborsregressor__n_neighbors": [70,80,90,100, 110,120,130,140]
}
# n_iter is tied to the number of candidates: the previous n_iter=16 exceeded the 8 available
# values, which makes scikit-learn emit a UserWarning and evaluate every candidate anyway.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_iter=len(param_grid["kneighborsregressor__n_neighbors"]),
    cv=3,
    verbose=2,
    n_jobs=1,
    return_train_score=True,
)
random_search.fit(X_train, y_train);

In [None]:
from IPython.display import HTML

# Render the per-candidate train/validation scores as an HTML table
score_columns = ["params", "mean_test_score", "mean_train_score"]
HTML(pd.DataFrame(random_search.cv_results_)[score_columns].to_html())

## Ridge Regressor

In [None]:
from sklearn.linear_model import Ridge

# Shared preprocessing followed by a ridge (L2-regularised linear) regressor
pipe3 = make_pipeline(preprocessor, Ridge())
pipe3

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Six regularisation strengths, one per power of ten from 10^-2 to 10^3
param_grid = {"ridge__alpha": 10.0**np.arange(-2, 4, 1)}
random_search = RandomizedSearchCV(
    pipe3,
    param_distributions=param_grid,
    n_iter=6,
    cv=2,
    n_jobs=2,
    verbose=2,
    return_train_score=True,
)
random_search.fit(X_train, y_train);

In [None]:
# Per-candidate scores for the search that was fitted last
summary_columns = ["params", "mean_test_score", "mean_train_score", "rank_test_score"]
pd.DataFrame(random_search.cv_results_)[summary_columns]

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Shared preprocessing followed by a decision-tree regressor
pipe4 = make_pipeline(preprocessor, DecisionTreeRegressor())
pipe4

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate maximum depths for the decision tree.
param_grid = {
    "decisiontreeregressor__max_depth": [2,3,4,5]
}
# n_iter is tied to the number of candidates: the previous n_iter=10 exceeded the 4 available
# values, which makes scikit-learn emit a UserWarning and evaluate every candidate anyway.
random_search = RandomizedSearchCV(
    pipe4,
    param_distributions=param_grid,
    n_iter=len(param_grid["decisiontreeregressor__max_depth"]),
    cv=3,
    verbose=2,
    n_jobs=2,
    return_train_score=True,
)
random_search.fit(X_train, y_train);

In [None]:
# Train/validation scores and rank for every candidate of the latest search
pd.DataFrame(random_search.cv_results_).loc[
    :, ["params", "mean_test_score", "mean_train_score", "rank_test_score"]
]

## Binning Price and Using a Classification Model

Because the regression models scored poorly, we reframed the task as classification: instead of predicting the exact price, the model predicts which price range a listing falls in. Prices are binned into four categories that correspond approximately to the four price quartiles: low, medium-low, medium-high and high.

In [None]:
# Re-read the data to start fresh, applying the same column drops as before
airbnb = pd.read_csv("AB_NYC_2019.csv").drop(
    columns=['id', 'host_id', 'host_name', 'last_review']
)
# Missing names are filled before the pipeline because SimpleImputer could not be made to work
# ahead of CountVectorizer, which appears to be an sklearn limitation.
# https://stackoverflow.com/questions/63000388/how-to-include-simpleimputer-before-countvectorizer-in-a-scikit-learn-pipeline
airbnb['name'] = airbnb['name'].fillna('missing')

In [None]:
def pricerange(price):
    """Map a listing price to its category label.

    Categories by inclusive upper bound: 69 -> 'low', 106 -> 'medium-low',
    175 -> 'medium-high'; anything above 175 is 'high'. A value that
    satisfies none of the comparisons (e.g. NaN) returns None.
    """
    # Bounds are checked in ascending order, so the first match is the tightest bin.
    for upper_bound, label in ((69, 'low'), (106, 'medium-low'), (175, 'medium-high')):
        if price <= upper_bound:
            return label
    return 'high' if price > 175 else None


In [None]:
# Label every listing with its price category. The function only needs the price, so it is
# applied element-wise to that column rather than row-wise over the whole frame
# (DataFrame.apply with axis=1 builds a Series per row and is much slower for the same result).
airbnb['price_range'] = airbnb['price'].apply(pricerange)

In [None]:
from sklearn.model_selection import train_test_split
# Same 80/20 split parameters and random_state as used for the regression models
train_df, test_df = train_test_split(airbnb, test_size=0.2, random_state=123)


In [None]:
# Features: everything except the raw price and the category derived from it
X_train = train_df.drop(columns=["price", "price_range"])
X_test = test_df.drop(columns=["price", "price_range"])

# Target: the price category
y_train = train_df["price_range"]
y_test = test_df["price_range"]

In [None]:
# Columns fed to the classification models, grouped by the preprocessing they receive
numeric_features = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'availability_365',
]
categorical_features = ['room_type']
text_features = ['name']

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer





# Numeric columns: fill missing values with the constant 0, then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(fill_value=0, strategy="constant")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: one-hot encode; handle_unknown="ignore" avoids an error when a category
# shows up at transform time that was not present during fit.
categorical_transformer = Pipeline(
    [("onehotencoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Listing name: token counts limited to 3 vocabulary terms, scaled without centring
# (with_mean=False) so the sparse count matrix stays sparse.
text_transformer = make_pipeline(
    CountVectorizer(max_features=3),
    StandardScaler(with_mean=False),
)

In [None]:
from sklearn.compose import ColumnTransformer

# Send each feature group through its transformer; columns not listed in a group are dropped.
# The text column is given as a plain string (not a list) so CountVectorizer receives a 1-D column.
preprocessor = ColumnTransformer(
    [
        ("numeric", numeric_transformer, numeric_features),
        ("categorical", categorical_transformer, categorical_features),
        ("text", text_transformer, 'name'),
    ],
    remainder="drop",
)

## Dummy Classifier Model

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: always predict the most frequent price category; the score below is on the training data
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy.score(X_train, y_train)

## KNeighbours Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Shared preprocessing followed by a k-nearest-neighbours classifier
pipe6 = make_pipeline(preprocessor, KNeighborsClassifier())
pipe6

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate neighbourhood sizes for the KNN classifier.
param_grid = {
    "kneighborsclassifier__n_neighbors": [5,10,15,20,25,30,35,40,50,60,70,80,90,100]
}
# n_iter is tied to the number of candidates: the previous n_iter=16 exceeded the 14 available
# values, which makes scikit-learn emit a UserWarning and evaluate every candidate anyway.
random_search = RandomizedSearchCV(
    pipe6,
    param_distributions=param_grid,
    n_iter=len(param_grid["kneighborsclassifier__n_neighbors"]),
    cv=3,
    verbose=2,
    n_jobs=1,
    return_train_score=True,
)
random_search.fit(X_train, y_train);

In [None]:
# Cross-validation scores and rank for each candidate of the classifier search
result_columns = ["params", "mean_test_score", "mean_train_score", "rank_test_score"]
pd.DataFrame(random_search.cv_results_)[result_columns]