In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
from re import sub
from decimal import Decimal
from sklearn.preprocessing import MultiLabelBinarizer
from scipy import stats

# Load the raw listings table; the path is relative to the working directory.
airbnb_uc = pd.read_csv("data/listings_CPH.csv")

In [12]:
#print(airbnb_uc[['neighbourhood_group']].to_string(index=False)) 
#print(airbnb_uc[['license']].to_string(index=False)) 


In [2]:
# Drop the columns that are not used in the analysis, in a single call:
#   name, host_id, host_name, last_review -> not relevant
#   neighbourhood_group, license          -> contain only NaN values
airbnb_uc.drop(
    columns=[
        'name',
        'host_id',
        'neighbourhood_group',
        'license',
        'host_name',
        'last_review',
    ],
    inplace=True,
)

In [3]:
# Count the missing values per column.
airbnb_uc.isna().sum()

id                                   0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
reviews_per_month                 1645
calculated_host_listings_count       0
availability_365                     0
number_of_reviews_ltm                0
dtype: int64

In [4]:
# Fill the missing reviews_per_month entries with 0
# (presumably listings that have no reviews yet -- confirm),
# then verify that no missing values remain in that column.
airbnb_uc['reviews_per_month'] = airbnb_uc['reviews_per_month'].fillna(0)
airbnb_uc['reviews_per_month'].isna().sum()

0

In [5]:
# Create dummy (one-hot) variables for the neighbourhood column.
# Labels are lower-cased and spaces replaced by underscores so that they
# make convenient column names.
# dtype is pinned to uint8 (the pre-2.0 pandas default) so the indicators stay
# numeric 0/1 on every pandas version; pandas >= 2.0 would otherwise emit bools.
neighbourhood = pd.get_dummies(
    airbnb_uc['neighbourhood'].str.lower().str.replace(' ', '_', regex=False),
    dtype='uint8',
)

# Attach the indicator columns row-by-row (index-aligned) to the listings.
airbnb_uc1 = pd.merge(airbnb_uc, neighbourhood, left_index=True, right_index=True)

# The original categorical column is now redundant.
# `columns=` is used because passing the axis positionally was removed in pandas 2.0.
airbnb_uc1 = airbnb_uc1.drop(columns='neighbourhood')

airbnb_uc1.head()

Unnamed: 0,id,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,...,amager_vest,bispebjerg,brnshj-husum,frederiksberg,indre_by,nrrebro,sterbro,valby,vanlse,vesterbro-kongens_enghave
0,6983,55.68641,12.54741,Entire home/apt,898,3,172,1.08,1,0,...,0,0,0,0,0,1,0,0,0,0
1,26057,55.69307,12.57649,Entire home/apt,2600,4,59,0.55,1,303,...,0,0,0,0,1,0,0,0,0,0
2,26473,55.67602,12.5754,Entire home/apt,3250,3,300,2.06,3,56,...,0,0,0,0,1,0,0,0,0,0
3,29118,55.67023,12.55504,Entire home/apt,725,7,24,0.16,1,59,...,0,0,0,0,0,0,0,0,0,1
4,31094,55.666602,12.555283,Entire home/apt,1954,3,19,0.13,1,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Create dummy (one-hot) variables for the room_type column.
# dtype is pinned to uint8 (the pre-2.0 pandas default) so the indicators stay
# numeric 0/1 on every pandas version; pandas >= 2.0 would otherwise emit bools.
room_type = pd.get_dummies(airbnb_uc['room_type'], dtype='uint8')

# Attach the indicator columns (index-aligned) to the frame that already
# carries the neighbourhood dummies.
airbnb = pd.merge(airbnb_uc1, room_type, left_index=True, right_index=True)

# The original categorical column is now redundant.
# `columns=` is used because passing the axis positionally was removed in pandas 2.0.
airbnb = airbnb.drop(columns='room_type')

airbnb.head()

Unnamed: 0,id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,...,indre_by,nrrebro,sterbro,valby,vanlse,vesterbro-kongens_enghave,Entire home/apt,Hotel room,Private room,Shared room
0,6983,55.68641,12.54741,898,3,172,1.08,1,0,4,...,0,1,0,0,0,0,1,0,0,0
1,26057,55.69307,12.57649,2600,4,59,0.55,1,303,8,...,1,0,0,0,0,0,1,0,0,0
2,26473,55.67602,12.5754,3250,3,300,2.06,3,56,7,...,1,0,0,0,0,0,1,0,0,0
3,29118,55.67023,12.55504,725,7,24,0.16,1,59,2,...,0,0,0,0,0,1,1,0,0,0
4,31094,55.666602,12.555283,1954,3,19,0.13,1,0,2,...,0,0,0,0,0,1,1,0,0,0


In [8]:
# Normalizing price, minimum nights, number of reviews, reviews per month,
# calculated host listings count and availability to values between 0 and 1.

def normalizing(column):
    """Min-max scale ``column`` to the range [0, 1].

    Each value ``x`` is mapped to ``(x - min) / (max - min)``.

    Parameters
    ----------
    column : pandas.Series (or any object with ``min``/``max``/``astype``
        that supports arithmetic, e.g. a NumPy array)
        The values to rescale.

    Returns
    -------
    Same container type as ``column``
        The rescaled values. If every value is identical (``max == min``)
        the result is all zeros instead of the NaNs a 0/0 division would
        produce; missing entries stay missing.
    """
    lowest = column.min()
    spread = column.max() - lowest
    shifted = column - lowest

    # A scalar, non-missing spread of zero means the column is constant:
    # dividing would yield NaN everywhere, so return the (all-zero) shifted
    # values as floats instead.
    if np.ndim(spread) == 0 and pd.notna(spread) and spread == 0:
        return shifted.astype(float)

    return shifted / spread

# Rescale each numeric feature column with `normalizing`, one column at a time.
columns_to_scale = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'number_of_reviews_ltm',
]
for feature in columns_to_scale:
    airbnb[feature] = normalizing(airbnb[feature])

airbnb.head()



Unnamed: 0,id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,...,indre_by,nrrebro,sterbro,valby,vanlse,vesterbro-kongens_enghave,Entire home/apt,Hotel room,Private room,Shared room
0,6983,55.68641,12.54741,0.013837,0.001802,0.241913,0.044963,0.0,0.0,0.008493,...,0,1,0,0,0,0,1,0,0,0
1,26057,55.69307,12.57649,0.040062,0.002703,0.082982,0.022898,0.0,0.830137,0.016985,...,1,0,0,0,0,0,1,0,0,0
2,26473,55.67602,12.5754,0.050077,0.001802,0.421941,0.085762,0.011236,0.153425,0.014862,...,1,0,0,0,0,0,1,0,0,0
3,29118,55.67023,12.55504,0.011171,0.005405,0.033755,0.006661,0.0,0.161644,0.004246,...,0,0,0,0,0,1,1,0,0,0
4,31094,55.666602,12.555283,0.030108,0.001802,0.026723,0.005412,0.0,0.0,0.004246,...,0,0,0,0,0,1,1,0,0,0


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) per column,
# e.g. to check that the rescaled features now span 0 to 1.
airbnb.describe()

Unnamed: 0,id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,...,indre_by,nrrebro,sterbro,valby,vanlse,vesterbro-kongens_enghave,Entire home/apt,Hotel room,Private room,Shared room
count,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,...,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0,13815.0
mean,1.719385e+17,55.680569,12.557805,0.018581,0.003248,0.02612,0.033147,0.019105,0.255624,0.011573,...,0.149692,0.164169,0.109374,0.047991,0.025335,0.165762,0.88527,0.001086,0.112269,0.001375
std,2.852538e+17,0.021088,0.033512,0.022082,0.015225,0.051199,0.045081,0.119655,0.33743,0.024149,...,0.356783,0.370443,0.312119,0.213756,0.157146,0.37188,0.318708,0.032934,0.315709,0.037061
min,6983.0,55.60951,12.43567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19079620.0,55.66611,12.540285,0.011233,0.000901,0.002813,0.006245,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,39315420.0,55.68119,12.55552,0.015331,0.001802,0.009845,0.017902,0.0,0.054795,0.006369,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,5.665348e+17,55.69577,12.580741,0.021086,0.002703,0.028129,0.041632,0.0,0.476712,0.012739,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,7.222925e+17,55.7428,12.65174,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
