In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
# Directory holding the three Seattle CSV files; change it here once if the
# dataset is mounted somewhere else.
DATA_DIR = '/kaggle/input/seattle'

# dfc <- calendar.csv, dfl <- listings.csv, dfr <- reviews.csv
dfc = pd.read_csv(os.path.join(DATA_DIR, 'calendar.csv'))
dfl = pd.read_csv(os.path.join(DATA_DIR, 'listings.csv'))
dfr = pd.read_csv(os.path.join(DATA_DIR, 'reviews.csv'))

In [31]:
# Preview the first rows only; rendering the entire listings frame bloats the
# notebook output without adding information.
dfl.head()

#  Find how Airbnb's prices vary based on features of the listing. 

# 1. Data Understanding
What are the standard deviation, mean, percentile values, etc. of this data?
Verify data quality and identify the percentage of null values in each column.

In [4]:
# Let pandas render up to 100 columns so wide summaries are not truncated.
pd.options.display.max_columns = 100

# Summary statistics (count, mean, std, min, quartiles, max) of the listings.
dfl.describe()

In [5]:
# Share of null entries per column, as a fraction between 0 and 1 (the mean of
# a boolean mask equals its sum divided by the row count).
null_fraction = dfl.isna().mean()
missing_df = pd.DataFrame({"column": dfl.columns, "percent": null_fraction})

# Show only the columns in which more than half of the values are missing.
missing_df.loc[missing_df["percent"] > 0.5]

In [6]:
# Count how many columns of the listings frame there are per dtype.
dfl.dtypes.value_counts()

In [7]:
# Frequency of each distinct value in the 'jurisdiction_names' column.
dfl['jurisdiction_names'].value_counts()

Visualise data using histogram if needed. 

In [8]:
# One histogram per numeric column. The default figure size is too small to
# read with this many subplots, so enlarge it; the trailing semicolon
# suppresses the array-of-Axes repr in the cell output.
dfl.hist(figsize=(20, 20));

See if there are columns which have low variance

In [9]:
# Flag numeric columns whose standard deviation is below 0.2.
# numeric_only=True is required: since pandas 2.0, std() no longer silently
# skips string columns and raises a TypeError on them instead.
dfl.std(numeric_only=True) < 0.2

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
# Correlation heatmap of the numeric (and boolean) listing columns.
# The columns are selected explicitly because, since pandas 2.0, corr() raises
# on string columns instead of dropping them silently.
fig, ax = plt.subplots(figsize=(30, 30))
numeric_listings = dfl.select_dtypes(include=['number', 'bool'])
# Draw on the axes created above rather than relying on the "current axes".
sns.heatmap(numeric_listings.corr(), annot=True, fmt=".2f", ax=ax);

# 2. Data Preparation 

Drop columns which have high correlation

In [12]:
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so this cell can be re-run without raising a KeyError.
dfl = dfl.drop(columns=['license', 'availability_30', 'availability_60'], errors='ignore')

Drop columns with low std deviation/variance


In [None]:
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so this cell can be re-run without raising a KeyError.
dfl = dfl.drop(columns=['latitude', 'longitude'], errors='ignore')

In [14]:
# Reassign instead of dropping in place, and ignore the column if it is
# already gone, so this cell can be re-run without raising a KeyError.
dfl = dfl.drop(columns=['square_feet'], errors='ignore')

Price is stored as a string (containing "$" and "," characters) rather than a float, so convert it to float.

In [16]:
# Strip the "$" and "," characters from the price strings and convert to float.
# The dtype guard makes the cell idempotent: once the column is numeric, a
# re-run is a no-op instead of failing on float.replace. The vectorized
# .str accessor also leaves missing prices as NaN instead of raising.
if not pd.api.types.is_numeric_dtype(dfl['price']):
    dfl['price'] = (
        dfl['price']
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
    )

In [17]:
# Display the price column to check the result of the float conversion.
dfl['price']

In [18]:
# Correlation heatmap again, now that 'price' is numeric and takes part in it.
# The columns are selected explicitly because, since pandas 2.0, corr() raises
# on string columns instead of dropping them silently.
fig, ax = plt.subplots(figsize=(30, 30))
numeric_listings = dfl.select_dtypes(include=['number', 'bool'])
# Draw on the axes created above rather than relying on the "current axes".
sns.heatmap(numeric_listings.corr(), annot=True, fmt=".2f", ax=ax);

# 3. Modeling

Try by removing null values

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Numeric listing attributes used as predictors of price.
features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included']

# Keep only complete rows. Assigning the result of dropna() (instead of
# calling it with inplace=True on a slice of dfl) avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
dfl_new = dfl[features + ['price']].dropna()
X = dfl_new[features]
y = dfl_new['price']

# Hold out 10% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=62)

# Four steps: instantiate, fit, predict, score.

# Instantiate. The `normalize` argument was removed in scikit-learn 1.2; for
# ordinary least squares, rescaling the features does not change the
# predictions, so the plain estimator is equivalent.
lm_model = LinearRegression()

# Fit
lm_model.fit(X_train, y_train)

# Predict
y_predict = lm_model.predict(X_test)

# Score (lm_model.score returns R^2 on the given split)
score = lm_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_predict)
r2e = r2_score(y_test, y_predict)

# 4. Evaluation

In [20]:
# Mean squared error on the test split, as computed by the modeling cell.
mse

In [21]:

# R^2 score on the test split, as computed by the modeling cell.
r2e

Since the R² score is very low, we try imputing the missing values instead of dropping them.

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Numeric listing attributes used as predictors of price.
features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included']

# The target is never imputed: a row without a price carries no label to learn
# from, so it is dropped rather than filled with the mean price.
dfl_new = dfl[features + ['price']].dropna(subset=['price'])
X = dfl_new[features]
y = dfl_new['price']

# Hold out 10% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=62)

# Impute missing feature values with the column means of the TRAINING rows
# only, and reuse those same means for the test rows. Computing the means on
# the full frame before splitting would leak test information into the fit.
feature_means = X_train.mean()
X_train = X_train.fillna(feature_means)
X_test = X_test.fillna(feature_means)

# Four steps: instantiate, fit, predict, score.

# Instantiate. The `normalize` argument was removed in scikit-learn 1.2; for
# ordinary least squares, rescaling the features does not change the
# predictions, so the plain estimator is equivalent.
lm_model = LinearRegression()

# Fit
lm_model.fit(X_train, y_train)

# Predict
y_predict = lm_model.predict(X_test)

# Score (lm_model.score returns R^2 on the given split)
score = lm_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_predict)
r2e = r2_score(y_test, y_predict)

In [23]:
# Mean squared error on the test split for the model trained on imputed data.
mse

In [24]:
# R^2 score on the test split for the model trained on imputed data.
r2e

Since the R² score is low, we cannot predict price accurately with the given data or features. We could also experiment with another model, but since the data is limited we come to a conclusion here.

In [None]:
# Preview the first rows of the modeling frame instead of dumping all of it.
dfl_new.head()

In [None]:
# List the columns still present in the listings frame.
dfl.columns

In [None]:
# Number of listings per value of 'neighbourhood_group_cleansed'.
dfl['neighbourhood_group_cleansed'].value_counts()

In [None]:
# Number of listings per value of 'room_type'.
dfl['room_type'].value_counts()

In [28]:
import seaborn as sns
# Price of each listing against its row index, coloured by room type.
# The view is clipped to x in [0, 4000] and y in [0, 400].
plt.xlim(0, 4000)
plt.ylim(0, 400)
sns.scatterplot(x=dfl.index, y='price', hue='room_type', data=dfl)

In [29]:
import seaborn as sns
# Price of each listing against its row index, coloured by neighbourhood group.
# The view is clipped to x in [0, 4000] and y in [0, 400].
plt.xlim(0, 4000)
plt.ylim(0, 400)
sns.scatterplot(x=dfl.index, y='price', hue='neighbourhood_group_cleansed', data=dfl)

In [37]:
import seaborn as sns
# Price of each listing against its row index, coloured by property type.
# The view is clipped to x in [0, 4000] and y in [0, 200].
plt.xlim(0, 4000)
plt.ylim(0, 200)
sns.scatterplot(x=dfl.index, y='price', hue='property_type', data=dfl)

In [40]:
# Number of listings per value of 'property_type'.
dfl['property_type'].value_counts()

In [53]:
# Mean listing price for each zipcode.
dfl.groupby('zipcode')['price'].mean()

In [38]:
import seaborn as sns
# Price of each listing against its row index, coloured by bed type.
# The view is clipped to x in [0, 4000] and y in [0, 200].
plt.xlim(0, 4000)
plt.ylim(0, 200)
sns.scatterplot(x=dfl.index, y='price', hue='bed_type', data=dfl)

In [39]:
# Number of listings per value of 'bed_type'.
dfl['bed_type'].value_counts()

In [52]:
# Number of listings per zipcode.
dfl['zipcode'].value_counts()

In [43]:
import seaborn as sns
# Price of each listing against its row index, coloured by cancellation policy.
# The view is clipped to x in [0, 4000] and y in [0, 400].
plt.xlim(0, 4000)
plt.ylim(0, 400)
sns.scatterplot(x=dfl.index, y='price', hue='cancellation_policy', data=dfl)

After analysing the value counts of these categorical columns against price, we attempt to add them to the model as one-hot encoded features.

In [54]:
# Categorical listing attributes to add to the model.
cat_cols = ['neighbourhood_group_cleansed', 'property_type', 'room_type', 'bed_type', 'cancellation_policy', 'zipcode']

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Numeric listing attributes used as predictors of price.
features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included']

# .copy() gives dfl_new its own data, so the later in-place dropna on it does
# not trigger pandas' SettingWithCopyWarning.
dfl_new = dfl[features + ['price'] + cat_cols].copy()

In [55]:
# Histograms of the numeric columns of the modeling frame.
dfl_new.hist()

In [56]:
# Share of null entries per column of the modeling frame, as a fraction
# between 0 and 1 (the mean of a boolean mask equals sum / row count).
model_null_fraction = dfl_new.isna().mean()
miss_df = pd.DataFrame({"column": dfl_new.columns, "percent": model_null_fraction})
miss_df

In [69]:

# Drop (not impute) rows that lack any numeric feature or the target.
# Reassigning the result instead of using inplace=True avoids pandas'
# SettingWithCopyWarning on a frame derived from dfl and keeps the cell
# idempotent on re-run.
dfl_new = dfl_new.dropna(subset=features + ['price'])

# One-hot encode the categorical columns. With dummy_na=False a missing
# category gets no indicator column of its own (all of its dummies are 0).
X = pd.get_dummies(dfl_new[features + cat_cols], columns=cat_cols, dummy_na=False)
y = dfl_new['price']

from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)

# Four steps: instantiate, fit, predict, score.

# Instantiate. The `normalize` argument was removed in scikit-learn 1.2; for
# ordinary least squares, rescaling the features does not change the
# predictions, so the plain estimator is equivalent.
lm_model = LinearRegression()

# Fit
lm_model.fit(X_train, y_train)

# Predict
y_predict = lm_model.predict(X_test)

# Score (lm_model.score returns R^2 on the given split)
score = lm_model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_predict)
r2e = r2_score(y_test, y_predict)

In [70]:
# R^2 score on the test split for the model with one-hot encoded categoricals.
r2e



In [68]:
# Mean squared error on the test split for the model with one-hot encoded categoricals.
mse

**By including the categorical variables 'neighbourhood_group_cleansed', 'property_type' and 'room_type', we increased our model's R² value from 0.47 to 0.64 — an absolute gain of 0.17, or roughly 36% in relative terms.**

# Finding how price varies by time of the year and week

In [86]:
# Re-read the calendar file with 'date' parsed as datetimes, then discard
# every row that contains at least one missing value.
dfc = (
    pd.read_csv('/kaggle/input/seattle/calendar.csv', parse_dates=['date'])
    .dropna()
)

In [87]:
# Strip the "$" and "," characters from the price strings and convert to float.
# The dtype guard makes the cell idempotent: once the column is numeric, a
# re-run is a no-op instead of failing on float.replace.
if not pd.api.types.is_numeric_dtype(dfc['price']):
    dfc['price'] = (
        dfc['price']
        .str.replace(r'[$,]', '', regex=True)
        .astype(float)
    )

In [88]:
# Keep only the listings that have exactly 365 rows left in dfc, i.e. one row
# for every day after the rows with missing values were dropped. (The previous
# comment said "less than 20 data points", which did not match the == 365 test.)
# transform('size') broadcasts each listing's row count back onto its rows,
# replacing a per-group Python lambda with a vectorized comparison.
rows_per_listing = dfc.groupby('listing_id')['listing_id'].transform('size')
dfc_new = dfc[rows_per_listing == 365]

In [89]:
# Mean price per calendar date.
# 'price' is selected BEFORE aggregating: since pandas 2.0, calling mean() on
# the whole grouped frame raises on its non-numeric columns.
fig, axs = plt.subplots(figsize=(20, 4))
dfc.groupby('date')['price'].mean().plot(ax=axs)
axs.set(title='Mean price by date', xlabel='date', ylabel='mean price');

In [98]:

# Mean price per day of the week (pandas numbers weekdays 0 = Monday .. 6 = Sunday).
# 'price' is selected BEFORE aggregating: since pandas 2.0, calling mean() on
# the whole grouped frame raises on its non-numeric columns.
fig, axs = plt.subplots(figsize=(20, 4))
dfc.groupby(dfc['date'].dt.weekday)['price'].mean().plot(ax=axs)
axs.set(title='Mean price by weekday', xlabel='weekday (0 = Monday)', ylabel='mean price');