In [156]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

## Loading and Cleaning up the data set 

In [157]:
# Read the Airbnb training CSV (the path is relative to the working directory)
# and preview the first ten rows.
file_path = "AirBnB_data/train.csv"
airbnb_df = pd.read_csv(filepath_or_buffer=file_path)
airbnb_df.head(n=10)

Unnamed: 0,id,log_price,property_type,room_type,amenities,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,latitude,longitude,name,neighbourhood,number_of_reviews,review_scores_rating,thumbnail_url,zipcode,bedrooms,beds
0,6901257,5.010635,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",3,1.0,Real Bed,strict,True,...,40.696524,-73.991617,Beautiful brownstone 1-bedroom,Brooklyn Heights,2,100.0,https://a0.muscache.com/im/pictures/6d7cbbf7-c...,11201.0,1.0,1.0
1,6304928,5.129899,Apartment,Entire home/apt,"{""Wireless Internet"",""Air conditioning"",Kitche...",7,1.0,Real Bed,strict,True,...,40.766115,-73.98904,Superb 3BR Apt Located Near Times Square,Hell's Kitchen,6,93.0,https://a0.muscache.com/im/pictures/348a55fe-4...,10019.0,3.0,3.0
2,7919400,4.976734,Apartment,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",5,1.0,Real Bed,moderate,True,...,40.80811,-73.943756,The Garden Oasis,Harlem,10,92.0,https://a0.muscache.com/im/pictures/6fae5362-9...,10027.0,1.0,3.0
3,13418779,6.620073,House,Entire home/apt,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",4,1.0,Real Bed,flexible,True,...,37.772004,-122.431619,Beautiful Flat in the Heart of SF!,Lower Haight,0,,https://a0.muscache.com/im/pictures/72208dad-9...,94117.0,2.0,2.0
4,3808709,4.744932,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",2,1.0,Real Bed,moderate,True,...,38.925627,-77.034596,Great studio in midtown DC,Columbia Heights,4,40.0,,20009.0,0.0,1.0
5,12422935,4.442651,Apartment,Private room,"{TV,""Wireless Internet"",Heating,""Smoke detecto...",2,1.0,Real Bed,strict,True,...,37.753164,-122.429526,Comfort Suite San Francisco,Noe Valley,3,100.0,https://a0.muscache.com/im/pictures/82509143-4...,94131.0,1.0,1.0
6,11825529,4.418841,Apartment,Entire home/apt,"{TV,Internet,""Wireless Internet"",""Air conditio...",3,1.0,Real Bed,moderate,True,...,33.980454,-118.462821,Beach Town Studio and Parking!!!11h,,15,97.0,https://a0.muscache.com/im/pictures/4c920c60-4...,90292.0,1.0,1.0
7,13971273,4.787492,Condominium,Entire home/apt,"{TV,""Cable TV"",""Wireless Internet"",""Wheelchair...",2,1.0,Real Bed,moderate,True,...,34.046737,-118.260439,"Near LA Live, Staple's. Starbucks inside. OWN ...",Downtown,9,93.0,https://a0.muscache.com/im/pictures/61bd05d5-c...,90015.0,1.0,1.0
8,180792,4.787492,House,Private room,"{TV,""Cable TV"",""Wireless Internet"",""Pets live ...",2,1.0,Real Bed,moderate,True,...,37.781128,-122.501095,Cozy Garden Studio - Private Entry,Richmond District,159,99.0,https://a0.muscache.com/im/pictures/0ed6c128-7...,94121.0,1.0,1.0
9,5385260,3.583519,House,Private room,"{""Wireless Internet"",""Air conditioning"",Kitche...",2,1.0,Real Bed,moderate,True,...,33.992563,-117.895997,No.7 Queen Size Cozy Room 舒适大床房,,2,90.0,https://a0.muscache.com/im/pictures/8d2f08ce-b...,91748.0,1.0,1.0


In [158]:
# Dimensions of the freshly loaded data set as (rows, columns)
airbnb_df.shape

(74111, 29)

In [159]:
# First iteration: keep only "accommodates", "bathrooms" and "city" as features.
# "log_price" is the target variable.
selected_columns = ["log_price", "accommodates", "bathrooms", "city"]
airbnb_df = airbnb_df[selected_columns]

In [160]:
# Discard every row that has a missing value in any remaining column
airbnb_df = airbnb_df.dropna(axis=0, how="any")

In [161]:
# Dimensions of the data set as (rows, columns) at this point in the notebook
airbnb_df.shape

(73911, 4)

In [162]:
# Preview the first five rows (head() defaults to n=5)
airbnb_df.head()

Unnamed: 0,log_price,accommodates,bathrooms,city
0,5.010635,3,1.0,NYC
1,5.129899,7,1.0,NYC
2,4.976734,5,1.0,NYC
3,6.620073,4,1.0,SF
4,4.744932,2,1.0,DC


In [163]:
# Number of listings (rows) per distinct value of the "city" column
airbnb_df["city"].value_counts()

NYC        32250
LA         22387
SF          6417
DC          5679
Chicago     3716
Boston      3462
Name: city, dtype: int64

In [164]:
# Map each city name to an integer code and store it in a new "location" column.
# NOTE(review): integer codes imply an ordering between cities that a linear
# model will treat as meaningful; one-hot encoding may be a better fit -- confirm.
LE = LabelEncoder()
city_codes = LE.fit_transform(airbnb_df["city"])
airbnb_df["location"] = city_codes

In [165]:
# The text "city" column is no longer needed now that "location" holds its code
airbnb_df = airbnb_df.drop(columns="city")
airbnb_df.head()

Unnamed: 0,log_price,accommodates,bathrooms,location
0,5.010635,3,1.0,4
1,5.129899,7,1.0,4
2,4.976734,5,1.0,4
3,6.620073,4,1.0,5
4,4.744932,2,1.0,2


In [166]:
# Show the dtype of every remaining column
airbnb_df.dtypes

log_price       float64
accommodates      int64
bathrooms       float64
location          int64
dtype: object

In [167]:
# Split the frame into the target vector (y) and the feature matrix (X),
# both as plain NumPy arrays.
target_column = "log_price"
y = airbnb_df[target_column].values
X = airbnb_df.drop(columns=[target_column]).values

In [168]:
# Hold out part of the data for testing; random_state fixes the shuffle so the
# split is reproducible.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

## Train the model and check the prediction error

In [169]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# fit() returns the scaler itself, so X_scaler is the same fitted object
X_scaler = scaler
X_test_scaled = X_scaler.transform(X_test)

In [170]:
# Build and train a linear regression on the scaled training features
# (fit() returns the fitted estimator, so construction and fitting can be chained)
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict log_price for the scaled test features and report the prediction count
y_pred = model.predict(X_test_scaled)
print(y_pred.shape)

(18478,)


In [171]:
# Show the fitted coefficients (one per feature) followed by the intercept,
# each on its own line
print(model.coef_, model.intercept_, sep="\n")

[0.37991242 0.07118778 0.06725865]
4.7819995927704095


In [173]:
# Tabulate predictions next to the true values; Error = Predicted - Actual
error_df = pd.DataFrame({"Predicted": y_pred, "Actual": y_test})
error_df["Error"] = error_df["Predicted"] - error_df["Actual"]
error_df.head(10)

Unnamed: 0,Predicted,Actual,Error
0,4.53319,4.304065,0.229125
1,4.53319,4.077537,0.455652
2,4.766113,5.273,-0.506886
3,4.885112,4.382027,0.503086
4,4.950829,7.090077,-2.139248
5,4.652188,4.248495,0.403693
6,4.647115,5.857933,-1.210818
7,5.012865,5.834811,-0.821945
8,5.003414,5.164786,-0.161372
9,4.884416,5.273,-0.388584


In [179]:
# Scatter the absolute prediction error against the row position in error_df
fig = px.scatter(
    x=error_df.index,
    y=error_df["Error"].abs(),
    title="Absolute Error for Each Prediction",
)
# plotly.express has no xlabel()/ylabel() helpers; axis titles are set on the
# figure layout instead so the plot is readable on its own.
fig.update_layout(xaxis_title="Data row", yaxis_title="Absolute Error")
fig.show()

In [180]:
# Shape of the unscaled training feature matrix as (rows, features)
X_train.shape

(55433, 3)

In [181]:
# Wrap the unscaled training features in a labelled DataFrame for plotting
feature_names = ["accommodates", "bathrooms", "location"]
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_train_df.shape

(55433, 3)

In [182]:
# X_train_df was already built in the previous cell; re-check its shape here
# rather than constructing an identical DataFrame a second time.
X_train_df.shape

(55433, 3)

In [183]:
# Exploratory 3-D scatter of the three training features, coloured by y_train.
# This doesn't carry much meaning ("location" is just an integer city code);
# it was only a quick sanity check.
axis_columns = dict(x="accommodates", y="bathrooms", z="location")
fig = px.scatter_3d(X_train_df, color=y_train, **axis_columns)
fig.show()