In [24]:
import numpy as np
import pandas as pd
import plotly.express as px # imports Plotly for data visualizations

# Basic Data Visualizations Using [Plotly](https://plotly.com/)


In [26]:
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=12dwk7mCGc633VLEx69IF8r_xyxxEbDTH' -O listings_project.csv

--2024-03-27 22:40:33--  https://drive.google.com/uc?export=download&id=12dwk7mCGc633VLEx69IF8r_xyxxEbDTH
Resolving drive.google.com (drive.google.com)... 142.251.2.101, 142.251.2.113, 142.251.2.138, ...
Connecting to drive.google.com (drive.google.com)|142.251.2.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=12dwk7mCGc633VLEx69IF8r_xyxxEbDTH&export=download [following]
--2024-03-27 22:40:33--  https://drive.usercontent.google.com/download?id=12dwk7mCGc633VLEx69IF8r_xyxxEbDTH&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.101.132, 2607:f8b0:4023:c0d::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.101.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 504529 (493K) [application/octet-stream]
Saving to: ‘listings_project.csv’


2024-03-27 22:40:34 (5.25 MB/s) - ‘listings_project.csv’ sav

In [27]:
df_list = pd.read_csv("listings_project.csv")

In [28]:
px.histogram(df_list, x="price_in_dollar", title="Histogram")

In [30]:
fig = px.bar(
    df_list,
    x="room_type",
    title="Bar chart",
    template="none",
)
fig.show()

In [29]:
fig = px.box(
    df_list, x="price_in_dollar", title="Boxplot", color="room_type", points="all"
)
fig.show()

In [31]:
fig = px.scatter(df_list, x="price_in_dollar", y="amenities")
fig.show()

In [17]:
# Explore your own Plotly visualizations here

# Scikit-Learn

In [18]:
# Displaying the data

In [32]:
df_list = df_list[["neighbourhood", "accommodates", "amenities", "room_type", "price_in_dollar"]]
df_list.head(2)

Unnamed: 0,neighbourhood,accommodates,amenities,room_type,price_in_dollar
0,Noord-Oost,2,5,Entire home/apt,105.0
1,Watergraafsmeer,6,14,Entire home/apt,279.0


## Spliting Data into training, evaluation, and testing datasets

In [33]:
import sklearn
from sklearn.model_selection import train_test_split

# Setting seed allows us to generate a random dataset split that
# is the same on every computer. Otherwise, every time you ran
# the split, you'd get a different dataset split.
SEED = 42

# features (X), label (y)
X, y = (
    df_list[["neighbourhood", "accommodates", "amenities", "room_type"]],
    df_list[["price_in_dollar"]],
)


def train_validation_test_split(
    X, y, train_ratio: float, validation_ratio: float, test_ratio: float
):
    # Split up dataset into train and test, of which we will split up test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=(1 - train_ratio), random_state=SEED
    )

    # Split up test into two (validation and test)
    X_val, X_test, y_val, y_test = train_test_split(
        X_test,
        y_test,
        test_size=(test_ratio / (test_ratio + validation_ratio)),
        random_state=SEED,
    )

    # Return the splits
    return X_train, X_val, X_test, y_train, y_val, y_test


# Splits according to 80/10/10 ratio
X_train, X_val, X_test, y_train, y_val, y_test = train_validation_test_split(
    X, y, 0.8, 0.1, 0.1
)

In [34]:
X_train.head(3)

Unnamed: 0,neighbourhood,accommodates,amenities,room_type
1808,IJburg - Zeeburgereiland,2,4,Entire home/apt
1426,Oud-Noord,5,1,Entire home/apt
964,Centrum-Oost,4,14,Private room


In [35]:
y_train.head(3)

Unnamed: 0,price_in_dollar
1808,120.0
1426,195.0
964,256.0


In [36]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Define how the encoding should work
oh_encoder = OneHotEncoder(  # Define one-hot encoding
    sparse_output=False,  # Sparse matrix doesn't work well with Pandas DataFrame
    dtype="int",  # Set type to integer
)

# Define which columns to transform
oh_enc_transformer = make_column_transformer(  # Define how to output columns
    (oh_encoder, ["room_type", "neighbourhood"]),  # Columns for one-hot encoding
    verbose_feature_names_out=False,  # Column names are "more concise"
    remainder="passthrough",  # All other columns should be left untouched
)

# Train (fit) the transformation on the training set
oh_encoded = oh_enc_transformer.fit(X_train)  # Change from category to number


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



In [37]:
# Transform the columns into one-hot encoding
X_train_oh_enc = oh_encoded.transform(X_train)

# Turn the encoded columns into a DataFrame
X_train = pd.DataFrame(
    X_train_oh_enc,  # Input the transformed dataset
    columns=oh_encoded.get_feature_names_out(),  # Set column names
    index=X_train.index,  # Keep index numbering from original df
)

# Show what the df looks like
X_train.head(2)

Unnamed: 0,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,neighbourhood_Bijlmer-Centrum,neighbourhood_Bijlmer-Oost,neighbourhood_Bos en Lommer,neighbourhood_Buitenveldert - Zuidas,neighbourhood_Centrum-Oost,neighbourhood_Centrum-West,...,neighbourhood_Oostelijk Havengebied - Indische Buurt,neighbourhood_Osdorp,neighbourhood_Oud-Noord,neighbourhood_Oud-Oost,neighbourhood_Slotervaart,neighbourhood_Watergraafsmeer,neighbourhood_Westerpark,neighbourhood_Zuid,accommodates,amenities
1808,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,4
1426,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,5,1


In [38]:
# Transform the columns into one-hot encoding
X_val_oh_enc = oh_encoded.transform(X_val)

# Turn the encoded columns into a df
X_val = pd.DataFrame(
    X_val_oh_enc,  # Input the transformed dataset
    columns=oh_encoded.get_feature_names_out(),  # Set column names
    index=X_val.index,
)

In [39]:
# Transform the columns into one-hot encoding
X_test_oh_enc = oh_encoded.transform(X_test)

# Turn the encoded columns into a df
X_test = pd.DataFrame(
    X_test_oh_enc,  # Input the transformed dataset
    columns=oh_encoded.get_feature_names_out(),  # Set column names
    index=X_test.index,  # Keep index numbering from original df
)

### SPLOM charts and visualizing correlated features

In [40]:
import numpy as np
import plotly.express as px

# Exclude "neighbourhood" columns for better visualization
X_train_filtered = X_train.filter(regex="^((?!neighbourhood).)*$")

# Combine X_train with Y_train
ndf_list = pd.concat([X_train_filtered, y_train], axis=1)

# Create a DataFrame that can be used as a heatmap
fig = px.imshow(
    ndf_list.corr().round(2),
    text_auto=True,
    aspect="auto",
    color_continuous_scale="rdylgn",
)
fig.show()

# Decision Trees

In [41]:
from sklearn.tree import DecisionTreeRegressor # Import algorithm


In [42]:
# Create a classifier algorithm + its predictions
model = DecisionTreeRegressor(random_state=SEED)  # Our algorithm

In [43]:
model.fit(  # Train it ("learn the material")
    X_train[["room_type_Private room"]],  # Subset of the columns used
    y_train,
)

In [44]:
from sklearn.metrics import r2_score

y_predict = model.predict(X_test[["room_type_Private room"]])  # Do a "final exam"

# Compare algorithms' "final exam" vs. expected
r2_score(y_predict, y_test).round(4)

-17.8515

In [45]:
from sklearn.metrics import r2_score

y_predict = model.predict(X_train[["room_type_Private room"]])  # Do a "practice exam"

# Compare algorithms' "practice exam" vs. expected
r2_score(y_predict, y_train).round(4)

-20.8077

In [46]:
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

# Create a classifier algorithm + its predictions
model = DecisionTreeRegressor(random_state=SEED)  # Our algorithm
model.fit(  # Train it ("learn the material")
    X_train[
        [
            "room_type_Entire home/apt",
            "room_type_Hotel room",
            "room_type_Private room",
            "room_type_Shared room",
            "accommodates",
        ]
    ],
    y_train,
)

y_predict = model.predict(
    X_train[
        [
            "room_type_Entire home/apt",
            "room_type_Hotel room",
            "room_type_Private room",
            "room_type_Shared room",
            "accommodates",
        ]
    ]
)

# Compare algorithms' "practice exam" vs. expected
r2_score(y_predict, y_train).round(4)

-1.5158

In [47]:
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

# Create a classifier algorithm + its predictions
model = DecisionTreeRegressor(random_state=SEED)  # Our algorithm
model.fit(  # Train it ("learn the material")
    X_train[
        [
            "room_type_Entire home/apt",
            "room_type_Hotel room",
            # "room_type_Private room",
            "room_type_Shared room",
            "accommodates",
        ]
    ],
    y_train,
)

y_predict = model.predict(
    X_train[
        [
            "room_type_Entire home/apt",
            "room_type_Hotel room",
            # "room_type_Private room",
            "room_type_Shared room",
            "accommodates",
        ]
    ]
)

# Compare algorithms' "practice exam" vs. expected
r2_score(y_predict, y_train).round(4)

-1.5158

In [48]:
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

# Create a classifier algorithm + its predictions
model = DecisionTreeRegressor(random_state=SEED)  # Our algorithm
model.fit(X_train, y_train)

y_predict = model.predict(X_train)

# Compare algorithms' "practice exam" vs. expected
r2_score(y_predict, y_train).round(4)

0.9031

In [49]:
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

# Create a classifier algorithm + its predictions
model = DecisionTreeRegressor(random_state=SEED)  # Our algorithm
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

# Compare algorithms' "final exam" vs. expected
r2_score(y_predict, y_test).round(4)

0.5587