# Listings Data Cleaning

In [175]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# pandas setting
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)


In [167]:
#code to read in csv.gz
listings = pd.read_csv('./data/listings.csv.gz', compression='gzip')
print(listings.shape)
# listings.head(1)

(6853, 75)


In [168]:
#remove columns that are not needed
listings.drop(listings.columns.difference(['listing_url', 'name', 'latitude', 'longitude', 'price','neighbourhood_cleansed', 'property_type', 'room_type', 'accommodates', 'bathrooms_text', 'availability_30', 'availability_60',	'availability_90', 'availability_365']), axis=1, inplace=True)
#create a new col with the number of bedrooms
listings['bedrooms'] = listings['name'].str.extract(r'(\d+) bedroom', expand=False).astype(float)
#replace Nan in bedrooms for the studios
listings['bedrooms'].fillna(0, inplace=True)
#remove units that are only a room and not the entire building
listings = listings[listings['property_type'].str.contains('entire', case=False, na=False)]
#remove digits from bathrooms_text
listings['bathrooms_text'] = listings['bathrooms_text'].str.replace(r'[^\d.]', '')
listings['bathrooms_text'] = listings['bathrooms_text'].replace('', np.nan)
listings['bathrooms_text'] = listings['bathrooms_text'].astype(float)
#update bad bathroom data based on current listing
listings.loc[listings.index == 3341, 'bathrooms_text'] = 1.5
listings.loc[listings.index == 4584, 'bathrooms_text'] = 1.5
listings.loc[listings.index == 4634, 'bathrooms_text'] = 1
listings.loc[listings.index == 3040, 'bathrooms_text'] = 0.5
#remove rows with 0 bathrooms (listing no longer exists)
listings = listings[listings['bathrooms_text'] != 0]
#change price to float
listings['price'] = listings['price'].str.replace(r'[^\d.]', '')
listings['price'] = listings['price'].astype(float)
#remove more cols
listings.drop(listings.columns.difference(['latitude', 'longitude', 'price','neighbourhood_cleansed', 'bathrooms_text', 'availability_30', 'availability_60',	'availability_90', 'availability_365']), axis=1, inplace=True)
# ohe neighbourhood
one_hot_encoded = pd.get_dummies(listings['neighbourhood_cleansed'], prefix='encoded_col_name')
listings = pd.concat([listings, one_hot_encoded], axis=1)
#drop nulls
listings = listings.dropna()
#details
print(listings.shape)
# print(listings.dtypes)
# listings.sample(5)

(4551, 48)


  listings['bathrooms_text'] = listings['bathrooms_text'].str.replace(r'[^\d.]', '')
  listings['price'] = listings['price'].str.replace(r'[^\d.]', '')


In [169]:
#train, val, test data
X = listings.drop(columns=['price'] + [col for col in listings.columns if col.startswith('availability')])
y = listings['price'] * (90 - listings['availability_90'])



In [170]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=[np.number]))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=[np.number]))

# # Concatenate the scaled numerical features with the one-hot encoded categorical features
# X_train_encoded = np.concatenate([X_train_scaled, X_train.select_dtypes(include=['uint8'])], axis=1)
# X_test_encoded = np.concatenate([X_test_scaled, X_test.select_dtypes(include=['uint8'])], axis=1)


In [177]:

# Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_encoded.shape[1],)),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='linear')
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
history = model.fit(X_train_encoded, y_train, epochs=5000, batch_size=len(X_train_encoded),
                    validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
test_loss = model.evaluate(X_test_encoded, y_test)
print("Test Loss:", test_loss)

Epoch 1/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 683ms/step - loss: 249687392.0000 - val_loss: 297507520.0000
Epoch 2/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 249684480.0000 - val_loss: 297504512.0000
Epoch 3/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 249681696.0000 - val_loss: 297501632.0000
Epoch 4/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 249678880.0000 - val_loss: 297498912.0000
Epoch 5/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 249676320.0000 - val_loss: 297496352.0000
Epoch 6/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 249673520.0000 - val_loss: 297493888.0000
Epoch 7/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 249671232.0000 - val_loss: 297491392.0000
Epoch 8/5000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [172]:
print(y)

5        3540.0
7          66.0
8         340.0
10      11125.0
14        645.0
         ...   
6846      992.0
6847     1221.0
6848    15570.0
6850        0.0
6851        0.0
Length: 4551, dtype: float64
