In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression,Lasso,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay,precision_recall_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer
from matplotlib import rcParams
from snownlp import SnowNLP
import re
# Figure / display configuration: select the 'Microsoft YaHei' sans-serif
# font, turn off the Unicode minus glyph on axes, and let pandas print every
# column of a frame.
# NOTE(review): 'Microsoft YaHei' is presumably only installed on Windows —
# confirm the font exists on the machine that renders the figures.
rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
pd.options.display.max_columns = None

In [2]:
# Load the training and test splits from CSV files in the working directory,
# then preview the first rows of the test frame.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
test.head()

Unnamed: 0,id,name,description,property_type,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,has_availability,availability_30,availability_60,availability_90,availability_365,instant_bookable,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,reviews
0,3917,"Beautiful 3 bedroom House in Woodside, Queens",Beautiful 3 bedroom house in the heart of Wood...,Entire home,Woodside,Queens,40.74462,-73.90452,2013-05-07 00:00:00,,,,False,1.0,12.0,"['email', 'phone']",True,True,1,1,0,0,Entire home/apt,6,1.5,1.5 baths,3.0,4.0,"[""Dishwasher"", ""Dining table"", ""Keypad"", ""Smok...",True,29,59,89,89,False,30,90,28,2,0,2022-10-17 00:00:00,2023-09-17 00:00:00,5.0,5.0,4.96,5.0,5.0,4.96,4.96,1.22,Home was perfect and exactly what we needed to...
1,1885,"Queens HDTV Room 13 mins to Manhattan, 3 bath ...",Bright bedroom in the best part of Long Island...,Private room in rental unit,Long Island City,Queens,40.753407,-73.934995,2012-08-11 00:00:00,within an hour,99.0,23.0,False,727.0,1336.0,"['email', 'phone']",True,True,719,0,719,0,Private room,1,3.0,3 shared baths,4.0,1.0,"[""Kitchen"", ""Hot water"", ""Dedicated workspace""...",True,29,59,89,364,False,30,365,0,0,0,,,,,,,,,,,
2,1305,Lovely Bright & Spacious Loft in Brooklyn.,"Our loft is spacious and calm, filled with nat...",Entire loft,Crown Heights,Brooklyn,40.67709,-73.94381,2012-05-14 00:00:00,,,,False,1.0,1.0,"['email', 'phone']",True,True,1,1,0,0,Entire home/apt,2,1.0,1 bath,1.0,1.0,"[""Kitchen"", ""Hair dryer"", ""Iron"", ""Heating"", ""...",True,29,59,89,89,False,30,365,31,0,0,2021-10-17 00:00:00,2022-10-31 00:00:00,4.94,5.0,4.9,4.9,4.97,4.68,4.9,0.88,I would give this place 10 stars if it were op...
3,19328,Classic Petite Room SB #17 - Furnished Studio,"Extended stay hotel , all rooms are fully furn...",Entire rental unit,Upper West Side,Manhattan,40.79576,-73.97157,2021-04-05 00:00:00,within a day,70.0,37.0,False,36.0,79.0,"['email', 'phone']",True,True,36,18,18,0,Entire home/apt,1,3.0,3 baths,0.0,1.0,"[""Kitchen"", ""Hair dryer"", ""Hot water"", ""Dedica...",True,0,0,0,157,False,30,500,4,3,1,2023-05-29 00:00:00,2024-08-21 00:00:00,4.75,4.5,4.75,5.0,4.5,5.0,4.5,0.26,The front-desk staff was absolutely wonderful ...
4,16511,Petit chalet with secret garden,Small loft type of apartment on ground floor w...,Entire rental unit,Williamsburg,Brooklyn,40.71359,-73.9554,2014-10-07 00:00:00,within a few hours,100.0,75.0,False,1.0,1.0,"['email', 'phone']",True,True,1,1,0,0,Entire home/apt,2,1.0,1 bath,0.0,1.0,"[""Coffee maker: pour-over coffee"", ""Cleaning p...",True,3,12,24,113,False,30,60,157,2,1,2015-02-22 00:00:00,2024-08-31 00:00:00,4.82,4.89,4.69,4.97,4.96,4.94,4.73,1.35,I really enjoyed staying at Kanae's place. It ...


In [3]:
# Number of missing values in each training column.
train.isna().sum()

name                                               0
description                                      387
property_type                                      0
price                                              0
neighbourhood_cleansed                             0
neighbourhood_group_cleansed                       0
latitude                                           0
longitude                                          0
host_since                                         0
host_response_time                              2203
host_response_rate                              2203
host_acceptance_rate                            2053
host_is_superhost                                251
host_listings_count                                0
host_total_listings_count                          0
host_verifications                                 0
host_has_profile_pic                               0
host_identity_verified                             0
calculated_host_listings_count                

In [4]:
class SentimentScoreTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that scores each review text with SnowNLP.

    ``transform`` reads the ``'reviews'`` column of ``X`` and returns a
    one-column DataFrame named ``sentiment_score``. Missing or empty reviews
    receive the fixed score 0.5.

    NOTE(review): SnowNLP is, as far as I know, built for Chinese text while
    the sample reviews shown in this notebook are English — confirm the scores
    are meaningful for this data.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a DataFrame holding one ``sentiment_score`` per row of ``X``."""
        def score(text):
            # Empty text (this includes filled-in missing values) -> 0.5.
            if not text:
                return 0.5
            return SnowNLP(text).sentiments

        review_text = X['reviews'].fillna('')
        return review_text.apply(score).to_frame(name='sentiment_score')

In [5]:
class DistanceFeatureTransformer(BaseEstimator, TransformerMixin):
    """Add great-circle distances (in km) from every row to fixed reference points.

    Parameters
    ----------
    locations : dict
        Maps a location name to a ``(latitude, longitude)`` pair in degrees.
        One output column ``distance_to_<name>`` is produced per entry.
    """

    def __init__(self, locations):
        self.locations = locations

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Haversine distance in kilometres between two points given in degrees.

        Works element-wise, so scalars and NumPy arrays (broadcast against
        each other) are both accepted.
        """
        # Convert to radians
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
        # Rounding can push `a` marginally outside [0, 1], which would make
        # arcsin return NaN for (near-)antipodal points.
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arcsin(np.sqrt(a))
        r = 6371  # Earth radius in kilometres
        return c * r

    def transform(self, X):
        """Return a DataFrame (same index as ``X``) with one distance column per location.

        Reads the ``'latitude'`` and ``'longitude'`` columns of ``X``.
        """
        # The haversine formula is vectorised, so whole columns are passed at
        # once instead of calling it row by row.
        lat = X['latitude'].to_numpy(dtype=float)
        lon = X['longitude'].to_numpy(dtype=float)
        distances = {
            f'distance_to_{location_name}': self.haversine_distance(lat, lon, lat2, lon2)
            for location_name, (lat2, lon2) in self.locations.items()
        }
        return pd.DataFrame(distances, index=X.index)

    def get_feature_names(self):
        """Names of the produced columns, in output order."""
        return [f'distance_to_{name}' for name in self.locations.keys()]

    def get_feature_names_out(self, input_features=None):
        """Output column names under the method name scikit-learn looks up.

        ``input_features`` is accepted for API compatibility and ignored.
        """
        return np.asarray(self.get_feature_names(), dtype=object)

In [6]:
class ListTransformer(BaseEstimator, TransformerMixin):
    """Multi-hot encode columns whose cells hold a collection of tokens.

    A cell may be an actual list/tuple/set or the string form of one (e.g.
    ``"['email', 'phone']"`` as read back from a CSV file). For every token
    seen during ``fit`` a 0/1 column ``<column>_<token>`` is produced and the
    source column is removed.

    Parameters
    ----------
    columns : list of str
        Names of the list-valued columns to encode.

    Attributes
    ----------
    unique_values : dict
        Maps each column name to the set of tokens seen during ``fit``.
    """

    def __init__(self, columns):
        self.columns = columns
        self.unique_values = {}

    @staticmethod
    def _as_list(cell):
        """Normalise one cell to a list of tokens; missing cells give ``[]``."""
        import ast  # local import keeps this block self-contained

        if isinstance(cell, str):
            # CSV round-trips store lists as their text form; parse it back.
            # Iterating the raw string would yield single characters instead
            # of tokens.
            try:
                parsed = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                return [cell]  # plain text: treat the whole string as one token
            if isinstance(parsed, (list, tuple, set, frozenset)):
                return list(parsed)
            return [cell]
        if isinstance(cell, (list, tuple, set, frozenset, np.ndarray)):
            return list(cell)
        if cell is None or pd.isna(cell):
            return []
        return [cell]

    def fit(self, X, y=None):
        """Collect the set of tokens occurring in each configured column."""
        learned = {}
        for col in self.columns:
            # Gather all unique tokens in the column
            tokens = set()
            for cell in X[col]:
                tokens.update(self._as_list(cell))
            learned[col] = tokens
        self.unique_values = learned
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each list column replaced by 0/1 indicator columns.

        Indicator columns are appended after the untouched columns, grouped by
        source column and sorted by token so the layout is the same on every run.
        """
        untouched = X.drop(columns=list(self.columns))
        indicators = {}
        for col in self.columns:
            # Parse every cell once, then test membership per token.
            row_tokens = [set(self._as_list(cell)) for cell in X[col]]
            for value in sorted(self.unique_values[col], key=str):
                indicators[f"{col}_{value}"] = [int(value in tokens) for tokens in row_tokens]
        # Build all indicator columns in one go rather than inserting them
        # one at a time into the frame.
        indicator_frame = pd.DataFrame(indicators, index=X.index)
        return pd.concat([untouched, indicator_frame], axis=1)

In [7]:
class LuxuryFeatureTransformer(BaseEstimator, TransformerMixin):
    """Flag listings whose name or description mentions a luxury keyword.

    Parameters
    ----------
    luxury_keywords : list of str, optional
        Alternatives joined with ``|`` into one case-insensitive regular
        expression. A built-in five-word list is used when omitted.
    """

    def __init__(self, luxury_keywords=None):
        self.luxury_keywords = (
            ['luxury', 'spacious', 'elegant', 'modern', 'exclusive']
            if luxury_keywords is None
            else luxury_keywords
        )

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame ``is_luxury`` holding 1 or 0 per row.

        Reads the ``'name'`` and ``'description'`` columns; missing values are
        treated as empty strings before the two texts are joined with a space.
        """
        keyword_re = re.compile('|'.join(self.luxury_keywords), re.IGNORECASE)
        combined_text = X['name'].fillna('') + ' ' + X['description'].fillna('')

        def flag(text):
            return 1 if keyword_re.search(text) else 0

        return combined_text.apply(flag).to_frame(name='is_luxury')

class AmenitiesTransformer(BaseEstimator, TransformerMixin):
    """Derive simple numeric features from the ``'amenities'`` column.

    Produces ``num_amenities`` (number of entries) plus one 0/1 column
    ``has_<amenity>`` per luxury amenity. A luxury amenity is flagged only when
    some entry equals it exactly, ignoring case.

    Parameters
    ----------
    luxury_amenities : list of str, optional
        Amenity names to flag. A built-in five-item list is used when omitted.
    """

    def __init__(self, luxury_amenities=None):
        if luxury_amenities is None:
            self.luxury_amenities = ['pool', 'jacuzzi', 'gym', 'sauna', 'hot tub']
        else:
            self.luxury_amenities = luxury_amenities

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    @staticmethod
    def _parse_amenities(raw):
        """Turn one cell into a list of amenity entries; missing cells give ``[]``.

        Raises ``ValueError``/``SyntaxError`` if a string cell is not a valid
        Python literal.
        """
        import ast  # local import keeps this block self-contained

        if isinstance(raw, str):
            # literal_eval only accepts Python literals, so text coming from a
            # data file can never execute code (unlike eval).
            raw = ast.literal_eval(raw)
        if isinstance(raw, (list, tuple, set, frozenset)):
            return list(raw)
        return []  # NaN / None / anything that is not a collection

    def transform(self, X):
        """Return a DataFrame (same index as ``X``) with the amenity features."""
        # Parse each cell once and reuse the result for every feature.
        parsed = [self._parse_amenities(cell) for cell in X['amenities']]
        lowered = [{str(item).lower() for item in items} for items in parsed]

        amenities_df = pd.DataFrame(index=X.index)
        # Count total amenities
        amenities_df['num_amenities'] = [len(items) for items in parsed]
        # Detect luxury amenities
        for amenity in self.luxury_amenities:
            wanted = amenity.lower()
            amenities_df[f'has_{amenity.replace(" ", "_")}'] = [
                1 if wanted in names else 0 for names in lowered
            ]
        return amenities_df

# Wrap each text-derived feature transformer in its own single-step Pipeline.
luxury_transformer = Pipeline([('luxury_features', LuxuryFeatureTransformer())])
amenities_transformer = Pipeline([('amenities_features', AmenitiesTransformer())])

In [8]:
class PositiveReviewTransformer(BaseEstimator, TransformerMixin):
    """Flag rows whose review text contains at least one positive word.

    Parameters
    ----------
    positive_words : list of str, optional
        Alternatives joined with ``|`` into one case-insensitive regular
        expression. A built-in eight-word list is used when omitted.
    """

    def __init__(self, positive_words=None):
        self.positive_words = (
            ['great', 'amazing', 'excellent', 'fantastic', 'wonderful', 'perfect', 'clean', 'comfortable']
            if positive_words is None
            else positive_words
        )

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame ``is_positive_review`` holding 1 or 0 per row.

        Reads the ``'reviews'`` column; missing reviews are treated as empty text.
        """
        word_re = re.compile('|'.join(self.positive_words), re.IGNORECASE)

        def flag(text):
            return 1 if word_re.search(text) else 0

        review_text = X['reviews'].fillna('')
        return review_text.apply(flag).to_frame(name='is_positive_review')

# Single-step Pipeline around the positive-review flag.
positive_review_transformer = Pipeline([('positive_reviews', PositiveReviewTransformer())])


In [9]:
# Column groups handed to the preprocessing transformers.

numeric_features = [
    # host statistics
    'host_response_rate',
    'host_acceptance_rate',
    'host_listings_count',
    'host_total_listings_count',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    # listing size
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # availability and stay limits
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'minimum_nights',
    'maximum_nights',
    # review counts and scores
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
]

# 'amenities' is not part of this list (it was commented out).
# NOTE(review): 'host_verifications' appears both here and in list_features —
# confirm the double encoding is intended.
categorical_features = [
    'host_response_time',
    'property_type',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'host_verifications',
    'room_type',
]

list_features = [
    'host_verifications',
    'amenities',
]

geo_features = [
    'latitude',
    'longitude',
]

# Reference points as (latitude, longitude) in degrees, one per borough name.
locations = dict(
    manhattan=(40.7831, -73.9712),
    brooklyn=(40.6782, -73.9442),
    queens=(40.7282, -73.7949),
    bronx=(40.8448, -73.8648),
    staten_island=(40.5795, -74.1502),
)

review_features = ['reviews']

In [10]:
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# List-valued columns get one indicator column per distinct entry.
list_transformer = ListTransformer(columns=list_features)

# Distances from each listing to the reference points in `locations`.
geo_transformer = Pipeline([
    ('distance_features', DistanceFeatureTransformer(locations=locations)),
])

# NOTE(review): sentiment_transformer is built here but is not one of the
# entries passed to `preprocessor` below — confirm whether that is intended.
sentiment_transformer = Pipeline([
    ('sentiment_score', SentimentScoreTransformer()),
])

# Days between 'host_since' and the fixed reference date 2025-01-01, returned
# as an (n_rows, 1) array.
calculate_host_duration = FunctionTransformer(
    lambda frame: (
        pd.to_datetime('2025-01-01') - pd.to_datetime(frame['host_since'])
    ).dt.days.values.reshape(-1, 1)
)

# Route each column group to its transformer; columns not named in any entry
# are passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_transformer, list_features),
        ('geo', geo_transformer, geo_features),
        ('luxury', luxury_transformer, ['name', 'description']),
        ('amenities', amenities_transformer, ['amenities']),
        ('positive_reviews', positive_review_transformer, ['reviews']),
        ('host_duration', calculate_host_duration, ['host_since']),
    ],
)

In [11]:
# Separate the target from the features. The dropped helper columns are each
# listed exactly once ('first_review' used to be repeated in this list).
target = 'price'
X = train.drop(columns=[target, 'bathrooms_text', 'first_review', 'last_review'])
y = train[target]
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Dropout, Dense, GRU, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

2024-11-22 13:22:53.225731: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Rebuild features / target from `train` and redo the 80/20 split with seed 42,
# naming the held-out part X_test / y_test.
target = 'price'
y = train[target]
X = train.drop(columns=[target, 'bathrooms_text', 'first_review', 'last_review'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Labels of the columns whose dtype is object or category.
non_numeric_columns = X_train.select_dtypes(include=['object', 'category']).columns



In [14]:
# Fit a dense one-hot encoder on the training split's non-numeric columns and
# reuse it for the test split; categories not seen during fit are ignored.
# NOTE(review): non_numeric_columns presumably includes free-text columns such
# as name / description / reviews, which would make this dense matrix very
# wide — possibly related to the GPU out-of-memory error recorded further down.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_encoded, X_test_encoded = (
    encoder.fit_transform(X_train[non_numeric_columns]),
    encoder.transform(X_test[non_numeric_columns]),
)


In [15]:

# Keep only the columns that were not one-hot encoded.
X_train_numeric, X_test_numeric = (
    split.drop(columns=non_numeric_columns) for split in (X_train, X_test)
)

# Join the remaining columns and the one-hot block side by side.
X_train_processed = np.concatenate([X_train_numeric, X_train_encoded], axis=1)
X_test_processed = np.concatenate([X_test_numeric, X_test_encoded], axis=1)



In [None]:
# Standardise every feature using statistics fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features),
# the layout used as RNN input below.
X_train_reshaped = np.expand_dims(X_train_scaled, axis=1)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=1)

In [29]:
# Find rows with NaN values
nan_rows = np.where(np.isnan(X_train_reshaped).any(axis=(1, 2)))[0]
print(f"Rows with NaN values: {nan_rows}")

# Inspect those rows
print(X_train_reshaped[nan_rows])

# Replace NaN with the mean of each feature, computed on the training data only.
# Rows are imputed rather than dropped so no training sample is lost, and the
# same training means are applied to the test tensor so that evaluation never
# sees NaN inputs.
feature_means = np.nanmean(X_train_reshaped, axis=(0, 1))
# A feature that is NaN in every training row has no mean; fall back to 0.
feature_means = np.nan_to_num(feature_means, nan=0.0)
X_train_reshaped = np.where(np.isnan(X_train_reshaped), feature_means, X_train_reshaped)
X_test_reshaped = np.where(np.isnan(X_test_reshaped), feature_means, X_test_reshaped)

# Plain array of labels, still aligned row-for-row with X_train_reshaped.
y_train = np.asarray(y_train)

print(np.isnan(X_train_reshaped).any())  # Should be False
assert not np.isnan(X_test_reshaped).any(), "test tensor still contains NaN"
assert len(y_train) == X_train_reshaped.shape[0], "features and labels are misaligned"


Rows with NaN values: [    2     5     8 ... 12550 12551 12554]
[[[-2.25027959  2.49849685         nan ... -0.00892466 -0.00892466
   -0.62796216]]

 [[-0.78802838  2.84341386         nan ... -0.00892466 -0.00892466
    1.59245263]]

 [[ 1.29170417 -0.39851388  0.20895958 ... -0.00892466 -0.00892466
    1.59245263]]

 ...

 [[-0.07910282 -0.23942382  0.38713263 ... -0.00892466 -0.00892466
    1.59245263]]

 [[-0.32601248 -1.19565062  0.38713263 ... -0.00892466 -0.00892466
    1.59245263]]

 [[ 1.04677473 -0.48246798  0.20895958 ... -0.00892466 -0.00892466
    1.59245263]]]
False


In [36]:


# Two stacked LSTM layers with dropout, followed by a small dense head that
# outputs 6 softmax probabilities. The input shape (timesteps, features) is
# read from the reshaped test tensor.
# NOTE(review): the recorded run of this cell stopped with a GPU out-of-memory
# error while copying an input tensor (~1.15 GiB); the feature width probably
# has to shrink before this can train — confirm.
model = Sequential([
    LSTM(50, input_shape=X_test_reshaped.shape[1:], return_sequences=True),
    Dropout(0.3),
    LSTM(30, return_sequences=False),
    Dropout(0.3),
    Dense(10, activation='relu'),
    Dense(6, activation='softmax'),  # Adjust to 6 classes
])

# Integer class labels -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Train, using 10% of the training data for validation.
history = model.fit(
    X_train_reshaped,
    y_train,
    validation_split=0.1,
    batch_size=200,
    epochs=50,
    callbacks=[early_stopping],
)

# Score the trained model on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


2024-11-22 14:02:24.294774: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.15GiB (rounded to 1237207552)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-11-22 14:02:24.294818: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-11-22 14:02:24.294830: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 109, Chunks in use: 109. 27.2KiB allocated for chunks. 27.2KiB in use in bin. 3.1KiB client-requested in use in bin.
2024-11-22 14:02:24.294837: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 17, Chunks in use: 16. 8.8KiB allocated for chunks. 8.0KiB in use in bin. 7.5KiB client-requested in use in bin.
2024-11-22 14:02:24.294843: I tensorflow/tsl/framew

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

llocated for chunks. 147.0KiB in use in bin. 91.2KiB client-requested in use in bin.
2024-11-22 14:02:24.294885: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (131072): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-11-22 14:02:24.294890: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (262144): 	Total Chunks: 1, Chunks in use: 0. 362.5KiB allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-11-22 14:02:24.294895: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (524288): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-11-22 14:02:24.294900: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (1048576): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2024-11-22 14:02:24.294904: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (2

In [None]:
# Error metrics of the predictions `y_pred_rf` against the validation labels.
# NOTE(review): `y_pred_rf` is not defined in any cell visible in this
# notebook — it presumably comes from a removed cell; confirm before re-running.
mse, mae, r2 = (
    score(y_valid, y_pred_rf)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)

In [22]:
# Report the four metrics, one per line.
print(
    f"MSE: {mse:.5f}",
    f"RMSE: {rmse:.5f}",
    f"MAE: {mae:.5f}",
    f"R²: {r2:.2f}",
    sep="\n",
)

MSE: 0.58376
RMSE: 0.76404
MAE: 0.45510
R²: 0.80


In [35]:
# Quick look at the test labels: shape, dtype and the distinct values present.
print(
    f"y_test shape: {y_test.shape}",
    f"y_test dtype: {y_test.dtype}",
    f"Unique values in y_test: {np.unique(y_test)}",
    sep="\n",
)


y_test shape: (3140,)
y_test dtype: int64
Unique values in y_test: [0 1 2 3 4 5]
