In [21]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay, PrecisionRecallDisplay,precision_recall_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Matplotlib: use a Chinese-capable sans-serif face and keep the plain ASCII
# hyphen for negative tick labels instead of the Unicode minus sign.
rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
# pandas: never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Read the labelled training split and the unlabelled test split from Data/.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('Data/train.csv', 'Data/test.csv')
)
# Preview the first rows of the test split (last expression -> cell output).
test.head()

Unnamed: 0,id,name,description,property_type,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,has_availability,availability_30,availability_60,availability_90,availability_365,instant_bookable,minimum_nights,maximum_nights,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,reviews
0,3917,"Beautiful 3 bedroom House in Woodside, Queens",Beautiful 3 bedroom house in the heart of Wood...,Entire home,Woodside,Queens,40.74462,-73.90452,2013-05-07 00:00:00,,,,False,1.0,12.0,"['email', 'phone']",True,True,1,1,0,0,Entire home/apt,6,1.5,1.5 baths,3.0,4.0,"[""Dishwasher"", ""Dining table"", ""Keypad"", ""Smok...",True,29,59,89,89,False,30,90,28,2,0,2022-10-17 00:00:00,2023-09-17 00:00:00,5.0,5.0,4.96,5.0,5.0,4.96,4.96,1.22,Home was perfect and exactly what we needed to...
1,1885,"Queens HDTV Room 13 mins to Manhattan, 3 bath ...",Bright bedroom in the best part of Long Island...,Private room in rental unit,Long Island City,Queens,40.753407,-73.934995,2012-08-11 00:00:00,within an hour,99.0,23.0,False,727.0,1336.0,"['email', 'phone']",True,True,719,0,719,0,Private room,1,3.0,3 shared baths,4.0,1.0,"[""Kitchen"", ""Hot water"", ""Dedicated workspace""...",True,29,59,89,364,False,30,365,0,0,0,,,,,,,,,,,
2,1305,Lovely Bright & Spacious Loft in Brooklyn.,"Our loft is spacious and calm, filled with nat...",Entire loft,Crown Heights,Brooklyn,40.67709,-73.94381,2012-05-14 00:00:00,,,,False,1.0,1.0,"['email', 'phone']",True,True,1,1,0,0,Entire home/apt,2,1.0,1 bath,1.0,1.0,"[""Kitchen"", ""Hair dryer"", ""Iron"", ""Heating"", ""...",True,29,59,89,89,False,30,365,31,0,0,2021-10-17 00:00:00,2022-10-31 00:00:00,4.94,5.0,4.9,4.9,4.97,4.68,4.9,0.88,I would give this place 10 stars if it were op...
3,19328,Classic Petite Room SB #17 - Furnished Studio,"Extended stay hotel , all rooms are fully furn...",Entire rental unit,Upper West Side,Manhattan,40.79576,-73.97157,2021-04-05 00:00:00,within a day,70.0,37.0,False,36.0,79.0,"['email', 'phone']",True,True,36,18,18,0,Entire home/apt,1,3.0,3 baths,0.0,1.0,"[""Kitchen"", ""Hair dryer"", ""Hot water"", ""Dedica...",True,0,0,0,157,False,30,500,4,3,1,2023-05-29 00:00:00,2024-08-21 00:00:00,4.75,4.5,4.75,5.0,4.5,5.0,4.5,0.26,The front-desk staff was absolutely wonderful ...
4,16511,Petit chalet with secret garden,Small loft type of apartment on ground floor w...,Entire rental unit,Williamsburg,Brooklyn,40.71359,-73.9554,2014-10-07 00:00:00,within a few hours,100.0,75.0,False,1.0,1.0,"['email', 'phone']",True,True,1,1,0,0,Entire home/apt,2,1.0,1 bath,0.0,1.0,"[""Coffee maker: pour-over coffee"", ""Cleaning p...",True,3,12,24,113,False,30,60,157,2,1,2015-02-22 00:00:00,2024-08-31 00:00:00,4.82,4.89,4.69,4.97,4.96,4.94,4.73,1.35,I really enjoyed staying at Kanae's place. It ...


In [3]:
# Columns routed through the numeric preprocessing branch.
# Order matters: it fixes the column order of the transformed feature matrix.
numeric_features = [
    # host statistics
    'host_response_rate',
    'host_acceptance_rate',
    'host_listings_count',
    'host_total_listings_count',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    # listing capacity
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # availability windows
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    # stay-length limits
    'minimum_nights',
    'maximum_nights',
    # review volume
    'number_of_reviews',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    # review scores
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
]

# Columns routed through the categorical (one-hot) branch.
# 'amenities' is intentionally left out of this list.
categorical_features = [
    'property_type',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'host_verifications',
    'room_type',
]

In [4]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' keeps categories that were not
# seen during fit from raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each feature list through its own branch. No `remainder` is given, so
# ColumnTransformer's default applies to columns that are in neither list.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [5]:
# Name of the label column to predict.
target = 'price'

# Feature frame: everything except the label and the free-text, date and
# coordinate columns listed here. Each column is named exactly once (the
# previous list repeated 'first_review').
X = train.drop(columns=[
    target,
    'name',
    'host_since',
    'description',
    'latitude',
    'longitude',
    'amenities',
    'first_review',
    'last_review',
    'reviews',
])
y = train[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [15]:
# End-to-end model: column preprocessing followed by a random-forest regressor.
# The final step is still called 'classifier' (although it is a regressor) so
# that any code addressing the step by name keeps working.
model_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # A fixed seed makes the fitted forest, and therefore the reported metrics
    # and the submission, reproducible across runs; it matches the seed used
    # for the train/validation split.
    ('classifier', RandomForestRegressor(random_state=42)),
])

In [None]:
# Fit the preprocessing steps and the forest on the training split only.
model_rf.fit(X=X_train, y=y_train)

In [54]:
# Predict on the held-out split and round to the nearest whole number.
y_pred_rf = model_rf.predict(X_valid).round()

In [56]:
# Regression metrics for the rounded validation predictions.
r2 = r2_score(y_true=y_valid, y_pred=y_pred_rf)
mae = mean_absolute_error(y_true=y_valid, y_pred=y_pred_rf)
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred_rf)
# RMSE is the square root of MSE, i.e. expressed in the target's own units.
rmse = np.sqrt(mse)

In [58]:
# Report the four validation metrics, one per line, to two decimal places.
print(
    f"均方误差 (MSE): {mse:.2f}",
    f"均方根误差 (RMSE): {rmse:.2f}",
    f"平均绝对误差 (MAE): {mae:.2f}",
    f"R²: {r2:.2f}",
    sep="\n",
)

均方误差 (MSE): 0.69
均方根误差 (RMSE): 0.83
平均绝对误差 (MAE): 0.51
R²: 0.77


In [60]:
# Predict on the test frame and round to the nearest whole number.
y_pred = model_rf.predict(test).round()

In [80]:
# Two-column submission frame: listing id and the integer price prediction.
submission = pd.DataFrame({
    'id': test['id'],
    'price': y_pred.astype(int),
})

In [84]:
# Write the submission without the DataFrame index. `index` is documented as
# a bool; the previous `index=None` only worked because None is falsy.
submission.to_csv('submission.csv', index=False)