In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import TargetEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet

In [45]:
preprocess_df_test = pd.read_csv("preprocess_df.csv")  # already-preprocessed table; the Price model cell below reads it

In [50]:
#-------------------------------Price_Model-------------------------------------------------------------
# Features / target: every column except 'Price' is a candidate feature.
x = preprocess_df_test.drop('Price', axis=1)
y = preprocess_df_test['Price'] 

# Fixed-seed 80/20 hold-out split so the reported metrics are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=111
)
                     
# Columns routed to log_pipeline (plain np.log).
# NOTE(review): np.log yields -inf for 0 and NaN for negatives - confirm these columns are strictly positive.
log_cols = ['area', 'gas_fee'] 

# Columns routed to log1p_pipeline (np.log1p).
log1p_cols = ['density_ratio', 'reflect_sentiment', 'park_area', 'greening', 'plot_ratio','property_fee']

# Columns routed to knn_num_pipeline.
knn_num_features = ['built_year','transaction_year', 'elevator_ratio']

# Room-layout counts routed to house_allo_pipeline.
house_allo_cols = ['room', 'hall', 'kitchen', 'bathroom']

# Columns routed to binary_pipeline (passed through unchanged).
binary_features = ['east', 'south', 'west', 'north', 'ownership']

# Columns routed to low_cat_pipeline (most-frequent imputation + one-hot encoding).
low_cat_features = ['city', 'build_struc', 'decoration', 'use_of_house', 'tran_right', 'floor', 'wat_sup', 'ele_sup']

# NOTE(review): the two lists below are not referenced by the ColumnTransformer in this cell,
# which hard-codes ['district'] and ['plate'] instead.
high_cat_district = ['district']  # target encoding
high_cat_plate = ['plate']  # clustering
class Winsorizer(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned during ``fit``.

    ``BaseEstimator`` provides ``get_params``/``set_params`` (convenient for
    hyper-parameter tuning); ``TransformerMixin`` provides ``fit_transform`` so
    the class can be used as a ``Pipeline`` step.

    Parameters
    ----------
    lower_q : float, default=0.01
        Lower quantile (0-1) used as the clipping floor.
    upper_q : float, default=0.99
        Upper quantile (0-1) used as the clipping ceiling.

    Attributes
    ----------
    low_, high_ : ndarray
        Per-column bounds; they exist only after ``fit`` has been called.
    """

    def __init__(self, lower_q=0.01, upper_q=0.99):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) must not be created in __init__, otherwise an unfitted
        # instance is indistinguishable from a fitted one.
        self.lower_q = lower_q
        self.upper_q = upper_q

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds, ignoring NaN; returns ``self``."""
        # Same float coercion as transform(), so both see identical values.
        X = np.asarray(X).astype(float)
        self.low_ = np.nanpercentile(X, 100 * self.lower_q, axis=0)
        self.high_ = np.nanpercentile(X, 100 * self.upper_q, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` as a float array clipped to ``[low_, high_]`` column-wise."""
        from sklearn.utils.validation import check_is_fitted

        # Raises NotFittedError instead of clipping against missing bounds.
        check_is_fitted(self, ["low_", "high_"])
        X = np.asarray(X).astype(float)
        return np.clip(X, self.low_, self.high_)

class LonLatClusterTransformer(BaseEstimator, TransformerMixin):
    """Replace the ``lon``/``lat`` columns with a KMeans cluster label.

    Parameters
    ----------
    n_clusters : int, default=8
        Number of KMeans clusters fitted on the coordinates.

    NOTE(review): the output is a single column of integer cluster ids. A
    linear model downstream would read those ids as magnitudes - consider
    one-hot encoding them; confirm intent.
    """

    def __init__(self, n_clusters=8):
        self.n_clusters = n_clusters
        self.kmeans = None

    @staticmethod
    def _coordinates(X):
        # KMeans needs a plain numeric array, so pull the two columns out of the frame.
        return X[['lon', 'lat']].values

    def fit(self, X, y=None):
        """Fit KMeans (seed 42) on the coordinate pairs; returns ``self``."""
        coords = self._coordinates(X)
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.kmeans.fit(coords)
        return self

    def transform(self, X):
        """Return the predicted cluster id of every row as an ``(n, 1)`` array."""
        labels = self.kmeans.predict(self._coordinates(X))
        # Transformers must return 2-D output, hence the single-column reshape.
        return labels.reshape(-1, 1)
        
class PlateClusterTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the plate column, then label each row with a KMeans cluster.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of KMeans clusters fitted on the one-hot matrix.

    NOTE(review): one-hot vectors of a single categorical column are all
    equally far apart, so the resulting clusters may carry little meaning -
    confirm that this grouping is intended.
    """

    def __init__(self, n_clusters=10):
        self.n_clusters = n_clusters
        self.kmeans = None
        # sparse_output=False -> the encoder returns a dense numpy array.
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    @staticmethod
    def _as_column(X):
        # Flatten the incoming frame into one (n, 1) column for the encoder.
        return X.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Fit the encoder and KMeans (seed 42) on the encoded values; returns ``self``."""
        one_hot = self.encoder.fit_transform(self._as_column(X))
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.kmeans.fit(one_hot)
        return self

    def transform(self, X):
        """Return the cluster id of every row as an ``(n, 1)`` array."""
        one_hot = self.encoder.transform(self._as_column(X))
        labels = self.kmeans.predict(one_hot)
        # Transformers must return 2-D output.
        return labels.reshape(-1, 1)

# Numeric columns: impute -> winsorize at the 0.1% / 99.9% quantiles -> standardize.
# NOTE(review): the median SimpleImputer runs first, so KNNImputer presumably never
# receives a missing value and acts as a no-op - confirm which imputer is intended.
knn_num_pipeline = Pipeline([
    ('ref_imputer', SimpleImputer(strategy='median')),
    ('knn_imputer', KNNImputer(n_neighbors=5)), 
    ('winsor', Winsorizer(lower_q=0.001, upper_q=0.999)), 
    ('scaler', StandardScaler()) 
])

# Median imputation -> natural log -> winsorize -> standardize.
log_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log)), 
    ('winsor', Winsorizer(lower_q=0.001, upper_q=0.999)),
    ('scaler', StandardScaler())
])

# Same as log_pipeline but with log1p, which is defined at 0.
log1p_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log1p', FunctionTransformer(np.log1p)),
    ('winsor', Winsorizer(lower_q=0.001, upper_q=0.999)),
    ('scaler', StandardScaler())
])

# Room-layout counts: median imputation -> winsorize -> standardize.
house_allo_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('winsor', Winsorizer(lower_q=0.001, upper_q=0.999)),
    ('scaler', StandardScaler())
])

# Low-cardinality categoricals: most-frequent imputation -> dense one-hot encoding;
# categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
low_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Binary flags are forwarded as-is (no imputation in this pipeline).
binary_pipeline = Pipeline([
    ('passthrough', FunctionTransformer(lambda x: x))  # pass through unchanged
])


# Route each column group to its pipeline; columns not listed are dropped.
# NOTE(review): geo_cluster and plate_cluster each emit one integer label column that the
# linear regressor will read as a magnitude - confirm this is intended.
preprocessor = ColumnTransformer([
    ('geo_cluster', LonLatClusterTransformer(n_clusters=40), ['lon', 'lat']),
    ('log_cols', log_pipeline, log_cols),
    ('log1p_num', log1p_pipeline, log1p_cols),
    ('built_year_knn_impute', knn_num_pipeline, knn_num_features),
    ('house_allo', house_allo_pipeline, house_allo_cols),
    ('binary', binary_pipeline, binary_features),
    ('low_cat', low_cat_pipeline, low_cat_features),
    ('district_target', TargetEncoder(cols=['district'], smoothing=50), ['district']),
    ('plate_cluster', PlateClusterTransformer(n_clusters=10), ['plate'])
], remainder='drop')

base_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor',  Ridge(alpha=200)) # alternatives: LinearRegression(), Lasso(), Ridge(), ElasticNet()
])

# Train on log(Price); predictions are mapped back with exp, so metrics are in price units.
ttr = TransformedTargetRegressor(regressor=base_pipeline,
                                     func=np.log,
                                     inverse_func=np.exp)

ttr.fit(x_train, y_train)

# Predictions on the hold-out split and on the training split.
y_pred = ttr.predict(x_test)
y_pred_in = ttr.predict(x_train)

# mae / mse / r2 are hold-out metrics; mae_in is the in-sample MAE.
mae = mean_absolute_error(y_test, y_pred)
mae_in = mean_absolute_error(y_train, y_pred_in)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAEIN: {mae_in}")
print(f"MSE: {mse}")
print(f"R²: {r2}")

MAE: 519626.61097841046
MAEIN: 524076.68133207056
MSE: 1193435671831.8533
R²: 0.807239447208437


In [49]:
from sklearn.model_selection import cross_val_score

# 6-fold cross-validation of the whole target-transformed pipeline, on the training split only.
cv_folds = 6 
# sklearn scorers follow "greater is better", so MAE is reported negated.
scoring_metric = 'neg_mean_absolute_error'

cv_scores = cross_val_score(
    estimator=ttr,
    X=x_train,                
    y=y_train,                 
    cv=cv_folds,              
    scoring=scoring_metric 
)

# Mean of the per-fold negated MAE values.
print(f"cv_avg: {cv_scores.mean()}")

cv_avg: -527682.534245314


In [2]:
#-------------------------------process_rent-------------------------------------------------------------
def floor2num(floor_str):
    """Map a floor description to an ordinal level: 1 (low), 2 (middle), 3 (high).

    Two input forms are recognised:

    * keyword form - the text contains "高" (high), "中" (middle) or "低" (low);
    * numeric form - a plain number or a "current/total" fraction, bucketed as
      ``< 0.3 -> 1``, ``> 0.6 -> 3``, otherwise ``2``.

    Parameters
    ----------
    floor_str : object
        Raw cell value; missing values are accepted.

    Returns
    -------
    int or float
        1, 2 or 3, or ``np.nan`` when the value is missing, empty or unparseable.
    """
    if pd.isna(floor_str):
        return np.nan
    floor_str = str(floor_str).strip()

    # Keyword form; checked in the same priority order as before (high, middle, low).
    for keyword, level in (("高", 3), ("中", 2), ("低", 1)):
        if keyword in floor_str:
            return level

    # Numeric form. Guard the empty string before indexing its first character.
    if not floor_str or not floor_str[0].isdigit():
        return np.nan

    # Parse "a/b" by hand instead of eval(): cell contents are data, not code, and
    # eval() also rejects ordinary values such as "08/20" (leading zero).
    numerator, slash, denominator = floor_str.partition("/")
    try:
        frac = float(numerator)
        if slash:
            frac /= float(denominator)
    except (ValueError, ZeroDivisionError):
        return np.nan

    if frac < 0.3:
        return 1
    if frac > 0.6:
        return 3
    return 2

def apply_floor_conversion(df, col_name="楼层"):
    """Return a copy of ``df`` with a ``floor`` column built by applying ``floor2num`` to ``col_name``."""
    levels = df[col_name].apply(floor2num)
    result = df.copy()
    result["floor"] = levels
    return result

def split_house_type(df, col_name="户型"):
    """Split a layout string such as "2室1厅1厨1卫" into numeric count columns.

    Adds ``room``, ``hall``, ``kitchen`` and ``bathroom`` to a copy of ``df``.
    A count is NaN when the cell is missing or the unit is absent from the text.
    """
    unit_patterns = {
        'room': r'(\d+)室',
        'hall': r'(\d+)厅',
        'kitchen': r'(\d+)厨',
        'bathroom': r'(\d+)卫'
    }

    def _count(cell, regex):
        # Missing cell -> NaN; otherwise the first captured run of digits, if any.
        if not pd.notnull(cell):
            return np.nan
        found = re.search(regex, str(cell))
        return int(found.group(1)) if found else np.nan

    result = df.copy()
    for target, regex in unit_patterns.items():
        result[target] = result[col_name].apply(lambda cell: _count(cell, regex))
    return result

def remove_string(df, col_name, strings_to_remove):
    """Return a copy of ``df`` with the given substrings removed from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Column holding strings (it is accessed through ``.str``).
    strings_to_remove : str or iterable of str
        One substring, or several. A bare string is treated as ONE literal
        token. Iterating it character by character would turn e.g. "元/m³"
        into "remove every 元, /, m and ³", which also strips legitimate
        characters from the value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df``; the input frame is not modified.
    """
    if isinstance(strings_to_remove, str):
        strings_to_remove = [strings_to_remove]
    tokens = [s for s in strings_to_remove if s]

    df_copy = df.copy()
    if not tokens:
        # Nothing to strip; skip the no-op regex pass.
        return df_copy

    # re.escape() neutralises regex metacharacters so each token matches literally.
    pattern = '|'.join(re.escape(s) for s in tokens)
    df_copy[col_name] = df_copy[col_name].str.replace(pattern, '', regex=True)
    return df_copy

def change_toward(df, col_name="朝向"):
    """Expand an orientation string into four 0/1 flag columns.

    Adds ``east``, ``south``, ``west`` and ``north`` to a copy of ``df``; a flag
    is 1 when the matching character occurs in the cell and 0 otherwise
    (missing cells give 0 everywhere).
    """
    compass = (
        ('east', '东'),
        ('south', '南'),
        ('west', '西'),
        ('north', '北'),
    )

    def _has(cell, char):
        if pd.notna(cell) and char in str(cell):
            return 1
        return 0

    result = df.copy()
    for flag_col, char in compass:
        result[flag_col] = result[col_name].apply(lambda cell: _has(cell, char))
    return result

def water2num(water_str):
    """Encode the water-supply text: "商水" -> 2, "民水" -> 1, anything else -> NaN.

    "商水" takes precedence when both markers occur in the text.
    """
    text = str(water_str)
    for marker, code in (("商水", 2), ("民水", 1)):
        if marker in text:
            return code
    return np.nan

def apply_water_conversion(df, col_name="用水"):
    """Return a copy of ``df`` whose ``col_name`` column is re-encoded with ``water2num``."""
    encoded = df[col_name].apply(water2num)
    result = df.copy()
    result[col_name] = encoded
    return result

def ele2num(ele_str):
    """Encode the electricity-supply text: "商电" -> 2, "民电" -> 1, anything else -> NaN.

    "商电" takes precedence when both markers occur in the text.
    """
    text = str(ele_str)
    for marker, code in (("商电", 2), ("民电", 1)):
        if marker in text:
            return code
    return np.nan

def apply_ele_conversion(df, col_name="用电"):
    """Return a copy of ``df`` whose ``col_name`` column is re-encoded with ``ele2num``."""
    encoded = df[col_name].apply(ele2num)
    result = df.copy()
    result[col_name] = encoded
    return result

def yesno2num(w_str):
    """Encode a has/has-not text: contains "有" -> 1, contains "无" -> 0, otherwise NaN.

    "有" is tested first, so a text containing both markers yields 1.
    """
    text = str(w_str)
    for marker, code in (("有", 1), ("无", 0)):
        if marker in text:
            return code
    return np.nan

def apply_yesno2num(df, col_name=None):
    """Return a copy of ``df`` whose ``col_name`` column is re-encoded with ``yesno2num``.

    ``col_name`` has no usable default: the value ``None`` is used directly as
    the column label, so callers are expected to pass a real column name.
    """
    encoded = df[col_name].apply(yesno2num)
    result = df.copy()
    result[col_name] = encoded
    return result

def rent2num(r_str):
    """Encode the rental mode: "整租" -> 2, "合租" -> 1, anything else -> NaN.

    "整租" takes precedence when both markers occur in the text.
    """
    text = str(r_str)
    for marker, code in (("整租", 2), ("合租", 1)):
        if marker in text:
            return code
    return np.nan

def apply_rent2num(df, col_name="租赁方式"):
    """Return a copy of ``df`` whose ``col_name`` column is re-encoded with ``rent2num``."""
    encoded = df[col_name].apply(rent2num)
    result = df.copy()
    result[col_name] = encoded
    return result

def avgdense_ratio(df, col1="房屋总数", col2="楼栋总数"):
    """Add ``density_ratio`` = ``col1 / col2`` to a copy of ``df``.

    Zeros in either column are replaced by NaN first, so the ratio is NaN
    whenever the numerator or the denominator is 0.
    """
    result = df.copy()
    numerator, denominator = (
        result[column].replace(0, np.nan) for column in (col1, col2)
    )
    result["density_ratio"] = numerator.div(denominator)
    return result

def cal_deal_time(df, col_name="交易时间", base_year=2025):
    """Add ``transaction_year`` = ``base_year`` minus the year of ``col_name``.

    The year is read from the first four characters of the cell's text.
    Missing or blank cells give NaN.
    """
    def _years_since(cell):
        if pd.notna(cell) and str(cell).strip():
            return base_year - int(str(cell)[:4])
        return np.nan

    result = df.copy()
    result["transaction_year"] = result[col_name].apply(_years_since)
    return result

# Amenities that get their own indicator column.
facility = ["床", "衣柜", "空调", "洗衣机", "热水器"]
def check_facility(df, col_name):
    """Expand a "、"-separated amenity list into one 0/1 column per amenity.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Column holding text such as "床、空调"; missing cells count as empty.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one float column (1.0 / 0.0) per entry of the
        module-level ``facility`` list. An amenity is flagged only on an exact
        match with one of the stripped, non-empty list items.
    """
    df_copy = df.copy()

    def _items(cell):
        # Missing cell -> no amenities; otherwise the set of stripped, non-empty parts.
        if pd.isna(cell):
            return frozenset()
        parts = (part.strip() for part in str(cell).strip().split("、"))
        return frozenset(part for part in parts if part)

    item_sets = [_items(cell) for cell in df[col_name]]

    # Assign whole columns by position. Writing row by row through an index
    # label gives wrong flags when labels repeat (e.g. after a concat), and it
    # is also much slower.
    for f in facility:
        df_copy[f] = np.array(
            [1.0 if f in items else 0.0 for items in item_sets], dtype=float
        )

    return df_copy

# Property categories that get their own indicator column.
build_cat = ["普通住宅", "底商", "商业", "车库", "公寓", "别墅"]
def check_build_cat(df, col_name):
    """Expand a "/"-separated property-category list into one 0/1 column per category.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Column holding text such as "普通住宅/底商"; missing cells count as empty.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one float column (1.0 / 0.0) per entry of the
        module-level ``build_cat`` list. A category is flagged only on an exact
        match with one of the stripped, non-empty list items.
    """
    df_copy = df.copy()

    def _items(cell):
        # Missing cell -> no categories; otherwise the set of stripped, non-empty parts.
        if pd.isna(cell):
            return frozenset()
        parts = (part.strip() for part in str(cell).strip().split("/"))
        return frozenset(part for part in parts if part)

    item_sets = [_items(cell) for cell in df[col_name]]

    # Assign whole columns by position. Writing row by row through an index
    # label gives wrong flags when labels repeat, and it is also much slower.
    for bc in build_cat:
        df_copy[bc] = np.array(
            [1.0 if bc in items else 0.0 for items in item_sets], dtype=float
        )

    return df_copy

def take_year(df, col_name="建筑年代"):
    """Replace ``col_name`` with the 4-digit year that precedes the character "年".

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Column holding text such as "1998年建成".

    Returns
    -------
    pandas.DataFrame
        Copy of ``df``. The column holds the parsed year, or a missing value
        when the cell is not a string, has no "年" with at least four
        characters before it, or those four characters are not an integer.
    """
    df_copy = df.copy()

    def process_single_value(s):
        # Non-text cells (NaN, numbers, ...) carry no "年" marker to anchor on.
        if not isinstance(s, str):
            return None
        year_pos = s.find("年")
        if year_pos < 4:
            return None
        try:
            return int(s[year_pos - 4 : year_pos])
        except ValueError:
            # Only a failed integer parse is expected here. A bare `except`
            # would also hide KeyboardInterrupt and genuine programming errors.
            return None

    df_copy[col_name] = df_copy[col_name].apply(process_single_value)
    return df_copy

def interval_covert(df, col_name=None):
    """Convert a column of numbers or "low-high" ranges into floats.

    A plain number becomes that number; a range such as "1.5-2.5" becomes the
    midpoint of its two ends; anything unparseable becomes a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Column to convert. ``None`` is not a usable default - pass a real label.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col_name`` converted.
    """
    df_copy = df.copy()

    def count_interval(s):
        s = str(s).strip()

        # Try a plain number first, so that "-5" or "1e-3" is not mistaken for
        # a range just because it contains a dash.
        try:
            return float(s)
        except ValueError:
            pass

        low, dash, high = s.partition("-")
        if not dash:
            return None
        try:
            return (float(low) + float(high)) / 2
        except ValueError:
            # Only number-parsing failures are expected; do not hide anything else.
            return None

    df_copy[col_name] = df_copy[col_name].apply(count_interval)
    return df_copy


def preprocess_before_pipeline(df, config=None):
    """Clean the raw listing table before it enters the sklearn pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; it is not modified.
    config : dict, optional
        Overrides for the source column names. Every ``*_col`` key read below
        is optional and falls back to the default Chinese column header.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy: text columns converted to numbers or flags, derived
        columns added (``floor``, room counts, orientation flags,
        ``transaction_year``, amenity and category flags, ``density_ratio``),
        and the floor, layout, orientation and deal-time source columns dropped.
    """
    if config is None:
        config = {}

    floor_col = config.get('floor_col', '楼层')
    house_type_col = config.get('house_type_col', '户型')
    toward_col = config.get('toward_col', '朝向')
    house_area_col = config.get('house_area_col', '面积')
    elev_col = config.get('elev_col', '电梯')
    deal_time_col = config.get('deal_time_col', '交易时间')
    way2rent_col = config.get('way2rent_col', '租赁方式')
    house_total_col = config.get('house_total_col', '房屋总数')
    building_total_col = config.get('building_total_col', '楼栋总数')
    gas_fee_col = config.get('gas_fee_col', '燃气')
    wat_sup_col = config.get('wat_sup_col', '用水')
    ele_sup_col = config.get('ele_sup_col', '用电')

    facility_col = config.get('facility_col', '配套设施')
    build_cat_col = config.get('build_cat_col', '物业类别')
    built_year_col = config.get('built_year_col', '建筑年代')
    greening_rate_col = config.get('greening_rate_col', '绿 化 率')
    property_fee_col = config.get('property_fee_col', '物 业 费')
    # Own config key: this used to read 'gas_fee_col', so overriding the gas
    # yes/no column silently redirected the gas-fee column to it as well.
    gas_feenum_col = config.get('gas_feenum_col', '燃气费')

    df_processed = df.copy()

    # Floor: strip the unit character, then bucket into the ordinal `floor` column.
    df_processed = remove_string(df_processed, floor_col, "层")
    df_processed = apply_floor_conversion(df_processed, floor_col)

    # Layout string -> room / hall / kitchen / bathroom counts.
    df_processed = split_house_type(df_processed, house_type_col)

    # Area: strip the unit and coerce to a number (unparseable -> NaN).
    df_processed = remove_string(df_processed, house_area_col, "㎡")
    df_processed[house_area_col] = pd.to_numeric(df_processed[house_area_col], errors='coerce')

    # Orientation -> east / south / west / north flags.
    df_processed = change_toward(df_processed, toward_col)

    df_processed = cal_deal_time(df_processed, deal_time_col)

    # Disabled step, kept for reference:
    # df_processed = apply_rent2num(df_processed, way2rent_col)

    # Has / has-not columns -> 1 / 0.
    df_processed = apply_yesno2num(df_processed, elev_col)
    df_processed = apply_yesno2num(df_processed, gas_fee_col)

    df_processed = apply_water_conversion(df_processed, wat_sup_col)
    df_processed = apply_ele_conversion(df_processed, ele_sup_col)

    df_processed = check_facility(df_processed, facility_col)
    df_processed = check_build_cat(df_processed, build_cat_col)
    df_processed = take_year(df_processed, built_year_col)

    # Percentages and fees: strip units, average "low-high" ranges, coerce to numbers.
    df_processed = remove_string(df_processed, greening_rate_col, "%")
    df_processed[greening_rate_col] = pd.to_numeric(df_processed[greening_rate_col], errors='coerce')
    df_processed = remove_string(df_processed, property_fee_col, "元/月/㎡")
    df_processed = interval_covert(df_processed, property_fee_col)
    df_processed[property_fee_col] = pd.to_numeric(df_processed[property_fee_col], errors='coerce')
    df_processed = remove_string(df_processed, gas_feenum_col, "元/m³")
    df_processed = interval_covert(df_processed, gas_feenum_col)
    df_processed[gas_feenum_col] = pd.to_numeric(df_processed[gas_feenum_col], errors='coerce')

    # Household and building totals -> numbers -> households-per-building ratio.
    df_processed = remove_string(df_processed, house_total_col, "户")
    df_processed = remove_string(df_processed, building_total_col, "栋")
    df_processed[house_total_col] = pd.to_numeric(df_processed[house_total_col], errors='coerce')
    df_processed[building_total_col] = pd.to_numeric(df_processed[building_total_col], errors='coerce')
    df_processed = avgdense_ratio(df_processed, house_total_col, building_total_col)

    # Drop source columns that have been replaced by derived features.
    columns_to_drop = [floor_col, house_type_col, toward_col, deal_time_col]
    existing_cols_to_drop = [col for col in columns_to_drop if col in df_processed.columns]
    df_processed = df_processed.drop(columns=existing_cols_to_drop)

    return df_processed

# Chinese source headers -> English feature names used by the model cells.
# Headers that do not appear here keep their original name when the mapping
# is applied with `.get(col, col)`.
column_mapping = {
    # location and basic listing attributes
    "城市": "city",
    "区县": "district",
    "板块": "plate",
    "面积": "area",
    "年份": "year",
    "租赁方式": "way2rent",
    # utilities
    "电梯": "elevator",
    "用水": "wat_sup",
    "用电": "ele_sup",
    "燃气": "gas",
    # amenity flags
    "床": "bed",
    "衣柜": "wardrobe",
    "空调": "air_condi",
    "洗衣机": "wash_mach",
    "热水器": "water_heat",
    # property-category flags
    "普通住宅": "dwelling",
    "底商": "ground_comm",
    "商业": "commerce",
    "车库": "carport",
    "公寓": "apart",
    "别墅": "villa",
    # lease and community attributes
    "租期": "period2rent",
    "建筑年代": "built_year",
    "绿 化 率": "greening_rate",
    "容 积 率": "plot_ratio",
    "物 业 费": "property_fee",
    "建筑结构": "build_struc",
    "燃气费": "gas_feenum",
    "停车位": "park_area",
}

def fraction_to_float(frac_str):
    """Convert "a/b" or a plain number into a float.

    Returns
    -------
    float or None
        ``a / b`` for a two-part fraction, ``float(frac_str)`` otherwise;
        ``None`` when the value is missing, not numeric, or the denominator is 0.
    """
    if pd.isna(frac_str):
        return None

    try:
        parts = str(frac_str).split('/')
        if len(parts) == 2:
            return float(parts[0]) / float(parts[1])
        return float(frac_str)
    except (ValueError, TypeError, ZeroDivisionError):
        # Expected parse failures only. A bare `except` would also hide
        # KeyboardInterrupt and unrelated bugs.
        return None

In [7]:
test_rent = pd.read_csv("ruc_Class25Q2_train_rent.csv")  # raw, unprocessed rent listings
test_rent_senti = pd.read_csv("sentiment_rent.csv")

# Attach the sentiment score. The assignment aligns on the row index.
# NOTE(review): this assumes both files list the same rows in the same order - confirm.
test_rent["reflect_sentiment"] = test_rent_senti["reflect_sentiment"]
test_rent_sentiment = test_rent.copy()

preprocess_before_df = preprocess_before_pipeline(test_rent_sentiment, config=None)

# Drop raw text / identifier columns that the model does not use. A new frame is
# returned rather than mutating in place, so re-running the cell is safe.
preprocess_before_df = preprocess_before_df.drop(
    columns=["付款方式", "coord_x", "coord_y", "房屋总数", "楼栋总数", "客户反馈",
             "配套设施", "物业类别", "开发商", "物业公司", "产权描述",
             "供水", "供电", "停车费用"]
)

# Rename to English feature names; unmapped headers keep their original name.
preprocess_before_df.columns = [column_mapping.get(col, col) for col in preprocess_before_df.columns]
preprocess_before_df.info()
preprocess_df_test = preprocess_before_df.copy()

  test_rent = pd.read_csv("ruc_Class25Q2_train_rent.csv") #未处理的


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98899 entries, 0 to 98898
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   city               98899 non-null  int64  
 1   装修                 25410 non-null  object 
 2   Price              98899 non-null  float64
 3   area               98899 non-null  float64
 4   way2rent           98899 non-null  object 
 5   elevator           98895 non-null  float64
 6   车位                 24764 non-null  object 
 7   wat_sup            81159 non-null  float64
 8   ele_sup            81575 non-null  float64
 9   gas                94317 non-null  float64
 10  采暖                 34412 non-null  object 
 11  period2rent        51966 non-null  object 
 12  lon                98899 non-null  float64
 13  lat                98899 non-null  float64
 14  year               98899 non-null  float64
 15  district           94222 non-null  float64
 16  plate              937

In [9]:
#-------------------------------Rent_Model-------------------------------------------------------------
# Features / target: every column except 'Price' is a candidate feature.
x = preprocess_df_test.drop('Price', axis=1)
y = preprocess_df_test['Price'] 

# Fixed-seed 80/20 hold-out split so the reported metrics are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=111
)

# Columns routed to log_pipeline (plain np.log).
# NOTE(review): np.log yields -inf for 0 and NaN for negatives - confirm these columns are strictly positive.
log_cols = ['gas_feenum', 'park_area']

# Columns routed to log1p_pipeline (np.log1p).
log1p_cols = ['density_ratio', 'reflect_sentiment', 'plot_ratio', 'greening_rate', 'property_fee']

# Columns routed to num_pipeline (median imputation, winsorize, scale).
num_cols = ['area']

knn_num_features = ['built_year','transaction_year', 'year'] # routed to knn_num_pipeline

house_allo_cols = ['room', 'hall', 'bathroom', 'dwelling', 'ground_comm', 'commerce', 'carport', 'apart', 'villa'] # routed to house_allo_pipeline (also KNN-based)

# Columns routed to binary_pipeline (most-frequent imputation only).
binary_features = ['east', 'south', 'west', 'north', 'bed', 'wardrobe', 'air_condi', 'wash_mach', 'water_heat']

# Columns routed to low_cat_pipeline (most-frequent imputation + one-hot encoding).
low_cat_features = ['city', 'way2rent', 'floor', 'wat_sup', 'ele_sup', 'build_struc']

# NOTE(review): the three lists below are not referenced by the ColumnTransformer in this cell,
# which hard-codes the column names instead.
high_cat_district = ['district']  # target encoding
high_cat_period2rent = ['period2rent']
high_cat_plate = ['plate']  # clustering

class Winsorizer(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned during ``fit``.

    ``BaseEstimator`` provides ``get_params``/``set_params`` (convenient for
    hyper-parameter tuning); ``TransformerMixin`` provides ``fit_transform`` so
    the class can be used as a ``Pipeline`` step.

    NOTE(review): this cell re-declares a class that the Price model cell also
    defines; consider defining it once near the top of the notebook.

    Parameters
    ----------
    lower_q : float, default=0.01
        Lower quantile (0-1) used as the clipping floor.
    upper_q : float, default=0.99
        Upper quantile (0-1) used as the clipping ceiling.

    Attributes
    ----------
    low_, high_ : ndarray
        Per-column bounds; they exist only after ``fit`` has been called.
    """

    def __init__(self, lower_q=0.01, upper_q=0.99):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) must not be created in __init__, otherwise an unfitted
        # instance is indistinguishable from a fitted one.
        self.lower_q = lower_q
        self.upper_q = upper_q

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds, ignoring NaN; returns ``self``."""
        # Same float coercion as transform(), so both see identical values.
        X = np.asarray(X).astype(float)
        self.low_ = np.nanpercentile(X, 100 * self.lower_q, axis=0)
        self.high_ = np.nanpercentile(X, 100 * self.upper_q, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` as a float array clipped to ``[low_, high_]`` column-wise."""
        from sklearn.utils.validation import check_is_fitted

        # Raises NotFittedError instead of clipping against missing bounds.
        check_is_fitted(self, ["low_", "high_"])
        X = np.asarray(X).astype(float)
        return np.clip(X, self.low_, self.high_)

class LonLatClusterTransformer(BaseEstimator, TransformerMixin):
    """Replace the ``lon``/``lat`` columns with a KMeans cluster label.

    Parameters
    ----------
    n_clusters : int, default=8
        Number of KMeans clusters fitted on the coordinates.

    NOTE(review): the output is a single column of integer cluster ids. A
    linear model downstream would read those ids as magnitudes - consider
    one-hot encoding them; confirm intent. This cell also re-declares a class
    that the Price model cell defines.
    """

    def __init__(self, n_clusters=8):
        self.n_clusters = n_clusters
        self.kmeans = None

    @staticmethod
    def _coordinates(X):
        # KMeans needs a plain numeric array, so pull the two columns out of the frame.
        return X[['lon', 'lat']].values

    def fit(self, X, y=None):
        """Fit KMeans (seed 42) on the coordinate pairs; returns ``self``."""
        coords = self._coordinates(X)
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.kmeans.fit(coords)
        return self

    def transform(self, X):
        """Return the predicted cluster id of every row as an ``(n, 1)`` array."""
        labels = self.kmeans.predict(self._coordinates(X))
        # Transformers must return 2-D output, hence the single-column reshape.
        return labels.reshape(-1, 1)
        
class PlateClusterTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the plate column, then label each row with a KMeans cluster.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of KMeans clusters fitted on the one-hot matrix.

    NOTE(review): one-hot vectors of a single categorical column are all
    equally far apart, so the resulting clusters may carry little meaning -
    confirm that this grouping is intended. This cell also re-declares a class
    that the Price model cell defines.
    """

    def __init__(self, n_clusters=10):
        self.n_clusters = n_clusters
        self.kmeans = None
        # sparse_output=False -> the encoder returns a dense numpy array.
        self.encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    @staticmethod
    def _as_column(X):
        # Flatten the incoming frame into one (n, 1) column for the encoder.
        return X.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Fit the encoder and KMeans (seed 42) on the encoded values; returns ``self``."""
        one_hot = self.encoder.fit_transform(self._as_column(X))
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.kmeans.fit(one_hot)
        return self

    def transform(self, X):
        """Return the cluster id of every row as an ``(n, 1)`` array."""
        one_hot = self.encoder.transform(self._as_column(X))
        labels = self.kmeans.predict(one_hot)
        # Transformers must return 2-D output.
        return labels.reshape(-1, 1)

# Plain numeric columns: median imputation -> winsorize (1% / 99%) -> standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('winsor', Winsorizer(lower_q=0.01, upper_q=0.99)),
    ('scaler', StandardScaler())
])

# Median imputation -> natural log -> winsorize -> standardize.
log_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log)),
    ('winsor', Winsorizer(lower_q=0.01, upper_q=0.99)),
    ('scaler', StandardScaler())
])

# Same as log_pipeline but with log1p, which is defined at 0.
log1p_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log1p', FunctionTransformer(np.log1p)),
    ('winsor', Winsorizer(lower_q=0.01, upper_q=0.99)),
    ('scaler', StandardScaler())
])

# NOTE(review): in the two pipelines below the median SimpleImputer runs first, so
# KNNImputer presumably never receives a missing value and acts as a no-op -
# confirm which imputer is intended.
knn_num_pipeline = Pipeline([
    ('ref_imputer', SimpleImputer(strategy='median')),
    ('knn_imputer', KNNImputer(n_neighbors=5)),
    ('winsor', Winsorizer(lower_q=0.01, upper_q=0.99)),
    ('scaler', StandardScaler())
])

house_allo_pipeline = Pipeline([
    ('ref_imputer', SimpleImputer(strategy='median')),
    ('knn_imputer', KNNImputer(n_neighbors=5)),
    ('winsor', Winsorizer(lower_q=0.01, upper_q=0.99)),
    ('scaler', StandardScaler())
])

# Low-cardinality categoricals: most-frequent imputation -> dense one-hot encoding;
# categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
low_cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Binary flags: fill gaps with the most frequent value, no scaling.
binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])


# Route each column group to its pipeline; columns not listed are dropped.
preprocessor = ColumnTransformer([
    ('geo_cluster', LonLatClusterTransformer(n_clusters=25), ['lon', 'lat']),
    ('log_cols', log_pipeline, log_cols),
    ('log1p_num', log1p_pipeline, log1p_cols),
    ('num_feature', num_pipeline, num_cols),
    ('knn_num_impute', knn_num_pipeline, knn_num_features),
    ('house_allo', house_allo_pipeline, house_allo_cols),
    ('binary', binary_pipeline, binary_features),
    ('low_cat', low_cat_pipeline, low_cat_features),
    ('district_target', TargetEncoder(cols=['district'], smoothing=15), ['district']),
    # `cols` must be passed by keyword: TargetEncoder's first positional parameter
    # is `verbose`, so the bare string used before was never the column selection.
    ('period2rent', TargetEncoder(cols=['period2rent'], smoothing=15), ['period2rent']),
    ('plate_cluster', PlateClusterTransformer(n_clusters=10), ['plate'])
], remainder='drop')

base_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Re-standardize the assembled matrix (one-hot, target-encoded and cluster columns included).
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Train on log(Price); predictions are mapped back with exp, so metrics are in price units.
ttr = TransformedTargetRegressor(regressor=base_pipeline,
                                 func=np.log,
                                 inverse_func=np.exp)

ttr.fit(x_train, y_train)

# Predictions on the hold-out split and on the training split.
y_pred = ttr.predict(x_test)
y_pred_in = ttr.predict(x_train)

# mae / mse / r2 are hold-out metrics; mae_in is the in-sample MAE.
mae = mean_absolute_error(y_test, y_pred)
mae_in = mean_absolute_error(y_train, y_pred_in)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAEIN: {mae_in}")
print(f"MSE: {mse}")
print(f"R²: {r2}")

MAE: 134220.47204780584
MAEIN: 133596.7249174649
MSE: 91420503538.85396
R²: 0.7534768016171883


In [10]:
from sklearn.model_selection import cross_val_score

# 6-fold cross-validation of the whole target-transformed pipeline, on the training split only.
cv_folds = 6 
# sklearn scorers follow "greater is better", so MAE is reported negated.
scoring_metric = 'neg_mean_absolute_error'

cv_scores = cross_val_score(
    estimator=ttr,
    X=x_train,                
    y=y_train,                 
    cv=cv_folds,              
    scoring=scoring_metric 
)

# Mean of the per-fold negated MAE values.
print(f"cv_avg: {cv_scores.mean()}")

cv_avg: -132888.60818521338


In [21]:
# Show the configuration of the pipeline that was passed to the TransformedTargetRegressor as `regressor`.
print(ttr.regressor)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('geo_cluster',
                                                  LonLatClusterTransformer(n_clusters=25),
                                                  ['lon', 'lat']),
                                                 ('log_cols',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('log',
                                                                   FunctionTransformer(func=<ufunc 'log'>)),
                                                                  ('winsor',
                                                                   Winsorizer()),
                                                                  ('scaler',
                                                                   StandardScaler()