In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
import os

# 1. 更健壮的文件读取与列名检查
def load_and_validate_data(train_path, test_path):
    """Load the train/test CSV files and normalise their key column names.

    The training frame is guaranteed to come back with a 'Price' column and
    the test frame with an 'ID' column. When the expected column is missing,
    a replacement is chosen in this order and renamed (a warning is printed):

    1. a column whose name equals the keyword ignoring case
       (e.g. 'price', 'id');
    2. the first column whose lower-cased name contains the keyword;
    3. the last column (for 'Price') or the first column (for 'ID').

    Args:
        train_path: Path of the training CSV file.
        test_path: Path of the test CSV file.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.

    Raises:
        FileNotFoundError: If either path does not exist.
    """

    def _find_column(columns, keyword):
        # An exact (case-insensitive) hit must win over a substring hit;
        # otherwise a column such as 'width' or 'price_level' that happens
        # to appear earlier would be picked instead of the real 'id'/'price'.
        lowered = [(col, str(col).lower()) for col in columns]
        for col, low in lowered:
            if low == keyword:
                return col
        for col, low in lowered:
            if keyword in low:
                return col
        return None

    # Fail early with a clear message when a file is missing.
    if not os.path.exists(train_path):
        raise FileNotFoundError(f"训练集文件不存在: {train_path}")
    if not os.path.exists(test_path):
        raise FileNotFoundError(f"测试集文件不存在: {test_path}")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Make sure the training set exposes its target as 'Price'.
    if 'Price' not in train_df.columns:
        price_col = _find_column(train_df.columns, 'price')
        if price_col is not None:
            print(f"警告: 未找到'Price'列, 使用替代列: {price_col}")
            train_df = train_df.rename(columns={price_col: 'Price'})
        else:
            # Last resort: treat the final column as the target.
            last_col = train_df.columns[-1]
            print(f"警告: 未找到'Price'列, 使用最后一列作为目标列: {last_col}")
            train_df = train_df.rename(columns={last_col: 'Price'})

    # Make sure the test set exposes its identifier as 'ID'.
    if 'ID' not in test_df.columns:
        id_col = _find_column(test_df.columns, 'id')
        if id_col is not None:
            print(f"警告: 未找到'ID'列, 使用替代列: {id_col}")
            test_df = test_df.rename(columns={id_col: 'ID'})
        else:
            # Last resort: treat the first column as the identifier.
            first_col = test_df.columns[0]
            print(f"警告: 未找到'ID'列, 使用第一列作为ID列: {first_col}")
            test_df = test_df.rename(columns={first_col: 'ID'})

    return train_df, test_df

# 2. 改进的数据预处理
def preprocess_data(train_df, test_df):
    """Select the shared feature columns, impute missing values and split arrays.

    Features are every training column except 'Price' and every test column
    except 'ID'. Only features present in BOTH frames are used, and they are
    kept in the order in which they appear in the training frame so the
    resulting matrices are reproducible from run to run. Missing values in
    both frames are filled with the column means computed on the training
    frame. The input frames are not modified.

    Args:
        train_df: Training DataFrame containing a 'Price' column.
        test_df: Test DataFrame containing an 'ID' column.

    Returns:
        A tuple ``(X_train, y_train, X_test, test_ids, common_features,
        fill_values)`` where the first four items are NumPy arrays,
        ``common_features`` is the list of feature names used and
        ``fill_values`` is the Series of training-set column means.

    Raises:
        ValueError: If the two frames share no feature column.
    """
    # Work on copies so the caller's frames stay untouched.
    train_df = train_df.copy()
    test_df = test_df.copy()

    train_features = [col for col in train_df.columns if col != 'Price']
    test_features = [col for col in test_df.columns if col != 'ID']
    test_feature_set = set(test_features)

    # Compare by NAME, not just by count: two frames can have the same number
    # of features and still disagree on which ones they contain.
    if set(train_features) != test_feature_set:
        if len(train_features) != len(test_features):
            print(f"警告: 训练集特征数({len(train_features)})与测试集特征数({len(test_features)})不一致")
        else:
            print("警告: 训练集与测试集特征数相同但列名不一致")
        # Keep training-column order; a bare set intersection would yield an
        # arbitrary order that can change between interpreter runs.
        common_features = [col for col in train_features if col in test_feature_set]
        print(f"使用共同特征: {len(common_features)}个")
    else:
        common_features = train_features

    if not common_features:
        raise ValueError("训练集与测试集没有共同特征列")

    # Imputation values come from the training set only.
    fill_values = train_df[common_features].mean()

    train_df[common_features] = train_df[common_features].fillna(fill_values)
    test_df[common_features] = test_df[common_features].fillna(fill_values)

    X_train = train_df[common_features].values
    y_train = train_df['Price'].values

    X_test = test_df[common_features].values
    test_ids = test_df['ID'].values

    return X_train, y_train, X_test, test_ids, common_features, fill_values

# 主程序
try:
    # Load both CSV files and normalise the 'Price' / 'ID' column names.
    train_df, test_df = load_and_validate_data(
        '正常化数据（带反馈）_train_rent.csv', 
        '正常化数据（带反馈）_test_rent.csv'
    )
    
    # Select shared features, impute missing values and extract arrays.
    X_train, y_train, X_test, test_ids, feature_names, fill_values = preprocess_data(train_df, test_df)
    
    print(f"训练集形状: {X_train.shape}, 测试集形状: {X_test.shape}")
    print(f"使用特征: {feature_names}")
    print(f"填充值示例: {fill_values.head() if isinstance(fill_values, pd.Series) else fill_values[:5]}")
    
    # Hold out the validation rows BEFORE fitting the scaler. Fitting on the
    # full training matrix would let the validation rows influence the
    # mean/std used for scaling and make the validation metrics optimistic.
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Standardise features using statistics of the training split only.
    scaler = StandardScaler()
    X_train_split = scaler.fit_transform(X_train_split)
    X_val_split = scaler.transform(X_val_split)
    X_test_scaled = scaler.transform(X_test)
    # Kept so that code running after this block can still use the scaled
    # full training matrix under its original name.
    X_train_scaled = scaler.transform(X_train)
    
    # Fully connected regression network; the two widest layers use L2
    # weight decay, batch normalisation and dropout.
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train.shape[1],), 
              kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)  # single linear output for regression
    ])
    
    # Mean squared error loss, with mean absolute error reported alongside.
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    # Stop when val_loss has not improved for 15 epochs and roll back to the
    # best weights seen.
    early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    
    # Train the model.
    print("开始训练模型...")
    history = model.fit(
        X_train_split, y_train_split,
        validation_data=(X_val_split, y_val_split),
        epochs=200,
        batch_size=512,
        callbacks=[early_stop],
        verbose=1
    )
    
    # Report the final metrics on the held-out validation split.
    val_loss, val_mae = model.evaluate(X_val_split, y_val_split, verbose=0)
    print(f"验证集损失: {val_loss:.4f}, MAE: {val_mae:.4f}")
    
    # Predict on the test set; flatten the (n, 1) output to a 1-D array.
    print("进行预测...")
    predictions = model.predict(X_test_scaled).flatten()
    
    # Pair each test ID with its predicted price.
    result_df = pd.DataFrame({
        'ID': test_ids,
        'predict_price': predictions
    })
    
    # Write the predictions to '<model_name>_predict_rent.csv'.
    model_name = "NeuralNetwork"
    output_file = f'{model_name}_predict_rent.csv'
    result_df.to_csv(output_file, index=False)
    
    print(f"预测结果已保存到: {output_file}")
    print(f"文件前5行预览:\n{result_df.head()}")

except Exception as e:
    # Top-level boundary: keep the friendly hints, but also print the full
    # traceback so the actual failure location is not lost.
    import traceback

    print(f"发生错误: {str(e)}")
    traceback.print_exc()
    print("请检查: ")
    print("1. 文件路径是否正确")
    print("2. 文件内容是否符合要求")
    print("3. 列名是否包含'Price'(训练集)和'ID'(测试集)")
    print("4. 数据格式是否正确")

2025-10-29 02:09:02.388183: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


训练集形状: (98899, 56), 测试集形状: (9773, 56)
使用特征: ['城市', 'lon', 'lat', '年份', '区县', '板块', 'coord_x', 'coord_y', '室数', '厅数', '厨房数', '卫数', '精装修', '是否为底层', '是否为低层', '是否为中层', '是否为高层', '是否为顶层', '总楼层', '建筑面积值', '向南', '向北', '向西', '向东', '交易距今', '季付', '双月付', '月付', '半年付价', '年付', '整租', '合租', '是否有电梯', '租用车位', '免费使用', '是否有燃气', '设施情况', '房屋年份', '总户数', '总楼数', '绿化率', '塔楼', '板楼', '平房', '民水供水', '商水供水', '民电供电', '商电供电', '无供暖', '集中供暖', '自采暖', '很不好', '比较不好', '适中', '比较好', '很好']
填充值示例: 城市        4.322157
lon     115.752394
lat      31.420651
年份     2021.643010
区县       70.243054
dtype: float64


2025-10-29 02:09:05.474610: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2025-10-29 02:09:05.540224: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:b6:00.0 name: Tesla P100-SXM2-16GB computeCapability: 6.0
coreClock: 1.4805GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2025-10-29 02:09:05.540289: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2025-10-29 02:09:05.544318: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2025-10-29 02:09:05.544419: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2025-10-29 02:09:05.545780: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuff

开始训练模型...
Epoch 1/200


2025-10-29 02:09:06.713849: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2025-10-29 02:09:06.727672: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2502935000 Hz
2025-10-29 02:09:07.490819: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11


 27/155 [====>.........................] - ETA: 0s - loss: 717524172800.0000 - mae: 582598.5000

2025-10-29 02:09:07.817719: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7