In [None]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

# ==============================================
# MODEL CONFIGURATION
# ==============================================
# Central settings dictionary read by main().
config = {
    # --- Training hyperparameters ---
    # NOTE(review): these four keys are not read by any code visible in this
    # file (train_model is a stub); presumably they describe the training run
    # that produced the saved model - confirm before changing them.
    "batch_size": 32,
    "learning_rate": 0.003,
    "epochs": 2000,
    "early_stopping_patience": 100,
    # --- Network architecture ---
    # main() passes these three values to HousePriceModel and then loads a
    # saved state dict into that model.
    "hidden_layers": [512, 256, 128, 64],
    "dropout_rate": 0.3,
    "use_dropout": True,
    # Passed as test_size to train_test_split in main().
    "validation_size": 0.2,
    # Seeds NumPy and PyTorch in main() and is the random_state of the split.
    "random_state": 42,
    # When True, preprocess_data() applies log1p to SalePrice and main()
    # applies expm1 to the model outputs.
    "target_transform": True,
    # --- Optimizer / LR scheduler settings ---
    # NOTE(review): not read by any code visible in this file; presumably
    # used by the (absent) training loop - confirm.
    "weight_decay": 1e-4,
    "use_scheduler": True,
    "scheduler_patience": 25,
    "scheduler_factor": 0.3,
    "min_lr": 1e-7,
    # File main() loads the model state dict from.
    "model_save_path": "best_house_price_model.pth",
    # CSV file main() writes the predictions to.
    "submission_file_name": "submission_final.csv"
}

# ==============================================
# DATA LOADING
# ==============================================
def load_data(train_path="Data/train.csv", test_path="Data/test.csv"):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A ``(train_df, test_df)`` tuple with the two loaded frames.

    Raises:
        SystemExit: If either file does not exist.
        ValueError: If either loaded frame is empty.
    """
    try:
        # Train is read first, so a missing train file aborts before the
        # test file is touched.
        loaded = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    except FileNotFoundError:
        print(f"FATAL: {train_path} or {test_path} not found.")
        raise SystemExit("Exiting due to missing data files.")

    train_df, test_df = loaded
    print(f"Original train shape: {train_df.shape}")
    print(f"Original test shape: {test_df.shape}")

    if any(frame.empty for frame in loaded):
        raise ValueError("Loaded train_df or test_df is empty!")
    return train_df, test_df

# ==============================================
# DATA PREPROCESSING AND FEATURE ENGINEERING
# ==============================================
# Area columns that are created as 0 when absent and have NaNs replaced by 0.
_AREA_COLS = ('GrLivArea', 'TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2',
              'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch')

# Bathroom-count columns, treated the same way as the area columns.
_BATH_COLS = ('FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath')

# Categorical columns whose missing values are filled with the string 'None'.
_FILL_WITH_NONE_COLS = (
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MasVnrType', 'MiscFeature',
)

_QUALITY_SCALE = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
_BSMT_FIN_SCALE = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

# Column -> {category label: integer code}. Labels missing from a mapping
# end up as 0 (see _encode_features).
_ORDINAL_MAPPINGS = {
    'ExterQual': _QUALITY_SCALE, 'ExterCond': _QUALITY_SCALE,
    'BsmtQual': _QUALITY_SCALE, 'BsmtCond': _QUALITY_SCALE,
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': _BSMT_FIN_SCALE,
    'BsmtFinType2': _BSMT_FIN_SCALE,
    'HeatingQC': _QUALITY_SCALE, 'KitchenQual': _QUALITY_SCALE,
    'FireplaceQu': _QUALITY_SCALE,
    'GarageQual': _QUALITY_SCALE, 'GarageCond': _QUALITY_SCALE,
    'PoolQC': {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'LotShape': {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3},
    'LandSlope': {'Sev': 0, 'Mod': 1, 'Gtl': 2},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2}, 'Street': {'Grvl': 0, 'Pave': 1},
    'Alley': {'None': 0, 'Grvl': 1, 'Pave': 2}, 'CentralAir': {'N': 0, 'Y': 1},
    'Utilities': {'ELO': 0, 'NoSeWa': 1, 'NoSewr': 2, 'AllPub': 3},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4,
                   'Min2': 5, 'Min1': 6, 'Typ': 7},
}

# (new column, quality column, condition column): new = quality * condition.
_GRADE_PRODUCTS = (
    ('OverallGrade', 'OverallQual', 'OverallCond'),
    ('GarageGrade', 'GarageQual', 'GarageCond'),
    ('ExterGrade', 'ExterQual', 'ExterCond'),
)


def _fill_or_create_zero(df, columns):
    """Create each missing column as 0 and replace its NaNs with 0 (in place)."""
    for col in columns:
        if col not in df.columns:
            df[col] = 0
        df[col] = df[col].fillna(0)


def _has_positive(df, column):
    """Return a 0/1 Series flagging rows where ``column`` > 0 (all 0 if absent)."""
    values = df.get(column, pd.Series(0, index=df.index))
    return (values.fillna(0) > 0).astype(int)


def _add_engineered_features(df):
    """Append the derived age, area, bath, flag and ratio columns to ``df`` in place."""
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    # When YearRemodAdd == YearBuilt this already equals HouseAge, so no
    # separate overwrite for never-remodeled houses is needed.
    df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    df['IsRemodeled'] = (df['YearRemodAdd'] != df['YearBuilt']).astype(int)

    _fill_or_create_zero(df, _AREA_COLS)
    df['TotalSF'] = df['GrLivArea'] + df['TotalBsmtSF']
    df['TotalFinishedSF'] = df['GrLivArea'] + df['BsmtFinSF1'] + df['BsmtFinSF2']
    df['TotalPorchSF'] = (df['OpenPorchSF'] + df['EnclosedPorch']
                          + df['3SsnPorch'] + df['ScreenPorch'])

    _fill_or_create_zero(df, _BATH_COLS)
    df['TotalBath'] = (df['FullBath'] + 0.5 * df['HalfBath']
                       + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath'])

    df['HasPool'] = _has_positive(df, 'PoolArea')
    df['HasFireplace'] = _has_positive(df, 'Fireplaces')
    df['HasGarage'] = _has_positive(df, 'GarageArea')

    # The 1e-6 term keeps the division defined when the denominator is 0.
    df['BsmtFinToTotalBsmt_Ratio'] = (df['BsmtFinSF1'] + df['BsmtFinSF2']) / (df['TotalBsmtSF'] + 1e-6)
    df['LotArea_x_GrLivArea_Ratio'] = df['GrLivArea'] / (df['LotArea'] + 1e-6)


def _impute_missing(df):
    """Fill missing values in ``df`` and return the NaN-free frame."""
    if 'LotFrontage' in df.columns:
        if 'Neighborhood' in df.columns:
            # Per-neighborhood median first; the overall median below covers
            # whatever is still missing.
            df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
                lambda group: group.fillna(group.median()))
        df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

    for col in _FILL_WITH_NONE_COLS:
        if col in df.columns:
            df[col] = df[col].fillna('None')

    numeric_na_counts = df.select_dtypes(include=np.number).isnull().sum()
    for col in numeric_na_counts[numeric_na_counts > 0].index:
        df[col] = df[col].fillna(df[col].median())

    object_na_counts = df.select_dtypes(include='object').isnull().sum()
    for col in object_na_counts[object_na_counts > 0].index:
        modes = df[col].mode()
        # An all-NaN column has no mode; leave it for the zero-fill below.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    remaining = df.isnull().sum().sum()
    if remaining > 0:
        print(f"Warning: NaNs still present after imputation ({remaining} total). Filling with 0.")
        df = df.fillna(0)
    return df


def _encode_features(df):
    """Ordinal-encode, add grade products and one-hot encode; return the new frame."""
    for col, mapping in _ORDINAL_MAPPINGS.items():
        if col in df.columns:
            # Labels that are not in the mapping become NaN and then 0.
            df[col] = df[col].map(mapping).fillna(0)

    for grade_col, qual_col, cond_col in _GRADE_PRODUCTS:
        if qual_col in df.columns and cond_col in df.columns:
            df[grade_col] = df[qual_col] * df[cond_col]

    # MSSubClass is cast to string so that it is one-hot encoded below.
    if 'MSSubClass' in df.columns:
        df['MSSubClass'] = df['MSSubClass'].astype(str)

    categorical_cols = [col for col in df.columns if df[col].dtype == 'object']
    # astype(str) may yield a StringDtype column rather than object; make sure
    # MSSubClass is encoded in that case too.
    if ('MSSubClass' in df.columns and 'MSSubClass' not in categorical_cols
            and isinstance(df['MSSubClass'].dtype, pd.StringDtype)):
        categorical_cols.append('MSSubClass')

    if categorical_cols:
        df = pd.get_dummies(df, columns=categorical_cols, dummy_na=False, dtype=int)
    return df


def preprocess_data(train_df, test_df, target_transform=True):
    """Build model-ready feature frames from the raw train and test data.

    Train and test rows are stacked so that feature engineering, imputation
    statistics and one-hot columns are computed over both sets together, then
    split back by row count. The input frames are not modified.

    Args:
        train_df: Raw training frame. Must contain 'Id', 'YrSold',
            'YearBuilt', 'YearRemodAdd' and 'LotArea'; 'SalePrice' is optional.
        test_df: Raw test frame with the same required columns.
        target_transform: If True, 'SalePrice' is replaced by
            ``log1p(SalePrice)`` in the returned training frame.

    Returns:
        A ``(train_processed, test_processed, test_ids)`` tuple:
        ``train_processed`` holds the features plus 'SalePrice' as the last
        column (when a target is available), ``test_processed`` holds the
        same feature columns, and ``test_ids`` is the 'Id' column of
        ``test_df``.

    Raises:
        KeyError: If a required column is missing.
    """
    test_ids = test_df['Id']
    split_point = len(train_df)

    # concat returns a new frame, so the callers' frames are left untouched.
    all_df = pd.concat((train_df, test_df), ignore_index=True)
    _add_engineered_features(all_df)

    y_train_full = None
    if 'SalePrice' in all_df.columns:
        y_train_full = all_df['SalePrice'].iloc[:split_point].copy()
        if target_transform:
            y_train_full = np.log1p(y_train_full)
    # errors='ignore' allows the target to be absent from both inputs.
    all_df = all_df.drop(columns=['Id', 'SalePrice'], errors='ignore')

    print(f"Combined shape after FE, before other processing: {all_df.shape}")

    all_df = _impute_missing(all_df)
    all_df = _encode_features(all_df)

    # Independent copies, so later column assignments by the caller do not
    # write into a slice of the combined frame.
    train_processed = all_df.iloc[:split_point].copy()
    test_processed = all_df.iloc[split_point:].copy()

    print(f"Final processed train shape: {train_processed.shape}")
    print(f"Final processed test shape: {test_processed.shape}")

    if y_train_full is not None:
        train_processed['SalePrice'] = y_train_full.to_numpy()

    return train_processed, test_processed, test_ids

# ==============================================
# NEURAL NETWORK MODEL
# ==============================================
class HousePriceModel(nn.Module):
    """Feed-forward network with a single output value.

    For every width in ``hidden_layers`` the network stacks
    ``Linear -> BatchNorm1d -> ReLU`` (followed by ``Dropout`` when
    ``use_dropout`` is true); a final ``Linear`` layer maps to one output.
    The whole stack is stored as ``self.net``.
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate=0.2, use_dropout=True):
        """Build the layer stack.

        Args:
            input_dim: Number of input features.
            hidden_layers: Iterable of hidden-layer widths, in order.
            dropout_rate: Probability given to each ``nn.Dropout``.
            use_dropout: Whether to add a dropout layer after each ReLU.
        """
        super().__init__()
        widths = [input_dim, *hidden_layers]
        stack = []
        # Walk consecutive (in, out) width pairs; module creation order is
        # Linear, BatchNorm1d, ReLU[, Dropout] for each pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.BatchNorm1d(fan_out), nn.ReLU()]
            if use_dropout:
                stack.append(nn.Dropout(dropout_rate))
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the output of the layer stack for input ``x``."""
        return self.net(x)

# ==============================================
# Training and metric functions
# ==============================================
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs, patience, device, model_save_path, scheduler=None):
    """Placeholder for the training loop; currently does nothing.

    The body is a bare ``pass``: none of the arguments are used, nothing is
    trained or saved, and the function returns ``None``.

    NOTE(review): main() as visible in this file never calls this function;
    it loads an already-saved state dict instead. Judging by the parameter
    names, a real implementation would presumably train with early stopping
    and write the best weights to ``model_save_path`` - confirm against the
    original training script before relying on it.
    """
    
    pass

def calculate_metrics(y_true, y_pred, context=""):
    """Compute MSE, RMSE, R^2 and RMSLE for two arrays of values.

    Both inputs are flattened, and positions where either value is NaN or
    infinite are dropped before any metric is computed.

    Args:
        y_true: Array of reference values (must support ``.flatten()``).
        y_pred: Array of predicted values (must support ``.flatten()``).
        context: Unused.

    Returns:
        A ``(mse, rmse, r2, rmsle)`` tuple. All four entries are NaN when the
        inputs are empty, differ in length, or contain no finite pairs.
        ``r2`` for a (near-)constant ``y_true`` is 0.0 if the predictions are
        also (near-)constant with the same mean and NaN otherwise.
        ``rmsle`` is NaN when ``y_true`` contains a negative value; negative
        predictions are treated as 0 for that metric only.
    """
    nothing = (np.nan, np.nan, np.nan, np.nan)

    truth, preds = y_true.flatten(), y_pred.flatten()
    if len(truth) == 0 or len(preds) == 0 or len(truth) != len(preds):
        return nothing

    # Keep only the positions where both values are finite.
    finite_pairs = np.isfinite(truth) & np.isfinite(preds)
    truth, preds = truth[finite_pairs], preds[finite_pairs]
    if len(truth) == 0:
        return nothing

    if np.var(truth) < 1e-9:
        # (Near-)constant target: handled by the explicit rule from the
        # docstring instead of calling r2_score.
        same_constant = (np.var(preds) < 1e-9
                         and np.abs(np.mean(truth) - np.mean(preds)) < 1e-9)
        r2 = 0.0 if same_constant else np.nan
    else:
        try:
            r2 = r2_score(truth, preds)
        except ValueError:
            r2 = np.nan

    mse = mean_squared_error(truth, preds)
    rmse = np.sqrt(mse)

    # RMSLE works on a clamped copy so the other metrics see the raw values.
    clamped_preds = preds.copy()
    clamped_preds[clamped_preds < 0] = 0

    rmsle = np.nan
    if not np.any(truth < 0):
        try:
            log_truth, log_preds = np.log1p(truth), np.log1p(clamped_preds)
            if np.all(np.isfinite(log_truth)) and np.all(np.isfinite(log_preds)):
                rmsle = np.sqrt(mean_squared_error(log_truth, log_preds))
        except Exception:
            pass
    return mse, rmse, r2, rmsle

# ==============================================
# MAIN EXECUTION BLOCK
# ==============================================
def main():
    """Run the prediction-only pipeline and write the submission CSV.

    Steps: seed the RNGs, load and preprocess the data, fit a StandardScaler
    on the training part of a train/validation split, load the saved model
    weights from ``config["model_save_path"]``, predict on the test set and
    write ``Id``/``SalePrice`` rows to ``config["submission_file_name"]``.

    No training happens here. If the model file is missing, an error message
    is printed and the function returns without writing anything.

    Raises:
        ValueError: If the processed training frame has no 'SalePrice'
            column and ``config["target_transform"]`` is False.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    seed = config["random_state"]
    np.random.seed(seed)
    torch.manual_seed(seed)
    if device.type == 'cuda':
        torch.cuda.manual_seed_all(seed)

    train_df_orig, test_df_orig = load_data()
    # Copies are passed so the originals stay available unchanged below.
    train_df, test_df, test_ids = preprocess_data(
        train_df_orig.copy(), test_df_orig.copy(), config["target_transform"]
    )

    if 'SalePrice' in train_df.columns:
        X = train_df.drop('SalePrice', axis=1)
        y = train_df['SalePrice']
    elif config["target_transform"]:
        y = np.log1p(train_df_orig['SalePrice'])
        X = train_df
    else:
        raise ValueError("'SalePrice' missing and target_transform is False.")

    non_numeric_cols = X.select_dtypes(exclude=np.number).columns
    if len(non_numeric_cols) > 0:
        print(f"Warning: Non-numeric columns in X before scaling: {list(non_numeric_cols)}. Converting.")
        for col in non_numeric_cols:
            # Plain column assignment replaces the column together with its
            # dtype; `.loc[:, col] = ...` may write into the existing
            # object-dtype column and leave it non-numeric.
            X[col] = pd.to_numeric(X[col], errors='coerce').fillna(0)

    # Give the test frame exactly the training columns, in the same order:
    # missing columns are added as 0 and extra columns are dropped.
    test_df = test_df.reindex(columns=X.columns, fill_value=0)
    for col in non_numeric_cols:
        # Apply the same coercion as for X so the scaler sees numeric data.
        test_df[col] = pd.to_numeric(test_df[col], errors='coerce').fillna(0)

    # Only the training part of the split is needed: the scaler is fitted on
    # it and then applied to the test features.
    X_train, _, _, _ = train_test_split(
        X, y, test_size=config["validation_size"], random_state=config["random_state"]
    )
    scaler = StandardScaler().fit(X_train)
    X_test_scaled = scaler.transform(test_df)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

    # =========================================================================
    # MAIN CHANGE: LOAD THE MODEL INSTEAD OF TRAINING
    # =========================================================================
    print("\n--- Tryb Predykcji: Pomijanie treningu, wczytywanie modelu z pliku ---")

    input_dim = X_test_tensor.shape[1]
    model = HousePriceModel(
        input_dim,
        config["hidden_layers"],
        config["dropout_rate"],
        config["use_dropout"],
    ).to(device)

    model_path = config["model_save_path"]
    try:
        # SECURITY: torch.load unpickles the file. weights_only=True restricts
        # loading to tensors and plain containers, which is all a state dict
        # needs, so a tampered checkpoint cannot execute arbitrary code.
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
        print(f"Pomyślnie wczytano model z pliku: {model_path}")
    except FileNotFoundError:
        print(f"BŁĄD KRYTYCZNY: Nie znaleziono pliku modelu '{model_path}'.")
        print("Nie można kontynuować. Uruchom skrypt w trybie treningu, aby zapisać model.")
        return

    model.eval()

    print("\nGenerowanie predykcji dla zbioru testowego...")
    with torch.no_grad():
        raw_predictions = model(X_test_tensor.to(device)).cpu().numpy()

    # Bound the raw outputs before the optional expm1 below
    # (expm1(25) is roughly 7.2e10).
    clip_min, clip_max = -20, 25
    clipped_predictions = np.clip(raw_predictions, clip_min, clip_max).flatten()

    if config["target_transform"]:
        final_test_predictions = np.expm1(clipped_predictions)
    else:
        final_test_predictions = clipped_predictions

    # Negative predictions are replaced by 0.
    final_test_predictions[final_test_predictions < 0] = 0

    submission_df = pd.DataFrame({'Id': test_ids, 'SalePrice': final_test_predictions})
    submission_df.to_csv(config["submission_file_name"], index=False)
    print(f"Zapisano przewidywania ({len(final_test_predictions)} wierszy) do pliku {config['submission_file_name']}")
    print("\nZakończono.")


if __name__ == "__main__":
    main()



Using device: cpu
Original train shape: (1460, 81)
Original test shape: (1459, 80)
Combined shape after FE, before other processing: (2919, 91)
Final processed train shape: (1460, 251)
Final processed test shape: (1459, 251)

--- Tryb Predykcji: Pomijanie treningu, wczytywanie modelu z pliku ---
Pomyślnie wczytano model z pliku: best_house_price_model.pth

Generowanie predykcji dla zbioru testowego...
Zapisano przewidywania (1459 wierszy) do pliku submission_final.csv

Zakończono.
