In [None]:
import os
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score
from tqdm import tqdm


class AirbnbPreprocessorAndTrainer:
    def __init__(self, csv_path, nrows=1000, image_size=32, batch_size=64, lr=0.001, patience=5, seed=42):
        """Store the run configuration, seed the RNGs and select the device.

        Args:
            csv_path: Path of the listings CSV handed to ``pd.read_csv``.
            nrows: Maximum number of CSV rows to read.
            image_size: Edge length (pixels) images are resized to.
            batch_size: Batch size used for both data loaders.
            lr: Learning rate passed to the Adam optimizer.
            patience: Epochs without validation improvement before
                training stops early.
            seed: Seed for NumPy, PyTorch and the train/test split.
        """
        # Column roles inside the listings table.
        self.image_column = "picture_url"
        self.target_column = "price"
        self.extra_columns = ["id"]

        # Plain run configuration.
        self.csv_path, self.nrows = csv_path, nrows
        self.image_size, self.batch_size = image_size, batch_size
        self.lr, self.patience, self.seed = lr, patience, seed

        # Reproducibility: seed every RNG before anything random happens.
        cuda_ready = torch.cuda.is_available()
        np.random.seed(seed)
        torch.manual_seed(seed)
        if cuda_ready:
            torch.cuda.manual_seed_all(seed)

        self.device = torch.device("cuda" if cuda_ready else "cpu")
        print("Verwende Gerät:", self.device)

        self.scaler = MinMaxScaler()
        self.imputer = SimpleImputer(strategy='median')

        # Everything below is filled in by the later pipeline stages
        # (feature_columns is set in preprocess()).
        self.feature_columns = None
        self.model = None
        self.df = self.images = None
        self.train_dataset = self.test_dataset = None
        self.train_loader = self.test_loader = None

    def preprocess(self):
        df = pd.read_csv(self.csv_path, nrows=self.nrows)
        
        # Dynamische Feature-Auswahl: alle Spalten außer Ziel-, Bild- und ID-Spalte
        self.feature_columns = [col for col in df.columns 
                               if col not in [self.image_column, self.target_column, "id"]]
        
        # print(f"Verwende {len(self.feature_columns)} Features: {self.feature_columns[:10]}...")  # Zeige erste 10
        
        needed = self.feature_columns + [self.image_column, self.target_column] + self.extra_columns
        df = df[[col for col in needed if col in df.columns]].copy()
        
        # Drop rows ohne Preis oder Bild
        df = df.dropna(subset=[self.target_column, self.image_column])
        
        # Preis bereinigen
        df[self.target_column] = df[self.target_column].astype(str).str.replace("[$,]", "", regex=True)
        df[self.target_column] = pd.to_numeric(df[self.target_column], errors="coerce")
        df = df[df[self.target_column].notna()]
        df[self.target_column] = np.log(df[self.target_column])
        
        # Index nach allen dropna-Operationen zurücksetzen
        df = df.reset_index(drop=True)
        
        # Spalten nach Typ kategorisieren
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df.select_dtypes(include=['object', 'bool']).columns.tolist()
        
        # Entferne Ziel-, Bild- und ID-Spalten aus Features
        numeric_cols = [col for col in numeric_cols 
                       if col not in [self.target_column, "id"]]
        categorical_cols = [col for col in categorical_cols 
                           if col not in [self.image_column, "id"]]
        
        print(f"Numerische Spalten ({len(numeric_cols)}): {numeric_cols[:5]}...")
        print(f"Kategorische Spalten ({len(categorical_cols)}): {categorical_cols[:5]}...")
        
        # Kategorische Variablen verarbeiten
        encoded_dfs = []
        for col in categorical_cols:
            if col in df.columns:
                # Boolean-Variablen
                unique_vals = set(df[col].dropna().unique())
                if df[col].dtype == 'bool' or unique_vals.issubset({'t', 'f', True, False}):
                    df[col] = df[col].map({'t': 1, 'f': 0, True: 1, False: 0}).fillna(0).astype(float)
                else:
                    # One-Hot Encoding für andere kategorische Variablen
                    # Nur wenn nicht zu viele unique values
                    if df[col].nunique() < 50:  # Maximal 50 kategorien
                        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
                        encoded = encoder.fit_transform(df[[col]].fillna('missing'))
                        encoded_cols = [f"{col}_{str(cat).replace(' ', '_')}" for cat in encoder.categories_[0]]
                        encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)
                        encoded_dfs.append(encoded_df)
                        df = df.drop(col, axis=1)
                    else:
                        # Zu viele Kategorien - entfernen
                        # print(f"Spalte {col} hat {df[col].nunique()} Kategorien - wird entfernt")
                        df = df.drop(col, axis=1)
        
        # Encoded Features hinzufügen
        if encoded_dfs:
            df = pd.concat([df] + encoded_dfs, axis=1)
        
        # Index nach concat wieder zurücksetzen
        df = df.reset_index(drop=True)
        
        # Numerische Imputation - KORRIGIERTE VERSION
        numeric_cols_existing = [col for col in numeric_cols if col in df.columns]
        
        # Filter für Spalten, die nicht nur NaN sind
        valid_numeric_cols = []
        for col in numeric_cols_existing:
            if df[col].notna().any():  # Hat mindestens einen nicht-NaN Wert
                valid_numeric_cols.append(col)
            else:
                # print(f"Spalte {col} enthält nur fehlende Werte und wird entfernt")
                df = df.drop(col, axis=1)  # Entferne Spalte komplett
        
        if len(valid_numeric_cols) > 0:
            # Erstelle neuen Imputer für den aktuellen DataFrame
            current_imputer = SimpleImputer(strategy='median')
            numeric_data = df[valid_numeric_cols].copy()
            
            if numeric_data.shape[0] > 0:
                # Imputation direkt auf dem aktuellen DataFrame
                imputed_values = current_imputer.fit_transform(numeric_data)
                
                # WICHTIG: Verwende nur die Spalten, die tatsächlich im Output sind
                output_cols = current_imputer.get_feature_names_out(valid_numeric_cols)
                
                # Zuweisung: nur für Spalten, die tatsächlich imputed wurden
                for i, col in enumerate(valid_numeric_cols):
                    if i < imputed_values.shape[1]:  # Sicherheitscheck
                        df[col] = imputed_values[:, i]
        
        # Alle Feature-Spalten skalieren (außer Ziel-, Bild- und ID-Spalten)
        scale_cols = [col for col in df.columns 
                      if col not in [self.image_column, self.target_column, "id"]]
        
        if scale_cols and len(scale_cols) > 0:
            # Erstelle neuen Scaler für den aktuellen DataFrame
            current_scaler = MinMaxScaler()
            scale_data = df[scale_cols].copy()
            
            if scale_data.shape[0] > 0:
                scaled_values = current_scaler.fit_transform(scale_data)
                
                # Zuweisung: gleiche Anzahl Zeilen 
                for i, col in enumerate(scale_cols):
                    if i < scaled_values.shape[1]:  # Sicherheitscheck
                        df[col] = scaled_values[:, i]
        
        # Finale Bereinigung: alle NaN entfernen
        df = df.dropna()
        df = df.reset_index(drop=True)
        
        self.df = df
        print(f"Finale Datenform: {df.shape}")
        print(f"Feature-Spalten: {len(scale_cols)}")

    def process_images(self):
        """Download and resize the listing image of every row in ``self.df``.

        Each URL in ``self.image_column`` is fetched with a 5 second timeout.
        A row is kept only if the request succeeds, the response declares an
        image content type and the payload can be decoded. Failures are
        reported on stdout and the row is skipped (best effort).

        Afterwards ``self.images`` holds one RGB array of
        ``image_size x image_size`` pixels per remaining row and ``self.df``
        contains exactly those rows, in the same order, with a fresh index.
        """
        target_size = (self.image_size, self.image_size)
        urls = self.df[self.image_column].tolist()

        images = []
        kept_positions = []

        # Rows are tracked by position (not by index label) because they are
        # selected with .iloc below; labels only equal positions while the
        # frame carries an untouched RangeIndex.
        for pos, url in enumerate(tqdm(urls, total=len(urls), desc="Bilder verarbeiten")):
            try:
                response = requests.get(url, timeout=5)
                response.raise_for_status()

                content_type = response.headers.get('Content-Type', '')
                if 'image' not in content_type:
                    raise ValueError(f"Kein Bild-Content (Type: {content_type})")

                img = Image.open(BytesIO(response.content)).convert("RGB")
                pixels = np.array(img.resize(target_size))
            except Exception as e:
                # Deliberately broad: one broken URL or corrupt file must not
                # abort the whole download run.
                print(f"Fehler bei Index {pos}: {e}")
            else:
                images.append(pixels)
                kept_positions.append(pos)

        if images:
            self.images = np.array(images)
        else:
            # Keep a well-formed (0, H, W, 3) array even if nothing loaded.
            self.images = np.empty((0, self.image_size, self.image_size, 3), dtype=np.uint8)
        self.df = self.df.iloc[kept_positions].reset_index(drop=True)

    def prepare_tensors(self):
        """Split features, images and targets 80/20 and build the loaders.

        Every column of ``self.df`` except the image, target and ``id``
        columns is used as a tabular feature. The split is seeded with
        ``self.seed``. Sets ``train_dataset``/``test_dataset`` and
        ``train_loader``/``test_loader``; only the training loader shuffles.
        """
        excluded = (self.image_column, self.target_column, "id")
        feature_cols = [name for name in self.df.columns if name not in excluded]

        tabular = self.df[feature_cols].values.astype(np.float32)
        targets = self.df[self.target_column].values.astype(np.float32)

        # One call splits all three arrays with the same row permutation.
        split = train_test_split(
            tabular, self.images, targets,
            test_size=0.2, random_state=self.seed,
        )
        tab_train, tab_test, img_train, img_test, target_train, target_test = split

        self.train_dataset = self.AirbnbDataset(img_train, tab_train, target_train)
        self.test_dataset = self.AirbnbDataset(img_test, tab_test, target_test)

        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False)

    class AirbnbDataset(Dataset):
        """Dataset pairing a listing image with its tabular features and price.

        Each item is ``((image, tab_features), price)`` where ``tab_features``
        is a float32 tensor and ``price`` is a float32 tensor of shape (1,).
        """

        def __init__(self, images, tab_features, prices, transform=None):
            """Keep references to the arrays and choose the image transform.

            If ``transform`` is missing (or falsy) the default pipeline is
            used: ``ToTensor`` followed by a per-channel ``Normalize``.
            """
            if not transform:
                transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
                ])
            self.transform = transform
            self.images = images
            self.tab_features = tab_features
            self.prices = prices

        def __len__(self):
            """Number of samples, taken from the image array."""
            return len(self.images)

        def __getitem__(self, idx):
            """Return ``((image, tab_features), price)`` for sample ``idx``."""
            image = self.images[idx]
            image = self.transform(image) if self.transform else image
            features = torch.tensor(self.tab_features[idx], dtype=torch.float32)
            target = torch.tensor([self.prices[idx]], dtype=torch.float32)
            return (image, features), target

    class MultiInputPricePredictor(nn.Module):
        """Two-branch regressor combining an image CNN with a tabular MLP.

        The image branch produces 64 * 4 * 4 features, the tabular branch 32;
        both are concatenated and mapped to a single output value.

        An adaptive pooling layer fixes the CNN output to 4x4 regardless of
        the input resolution. Without it the regressor's input width was only
        correct for 32x32 images and any other ``image_size`` failed with a
        shape mismatch. For 32x32 inputs the pooling is an identity, and it
        has no parameters, so the parameter names of the other layers (and
        therefore previously saved state dicts) are unaffected.
        """

        def __init__(self, tab_dim):
            """Build the network for ``tab_dim`` tabular input features."""
            super().__init__()
            # Image branch (CNN); spatial sizes in comments are for 32x32 input.
            self.image_branch = nn.Sequential(
                nn.Conv2d(3, 16, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),      # 16x16
                nn.Conv2d(16, 32, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),      # 8x8
                nn.Conv2d(32, 64, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),      # 4x4
                nn.AdaptiveAvgPool2d((4, 4)),  # always 4x4, whatever the input size
                nn.Flatten()
            )

            # Tabular branch (MLP) for the potentially wide feature vector.
            self.tab_branch = nn.Sequential(
                nn.Linear(tab_dim, 128),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(64, 32)
            )

            # Joint regressor on the concatenated image + tabular features.
            self.regressor = nn.Sequential(
                nn.Linear(64 * 4 * 4 + 32, 256),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(256, 128),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(128, 1)
            )

        def forward(self, x):
            """Predict from ``x = (images, tab_features)``; one value per sample."""
            img, tab = x
            img_features = self.image_branch(img)
            tab_features = self.tab_branch(tab)
            combined = torch.cat((img_features, tab_features), dim=1)
            return self.regressor(combined)

    def train_model(self, epochs=50):
        tab_dim = self.train_dataset.tab_features.shape[1]
        model = self.MultiInputPricePredictor(tab_dim).to(self.device)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=self.lr)

        best_val_loss = float('inf')
        patience_counter = 0
        train_losses = []
        val_losses = []

        for epoch in range(epochs):
            # Training
            model.train()
            total_train_loss = 0.0
            for (images, tab_data), prices in self.train_loader:
                images = images.to(self.device)
                tab_data = tab_data.to(self.device)
                prices = prices.to(self.device)

                optimizer.zero_grad()
                outputs = model((images, tab_data))
                loss = criterion(outputs, prices)
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item() * images.size(0)

            epoch_train_loss = total_train_loss / len(self.train_loader.dataset)
            train_losses.append(epoch_train_loss)

            # Validation
            model.eval()
            total_val_loss = 0.0
            with torch.no_grad():
                for (images, tab_data), prices in self.test_loader:
                    images = images.to(self.device)
                    tab_data = tab_data.to(self.device)
                    prices = prices.to(self.device)

                    outputs = model((images, tab_data))
                    loss = criterion(outputs, prices)
                    total_val_loss += loss.item() * images.size(0)

            epoch_val_loss = total_val_loss / len(self.test_loader.dataset)
            val_losses.append(epoch_val_loss)

            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

            # Early Stopping
            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                patience_counter = 0
                torch.save(model.state_dict(), 'best_model.pth')
                print("Neues bestes Modell gespeichert")
            else:
                patience_counter += 1
                if patience_counter >= self.patience:
                    print("Early stopping triggered")
                    break

        self.model = model
        return train_losses, val_losses

    def calculate_rmse(self, loader):
        """
        RMSE auf der Original-Preisskala .
        """
        self.model.eval()
        all_preds, all_targets = [], []
        with torch.no_grad():
            for (images, tab_data), prices in loader:
                images = images.to(self.device)
                tab_data = tab_data.to(self.device)
                prices = prices.to(self.device)

                outputs = self.model((images, tab_data))
                all_preds.extend(outputs.cpu().numpy().flatten())
                all_targets.extend(prices.cpu().numpy().flatten())

        # zurücktransformieren von log -> price
        preds_price = np.exp(np.array(all_preds))
        targets_price = np.exp(np.array(all_targets))

        rmse = np.sqrt(np.mean((preds_price - targets_price) ** 2))
        return rmse


    def regression_metrics_real(self, preds_log, targets_log):
        """Compute regression metrics on the price scale from log-scale inputs.

        Both arguments are exponentiated first. Returns the tuple
        ``(mae, rmse, medae, rme, r2)`` where ``rme`` is the mean signed
        relative error in percent.
        """
        predicted = np.exp(preds_log)
        actual = np.exp(targets_log)
        residual = predicted - actual

        mae = mean_absolute_error(actual, predicted)
        rmse = np.sqrt(np.mean(residual ** 2))
        medae = median_absolute_error(actual, predicted)
        rme = np.mean(residual / actual) * 100  # signed, in percent
        r2 = r2_score(actual, predicted)

        return mae, rmse, medae, rme, r2


    def evaluate_loader(self, loader, model, device):
        """Run ``model`` over ``loader`` and collect outputs and targets.

        The model is put into eval mode and gradients are disabled. Returns
        ``(predictions, targets)`` as flat NumPy arrays in loader order.
        """
        model.eval()
        collected_preds = []
        collected_targets = []
        with torch.no_grad():
            for (img_batch, tab_batch), price_batch in loader:
                batch_out = model((img_batch.to(device), tab_batch.to(device)))
                collected_preds.extend(batch_out.cpu().numpy().ravel())
                collected_targets.extend(price_batch.cpu().numpy().ravel())
        return np.array(collected_preds), np.array(collected_targets)


# ------------------ Zusatz: Tabellenmetriken, Plots & Learning-Curve ------------------ #

def build_results_row(trainer, split_name, loader):
    """Evaluate ``trainer.model`` on ``loader`` and return one results-table row.

    The row is a dict with the model label, the split name and the RMSE,
    R², MAE and MedAE reported by ``trainer.regression_metrics_real``.
    """
    log_preds, log_targets = trainer.evaluate_loader(loader, trainer.model, trainer.device)
    metrics = trainer.regression_metrics_real(log_preds, log_targets)
    mae, rmse, medae, _relative_error, r2 = metrics  # relative error is not reported

    row = {"Modell": "MLP+CNN", "Split": split_name}
    row["RMSE ($)"] = rmse
    row["R² ($)"] = r2
    row["MAE ($)"] = mae
    row["MedAE ($)"] = medae
    return row


def plot_true_vs_pred_price(trainer, loader, title="True vs. Predicted Price – MLP+CNN", max_price=500):
    """Scatter true against predicted prices (both in $) for ``loader``.

    Log-scale model outputs and targets are exponentiated before plotting.
    Both axes are limited to ``[0, max_price]`` and a dashed diagonal marks
    the ideal prediction.
    """
    log_preds, log_targets = trainer.evaluate_loader(loader, trainer.model, trainer.device)
    true_prices = np.exp(log_targets)
    predicted_prices = np.exp(log_preds)
    axis_range = [0, max_price]

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(true_prices, predicted_prices, alpha=0.4)
    ax.plot(axis_range, axis_range, 'r--', label='Ideal')
    ax.set_xlim(0, max_price)
    ax.set_ylim(0, max_price)
    ax.set_xlabel("True Price ($)")
    ax.set_ylabel("Predicted Price ($)")
    ax.set_title(title)
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_learning_curve_pytorch(trainer,
                                train_fracs=np.linspace(0.1, 1.0, 10),
                                epochs_per_frac=6,
                                use_log_rmse=True,
                                shuffle=True):
    """Plot training and validation RMSE against the training-set size.

    For every fraction in ``train_fracs`` a fresh model is trained for
    ``epochs_per_frac`` epochs on the first ``fraction * n`` samples of the
    (optionally shuffled) training set. Its RMSE is then measured on that
    subset and on ``trainer.test_loader``.

    Args:
        trainer: Trainer providing the training dataset, test loader,
            model/dataset classes, device, learning rate, batch size, seed.
        train_fracs: Fractions of the training set to train on.
        epochs_per_frac: Training epochs per fraction.
        use_log_rmse: If True the RMSE is computed on log-prices, otherwise
            on prices obtained with ``exp``.
        shuffle: If True the sample order is permuted (seeded with
            ``trainer.seed``) before the subsets are taken.
    """
    # Full training data from the existing dataset.
    full_ds = trainer.train_dataset
    all_images, all_tabs, all_targets = full_ds.images, full_ds.tab_features, full_ds.prices
    total = len(all_images)

    order = np.arange(total)
    if shuffle:
        np.random.default_rng(trainer.seed).shuffle(order)

    def subset_loader(rows):
        """Shuffling loader over the given sample rows."""
        subset = trainer.AirbnbDataset(all_images[rows], all_tabs[rows], all_targets[rows])
        return DataLoader(subset, batch_size=trainer.batch_size, shuffle=True)

    def loader_rmse(loader, net):
        """RMSE of ``net`` on ``loader`` (log or price scale)."""
        net.eval()
        predicted, actual = [], []
        with torch.no_grad():
            for (img_b, tab_b), target_b in loader:
                batch_pred = net((img_b.to(trainer.device), tab_b.to(trainer.device)))
                batch_pred = batch_pred.cpu().numpy().flatten()
                batch_true = target_b.cpu().numpy().flatten()
                if not use_log_rmse:
                    batch_pred, batch_true = np.exp(batch_pred), np.exp(batch_true)
                predicted.extend(batch_pred)
                actual.extend(batch_true)
        difference = np.array(predicted) - np.array(actual)
        return np.sqrt(np.mean(difference ** 2))

    train_sizes, train_rmse, val_rmse = [], [], []

    for frac in train_fracs:
        subset_size = max(1, int(total * frac))
        frac_loader = subset_loader(order[:subset_size])

        # Fresh model for every training-set size.
        tab_dim = trainer.train_dataset.tab_features.shape[1]
        net = trainer.MultiInputPricePredictor(tab_dim).to(trainer.device)
        loss_fn = nn.MSELoss()
        opt = optim.Adam(net.parameters(), lr=trainer.lr)

        # Short training run for this size.
        for _ in range(epochs_per_frac):
            net.train()
            for (img_b, tab_b), target_b in frac_loader:
                img_b = img_b.to(trainer.device)
                tab_b = tab_b.to(trainer.device)
                target_b = target_b.to(trainer.device)
                opt.zero_grad()
                batch_loss = loss_fn(net((img_b, tab_b)), target_b)
                batch_loss.backward()
                opt.step()

        train_sizes.append(subset_size)
        # Order matters for RNG consumption: training subset first.
        train_rmse.append(loader_rmse(frac_loader, net))
        val_rmse.append(loader_rmse(trainer.test_loader, net))

    # Plot
    scale_tag = '(log)' if use_log_rmse else '($)'
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(train_sizes, train_rmse, 'o-', label=f"Training RMSE {scale_tag}")
    ax.plot(train_sizes, val_rmse, 'o-', label=f"Validation RMSE {scale_tag}")
    ax.set_xlabel("Trainingsgröße")
    ax.set_ylabel(f"RMSE {scale_tag}")
    ax.set_title("Learning Curve – MLP+CNN")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Adjust the path to the listings dump as needed.
    csv_path = "data/listings.csv.gz"

    # Initialise the trainer and run the data pipeline.
    trainer = AirbnbPreprocessorAndTrainer(csv_path, nrows=10000, image_size=32, batch_size=64, lr=0.001, patience=5, seed=42)
    trainer.preprocess()
    trainer.process_images()
    trainer.prepare_tensors()

    # Train.
    train_losses, val_losses = trainer.train_model(epochs=30)

    # Load the best checkpoint written during training.
    trainer.model.load_state_dict(torch.load('best_model.pth', map_location=trainer.device))

    # RMSE on the log-price scale. calculate_rmse() exponentiates before
    # computing the error, so it cannot be used for this figure; the error is
    # computed directly on the raw (log-scale) predictions and targets.
    preds_log, targets_log = trainer.evaluate_loader(trainer.test_loader, trainer.model, trainer.device)
    test_rmse_log = np.sqrt(np.mean((preds_log - targets_log) ** 2))
    print(f"Test RMSE (log): {test_rmse_log:.4f}")

    # RMSE in $ on the original price scale.
    test_rmse_real = trainer.calculate_rmse(trainer.test_loader)
    print(f"Test RMSE (real $): {test_rmse_real:.2f}")

    # Results table (metrics on the $ scale).
    results = []
    results.append(build_results_row(trainer, "Test", trainer.test_loader))

    results_df = pd.DataFrame(results).sort_values(by="RMSE ($)").reset_index(drop=True)
    print("\nErgebnistabelle (MLP+CNN):")
    print(results_df.to_string(index=False))

    # True-vs-predicted plot.
    plot_true_vs_pred_price(trainer, trainer.test_loader, title="True vs. Predicted Price – MLP+CNN", max_price=500)

    # Learning curve for the PyTorch model (log scale).
    plot_learning_curve_pytorch(trainer,
                                train_fracs=np.linspace(0.1, 1.0, 10),
                                epochs_per_frac=6,
                                use_log_rmse=True,
                                shuffle=True)


In [None]:
# Per-epoch training and validation loss of the run above
# (the loss is computed on log-prices).
loss_fig, loss_ax = plt.subplots(figsize=(6, 4))
loss_ax.plot(train_losses, label="Train Loss (log)")
loss_ax.plot(val_losses, label="Val Loss (log)")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training/Validation Loss (log)")
loss_ax.legend()
loss_ax.grid(True)
loss_fig.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Three equivalent ways to load the gzip-compressed listings file. Each one
# reads the complete file again; `df` ends up holding the result of the last.

# Method 1: read directly with pandas (compression inferred from ".gz").
df = pd.read_csv("data/listings.csv.gz")

# Method 2: decompress with the gzip module. In text mode the encoding must be
# given explicitly, otherwise the platform's locale encoding is used, whereas
# pandas decodes paths as UTF-8 in methods 1 and 3.
import gzip
with gzip.open("data/listings.csv.gz", "rt", encoding="utf-8") as f:
    df = pd.read_csv(f)

# Method 3: state the compression explicitly.
df = pd.read_csv("data/listings.csv.gz", compression='gzip')


In [None]:
# Show all column names.
print(df.columns.tolist())

# Show the DataFrame summary. info() writes its report itself and returns
# None, so wrapping it in print() only appended a stray "None" line.
df.info()

# Show the first few rows.
print(df.head())

# Shape of the data.
print(f"Anzahl Zeilen: {df.shape[0]}, Anzahl Spalten: {df.shape[1]}")
