# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib


# =====================
# 1. Dataset
# =====================
class CSVDataset(Dataset):
    """Tabular regression dataset built from a DataFrame with a target column.

    Every column except ``target_col`` is used as a feature. Features and
    target are passed through their scalers and stored as ``float32`` tensors
    in ``self.X`` (one row per sample) and ``self.y`` (one column).

    Args:
        df: DataFrame containing the feature columns and the target column.
            All values must be convertible to ``float32``.
        scaler_X: Already-fitted feature scaler (any object exposing
            ``transform``). Used only when ``fit_scaler`` is False.
        scaler_y: Already-fitted target scaler (any object exposing
            ``transform``). Used only when ``fit_scaler`` is False.
        fit_scaler: When True, fit fresh ``StandardScaler`` instances on this
            data and ignore ``scaler_X`` / ``scaler_y``.
        target_col: Name of the target column. Defaults to ``'price'``.

    Raises:
        ValueError: If ``fit_scaler`` is False and a scaler is missing.
    """

    def __init__(self, df, scaler_X=None, scaler_y=None, fit_scaler=False,
                 target_col='price'):
        # Fail fast with a clear message instead of an AttributeError on
        # ``None.transform`` further down.
        if not fit_scaler and (scaler_X is None or scaler_y is None):
            raise ValueError(
                "scaler_X and scaler_y are required when fit_scaler is False"
            )

        X = df.drop(target_col, axis=1).values.astype('float32')
        # The scalers expect 2-D input, hence the (n_samples, 1) reshape.
        y = df[target_col].values.reshape(-1, 1).astype('float32')

        if fit_scaler:
            self.scaler_X = StandardScaler().fit(X)
            self.scaler_y = StandardScaler().fit(y)
        else:
            self.scaler_X = scaler_X
            self.scaler_y = scaler_y

        X = self.scaler_X.transform(X)
        y = self.scaler_y.transform(y)

        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` tensor pair at ``idx``."""
        return self.X[idx], self.y[idx]


# =====================
# 2. Model
# =====================
class RegressionNet(nn.Module):
    """Small fully connected network mapping a feature vector to one value.

    Layout: ``input_dim -> 64 -> 32 -> 1`` with a ReLU after each hidden
    layer and no activation on the output.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_widths = (64, 32)

        layers = []
        in_features = input_dim
        for width in hidden_widths:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, 1))

        # The attribute must stay named ``net`` so saved state_dict keys
        # (``net.0.weight`` ...) remain loadable.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the prediction."""
        prediction = self.net(x)
        return prediction


# =====================
# 3. Train function
# =====================
def train_model(csv_file, epochs=100, batch_size=16, lr=1e-3):
    """Train a ``RegressionNet`` on a CSV file and save the model and scalers.

    The data is split 80/20 into train and test sets with a fixed seed. The
    scalers are fitted on the training split only and reused for the test
    split. After training, the model weights are written to
    ``regression_model.pt`` and the scalers to ``scaler_X.pkl`` and
    ``scaler_y.pkl`` (paths relative to the working directory).

    Reported losses are per-sample mean squared errors in *scaled* target
    units, because ``CSVDataset`` standardises the target.

    Args:
        csv_file: Path to a CSV containing the feature columns and ``price``.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for both loaders.
        lr: Learning rate for the Adam optimiser.

    Returns:
        The trained ``RegressionNet`` (left in eval mode).
    """
    # Load the data and hold out 20% for evaluation.
    df = pd.read_csv(csv_file)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Datasets: fit scalers on train only to avoid leaking test statistics.
    train_dataset = CSVDataset(train_df, fit_scaler=True)
    test_dataset = CSVDataset(
        test_df,
        scaler_X=train_dataset.scaler_X,
        scaler_y=train_dataset.scaler_y
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Model, loss, optimizer.
    model = RegressionNet(input_dim=train_dataset.X.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop.
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            # MSELoss returns the batch mean; weight it by the batch size so
            # a smaller final batch does not skew the epoch average.
            total_loss += loss.item() * X_batch.size(0)
        avg_loss = total_loss / len(train_dataset)
        print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {avg_loss:.4f}")

    # Evaluation: sample-weighted so the result is the true MSE over the
    # whole test split rather than a mean of per-batch means.
    model.eval()
    with torch.no_grad():
        squared_error_sum = 0.0
        for X_batch, y_batch in test_loader:
            preds = model(X_batch)
            squared_error_sum += criterion(preds, y_batch).item() * X_batch.size(0)
        mse = squared_error_sum / len(test_dataset)
    print(f"✅ Test MSE: {mse:.4f}")

    # Persist the model weights and both scalers for later inference.
    torch.save(model.state_dict(), "regression_model.pt")
    joblib.dump(train_dataset.scaler_X, "scaler_X.pkl")
    joblib.dump(train_dataset.scaler_y, "scaler_y.pkl")
    print("💾 Model và scaler đã được lưu!")

    return model


# =====================
# 4. Prediction (Inference)
# =====================
def predict_new(csv_input=None, new_data=None):
    """Predict prices for new samples using the saved model and scalers.

    Loads ``scaler_X.pkl``, ``scaler_y.pkl`` and ``regression_model.pt`` from
    the working directory, scales the input features, runs the network and
    converts the output back to the original target scale. The result is
    printed and written to ``predicted_output.csv``.

    Feature values are passed to the scaler positionally, so the input
    columns must be in the same order as the feature columns used for
    training and must not include the target column.

    Args:
        csv_input: Path to a CSV of feature rows. Takes precedence over
            ``new_data`` when truthy.
        new_data: Anything ``pd.DataFrame`` accepts (e.g. a list of dicts).

    Returns:
        The input DataFrame with an added ``'Predicted Price'`` column.

    Raises:
        ValueError: If neither ``csv_input`` nor ``new_data`` is provided.
    """
    if not csv_input and new_data is None:
        raise ValueError("Either csv_input or new_data must be provided.")

    # NOTE: joblib.load and torch.load unpickle their input; only load
    # artifacts that come from a trusted source.
    scaler_X = joblib.load("scaler_X.pkl")
    scaler_y = joblib.load("scaler_y.pkl")

    # The fitted scaler stores one mean per feature, which gives the
    # network's input width.
    input_dim = len(scaler_X.mean_)
    model = RegressionNet(input_dim)
    # map_location keeps loading independent of the device used for saving.
    model.load_state_dict(torch.load("regression_model.pt", map_location="cpu"))
    model.eval()

    if csv_input:
        new_df = pd.read_csv(csv_input)
    else:
        # new_data is a list or dict of feature values.
        new_df = pd.DataFrame(new_data)

    X_new = scaler_X.transform(new_df.values.astype('float32'))
    X_tensor = torch.tensor(X_new, dtype=torch.float32)

    with torch.no_grad():
        y_pred_scaled = model(X_tensor)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.numpy())

    # The model output is (n_samples, 1); flatten it to a 1-D column.
    new_df['Predicted Price'] = y_pred.ravel()
    print("📈 Dự đoán:")
    print(new_df)
    new_df.to_csv("predicted_output.csv", index=False)
    print("💾 Kết quả lưu tại predicted_output.csv")

    return new_df


# =====================
# 5. Example run
# =====================
if __name__ == "__main__":
    # Step 1: fit the network on the CSV and write the model/scaler files.
    train_model("house_prices.csv", epochs=100, batch_size=16, lr=1e-3)

    # Step 2: score one hand-written sample with the artifacts just saved.
    # NOTE(review): presumably these keys match the CSV's feature columns
    # in the same order -- confirm against house_prices.csv.
    sample_house = {"area": 1800, "bedrooms": 3, "bathrooms": 2}
    predict_new(new_data=[sample_house])
