In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# 定义自动编码器模型
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# 定义表格编码器
class TableEncoder(nn.Module):
    def __init__(self, d_table):
        super(TableEncoder, self).__init__()
        self.autoencoder = Autoencoder(input_dim=6, encoding_dim=d_table)

    def forward(self, X):
        encoded, _ = self.autoencoder(X)
        return encoded

# 定义文本编码器
class TextEncoder(nn.Module):
    def __init__(self, d_t, vocab_size):
        super(TextEncoder, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, d_t)

    def forward(self, T, offsets):
        return self.embedding(T, offsets)

# 定义CLIP模型，集成表格编码器和文本编码器
class CLIPTableText(nn.Module):
    def __init__(self, d_table, d_t, d_e, temperature, vocab_size):
        super(CLIPTableText, self).__init__()
        self.table_encoder = TableEncoder(d_table)
        self.text_encoder = TextEncoder(d_t, vocab_size)
        self.W_table = nn.Parameter(torch.randn(d_table, d_e))
        self.W_text = nn.Parameter(torch.randn(d_t, d_e))
        self.temperature = temperature

    def forward(self, X, T):
        X_f = self.table_encoder(X)
        T_f = self.text_encoder(T, torch.arange(0, T.size(0), dtype=torch.long))
        X_e = nn.functional.normalize(torch.matmul(X_f, self.W_table), dim=1)
        T_e = nn.functional.normalize(torch.matmul(T_f, self.W_text), dim=1)
        logits = torch.matmul(X_e, T_e.t()) * torch.exp(torch.tensor(self.temperature))

        labels_x = torch.arange(logits.size(0), dtype=torch.long)
        labels_t = torch.arange(logits.size(1), dtype=torch.long)

        loss_x = nn.functional.cross_entropy(logits, labels_x, reduction='mean')
        loss_t = nn.functional.cross_entropy(logits.t(), labels_t, reduction='mean')  # 在计算损失之前转置logits

        loss = (loss_x + loss_t) / 2.0

        return loss

# 读取数据
data = pd.read_csv('./result.csv')
table_data = data.iloc[:, 1:-1]
text_data = data.iloc[:, -1]

# 将Dataframe转化为numpy
table_data_np = table_data.values
text_data_np = text_data.values

# 将标签转换为数值格式
labels = text_data.unique()
indexed_labels = torch.tensor(text_data.map({label: i for i, label in enumerate(labels)}), dtype=torch.long)

# 实例化CLIP模型，用于表格和文本
d_table = 16  # 表格特征维度
d_t = 16  # 文本特征维度
d_e = 8   # 联合嵌入维度
temperature = 0.5
vocab_size = len(labels)  # 假设词汇表大小为唯一标签的数量

X = torch.tensor(table_data_np, dtype=torch.float32)
T = torch.tensor(indexed_labels, dtype=torch.long)

clip_model = CLIPTableText(d_table, d_t, d_e, temperature, vocab_size)

# 定义优化器和训练循环
optimizer = optim.Adam(clip_model.parameters(), lr=0.001) 

num_epochs = 1000
for epoch in range(num_epochs):
    # 前向传播 得到loss
    loss = clip_model(X, T)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item()}')

# 保存预训练模型
torch.save(clip_model.state_dict(), 'clip_table_text_pretrained.pth')


  T = torch.tensor(indexed_labels, dtype=torch.long)


Epoch 0/1000, Loss: 6.357712268829346
Epoch 100/1000, Loss: 6.2946624755859375
Epoch 200/1000, Loss: 6.290989875793457
Epoch 300/1000, Loss: 6.290102958679199
Epoch 400/1000, Loss: 6.289830207824707
Epoch 500/1000, Loss: 6.289745330810547
Epoch 600/1000, Loss: 6.289721965789795
Epoch 700/1000, Loss: 6.289715766906738
Epoch 800/1000, Loss: 6.289715766906738
Epoch 900/1000, Loss: 6.289714813232422


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer

# Define Autoencoder and TableEncoder as before

# Define BERT-based TextEncoder
class TextEncoder(nn.Module):
    def __init__(self, bert_model_name, d_t):
        super(TextEncoder, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.linear = nn.Linear(self.bert.config.hidden_size, d_t)

    def forward(self, T):
        input_ids = self.tokenizer(T, return_tensors='pt', padding=True, truncation=True)['input_ids']
        outputs = self.bert(input_ids)
        embeddings = outputs['last_hidden_state']  # Get embeddings for each token
        mean_pooled = torch.mean(embeddings, dim=1)  # Mean-pooling to get a single embedding for the entire sequence
        return self.linear(mean_pooled)

# Define CLIPTableText as before

# Read data
data = pd.read_csv('./result.csv')
table_data = data.iloc[:, 1:-1]
text_data = data.iloc[:, -1]

# Convert DataFrame to numpy
table_data_np = table_data.values
text_data_np = text_data.values

# Convert labels to numerical format
labels = text_data.unique()
indexed_labels = torch.tensor(text_data.map({label: i for i, label in enumerate(labels)}), dtype=torch.long)

# Instantiate CLIP model for table and text with BERT-based TextEncoder
d_table = 16
d_t = 768  # Assuming BERT base model hidden size
d_e = 8
temperature = 0.5
vocab_size = len(labels)

X = torch.tensor(table_data_np, dtype=torch.float32)
T = torch.tensor(indexed_labels, dtype=torch.long)

bert_model_name = 'bert-base-uncased'  # You can choose a different pre-trained BERT model

clip_model = CLIPTableText(d_table, d_t, d_e, temperature, vocab_size, text_encoder=TextEncoder(bert_model_name, d_t))

# Define optimizer and training loop
optimizer = optim.Adam(clip_model.parameters(), lr=0.001)

num_epochs = 1000
for epoch in range(num_epochs):
    # Forward pass
    loss = clip_model(X, T)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item()}')

# Save the pre-trained model
torch.save(clip_model.state_dict(), 'clip_table_text_pretrained.pth')


  T = torch.tensor(indexed_labels, dtype=torch.long)


OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory bert-base-uncased.

In [None]:
# 加载已经训练好的模型
loaded_model = ContrastiveModel(table_input_size=table_data.shape[1],
                                text_embedding_dim=50,
                                hidden_size=64)
loaded_model.load_state_dict(torch.load('contrastive_model.pth'))
loaded_model.eval()

# 将数据转换为PyTorch张量并移动到CUDA设备上（如果可用）
table_data_tensor = torch.tensor(table_data_np, dtype=torch.float32).to(torch.device("cuda"))
text_data_tensor = torch.tensor(text_data_encoded, dtype=torch.long).to(torch.device("cuda"))

# 使用模型进行推断
with torch.no_grad():
    table_embeddings, text_embeddings = loaded_model(table_data_tensor, text_data_tensor)

# 现在，table_embeddings 和 text_embeddings 包含了对应数据的嵌入表示
