In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd

# 定义自动编码器模型
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# 定义表格编码器
class TableEncoder(nn.Module):
    def __init__(self, d_table):
        super(TableEncoder, self).__init__()
        self.autoencoder = Autoencoder(input_dim=6, encoding_dim=d_table)

    def forward(self, X):
        encoded, _ = self.autoencoder(X)
        return encoded

# 定义文本编码器
class TextEncoder(nn.Module):
    def __init__(self, d_t, vocab_size):
        super(TextEncoder, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, d_t)

    def forward(self, T, offsets):
        return self.embedding(T, offsets)

# 定义CLIP模型，集成表格编码器和文本编码器
class CLIPTableText(nn.Module):
    def __init__(self, d_table, d_t, d_e, temperature, vocab_size):
        super(CLIPTableText, self).__init__()
        self.table_encoder = TableEncoder(d_table)
        self.text_encoder = TextEncoder(d_t, vocab_size)
        self.W_table = nn.Parameter(torch.randn(d_table, d_e))
        self.W_text = nn.Parameter(torch.randn(d_t, d_e))
        self.temperature = temperature

    def forward(self, X, T):
        X_f = self.table_encoder(X)
        T_f = self.text_encoder(T, torch.arange(0, T.size(0), dtype=torch.long))
        X_e = nn.functional.normalize(torch.matmul(X_f, self.W_table), dim=1)
        T_e = nn.functional.normalize(torch.matmul(T_f, self.W_text), dim=1)
        logits = torch.matmul(X_e, T_e.t()) * torch.exp(torch.tensor(self.temperature))

        labels_x = torch.arange(logits.size(0), dtype=torch.long)
        labels_t = torch.arange(logits.size(1), dtype=torch.long)

        loss_x = nn.functional.cross_entropy(logits, labels_x, reduction='mean')
        loss_t = nn.functional.cross_entropy(logits.t(), labels_t, reduction='mean')  # 在计算损失之前转置logits

        loss = (loss_x + loss_t) / 2.0

        return loss

# 读取数据
data = pd.read_csv('./result.csv')
table_data = data.iloc[:, 1:-1]
text_data = data.iloc[:, -1]

# 将Dataframe转化为numpy
table_data_np = table_data.values
text_data_np = text_data.values

# 将标签转换为数值格式
labels = text_data.unique()
indexed_labels = torch.tensor(text_data.map({label: i for i, label in enumerate(labels)}), dtype=torch.long)

# 实例化CLIP模型，用于表格和文本
d_table = 16  # 表格特征维度
d_t = 16  # 文本特征维度
d_e = 8   # 联合嵌入维度
temperature = 0.5
vocab_size = len(labels)  # 假设词汇表大小为唯一标签的数量

X = torch.tensor(table_data_np, dtype=torch.float32)
T = torch.tensor(indexed_labels, dtype=torch.long)

clip_model = CLIPTableText(d_table, d_t, d_e, temperature, vocab_size)

# 定义优化器和训练循环
optimizer = optim.Adam(clip_model.parameters(), lr=0.001) 

num_epochs = 1000
for epoch in range(num_epochs):
    # 前向传播 得到loss
    loss = clip_model(X, T)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item()}')

# 保存预训练模型
torch.save(clip_model.state_dict(), 'clip_table_text_pretrained.pth')


  T = torch.tensor(indexed_labels, dtype=torch.long)


Epoch 0/1000, Loss: 6.34505558013916
Epoch 100/1000, Loss: 6.300232887268066
Epoch 200/1000, Loss: 6.293827056884766
Epoch 300/1000, Loss: 6.2916340827941895
Epoch 400/1000, Loss: 6.290716171264648
Epoch 500/1000, Loss: 6.290273666381836
Epoch 600/1000, Loss: 6.2900390625
Epoch 700/1000, Loss: 6.289906024932861
Epoch 800/1000, Loss: 6.289827346801758
Epoch 900/1000, Loss: 6.2897796630859375


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel

# 定义自动编码器模型
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# 定义表格编码器
class TableEncoder(nn.Module):
    def __init__(self, d_table):
        super(TableEncoder, self).__init__()
        self.autoencoder = Autoencoder(input_dim=6, encoding_dim=d_table)

    def forward(self, X):
        encoded, _ = self.autoencoder(X)
        return encoded


# 定义TextEncoder，包括bert_model参数
class TextEncoder(nn.Module):
    def __init__(self, d_t, vocab_size, bert_model):
        super(TextEncoder, self).__init__()
        self.bert_model = bert_model

    def forward(self, T):
        # 使用BERT模型的输出作为文本嵌入
        with torch.no_grad():
            outputs = self.bert_model(T)
        text_embeddings = outputs.last_hidden_state[:, 0, :]
        return text_embeddings

# 定义CLIP模型，集成表格编码器和文本编码器
class CLIPTableText(nn.Module):
    def __init__(self, d_table, d_t, d_e, temperature, vocab_size, text_encoder):
        super(CLIPTableText, self).__init__()
        self.table_encoder = TableEncoder(d_table)
        self.text_encoder = text_encoder
        self.W_table = nn.Parameter(torch.randn(d_table, d_e))
        self.W_text = nn.Parameter(torch.randn(d_t, d_e))
        self.temperature = temperature

    def forward(self, X, T):
        X_f = self.table_encoder(X)
        T_f = self.text_encoder(T)
        X_e = nn.functional.normalize(torch.matmul(X_f, self.W_table), dim=1)
        T_e = nn.functional.normalize(torch.matmul(T_f, self.W_text), dim=1)
        logits = torch.matmul(X_e, T_e.t()) * torch.exp(torch.tensor(self.temperature))

        labels_x = torch.arange(logits.size(0), dtype=torch.long)
        labels_t = torch.arange(logits.size(1), dtype=torch.long)

        loss_x = nn.functional.cross_entropy(logits, labels_x, reduction='mean')
        loss_t = nn.functional.cross_entropy(logits.t(), labels_t, reduction='mean')  # 在计算损失之前转置logits

        loss = (loss_x + loss_t) / 2.0

        return loss

# 读取数据
data = pd.read_csv('./result.csv')
table_data = data.iloc[:, 1:-1]
text_data = data.iloc[:, -1]

# 将Dataframe转化为numpy
table_data_np = table_data.values
text_data_np = text_data.values

# 加载预训练的BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 对每个文本数据进行编码
encoded_texts = []

for text in text_data_np:
    # 使用分词器将文本转换为标记
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(str(text[:512]))))

    # 将标记转换为模型输入的格式
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoded_texts.append(input_ids)

# 使用pad_sequence进行填充
indexed_labels = pad_sequence([torch.tensor(ids) for ids in encoded_texts], batch_first=True)

# 将标签转换为数值格式
labels = text_data.unique()

# 实例化CLIP模型，用于表格和文本
d_table = 16  # 表格特征维度
d_t = 768  # 文本特征维度
d_e = 8   # 联合嵌入维度
temperature = 0.5
vocab_size = len(labels)  # 假设词汇表大小为唯一标签的数量

X = torch.tensor(table_data_np, dtype=torch.float32)
T = torch.tensor(indexed_labels, dtype=torch.long)

# 创建BERT模型和分词器
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# 创建TextEncoder实例，包括bert_model参数
text_encoder = TextEncoder(d_t, vocab_size, bert_model)

# 在CLIPTableText模型中使用TextEncoder
clip_model = CLIPTableText(d_table, d_t, d_e, temperature, vocab_size, text_encoder)

# 定义优化器和训练循环
optimizer = optim.Adam(clip_model.parameters(), lr=0.001) 

num_epochs = 5
for epoch in range(num_epochs):
    # 前向传播 得到loss
    loss = clip_model(X, T)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 1 == 0:
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item()}')

# 保存预训练模型
torch.save(clip_model.state_dict(), 'clip_table_text_pretrained.pth')


  T = torch.tensor(indexed_labels, dtype=torch.long)


Epoch 0/5, Loss: 6.291183948516846
Epoch 1/5, Loss: 6.291066646575928
Epoch 2/5, Loss: 6.290956974029541
Epoch 3/5, Loss: 6.29085636138916
Epoch 4/5, Loss: 6.290770053863525


In [None]:
# 加载已经训练好的模型
loaded_model = ContrastiveModel(table_input_size=table_data.shape[1],
                                text_embedding_dim=50,
                                hidden_size=64)
loaded_model.load_state_dict(torch.load('contrastive_model.pth'))
loaded_model.eval()

# 将数据转换为PyTorch张量并移动到CUDA设备上（如果可用）
table_data_tensor = torch.tensor(table_data_np, dtype=torch.float32).to(torch.device("cuda"))
text_data_tensor = torch.tensor(text_data_encoded, dtype=torch.long).to(torch.device("cuda"))

# 使用模型进行推断
with torch.no_grad():
    table_embeddings, text_embeddings = loaded_model(table_data_tensor, text_data_tensor)

# 现在，table_embeddings 和 text_embeddings 包含了对应数据的嵌入表示
