In [3]:
def pnn1(inputs, embed_size, hidden_dim, keep_prob):
    num_inputs = len(inputs)
    num_pairs = int(num_inputs * (num_inputs - 1) / 2)
    xw = torch.cat(inputs, 1)
    xw3d = xw.view(-1, num_inputs, embed_size)
    row = []
    col = []
    for i in range(num_inputs - 1):
        for j in range(i + 1, num_inputs):
            row.append(i)
            col.append(j)
    p = xw3d[:, row, :].transpose(1, 0)
    q = xw3d[:, col, :].transpose(1, 0)
    p = p.contiguous().view(-1, num_pairs, embed_size)
    q = q.contiguous().view(-1, num_pairs, embed_size)
    ip = (p * q).sum(-1).view(-1, num_pairs)
    l = torch.cat([xw, ip], 1)
    h = nn.Linear(l.size(1), hidden_dim)(l)
    h = nn.ReLU()(h)
    h = nn.Dropout(p=1 - keep_prob)(h)
    p = nn.Linear(hidden_dim, 1)(h).view(-1)
    return h, p

In [9]:
import os
import torch
import numpy as np
import math
from scipy import sparse
from torch.nn import functional as F


import torch
import torch.nn as nn




# 加载数据
data_folder = 'assist09'
con_sym = ';'

pro_skill_coo = sparse.load_npz(os.path.join(data_folder, 'pro_skill_sparse.npz'))
skill_skill_coo = sparse.load_npz(os.path.join(data_folder, 'skill_skill_sparse.npz'))
pro_pro_coo = sparse.load_npz(os.path.join(data_folder, 'pro_pro_sparse.npz'))

pro_num, skill_num = pro_skill_coo.shape
print(f'问题数目{pro_num}, 技能数目{skill_num}')

pro_skill = pro_skill_coo.toarray()
pro_pro = pro_pro_coo.toarray()
skill_skill = skill_skill_coo.toarray()
pro_skill_tensor = torch.from_numpy(pro_skill)
skill_skill_tensor = torch.from_numpy(skill_skill)
pro_pro_tensor = torch.from_numpy(pro_pro)

pro_feat = sparse.load_npz(os.path.join(data_folder, 'pro_feat.npz')).toarray()
pro_feat_tensor = torch.from_numpy(pro_feat)
print('问题特征形状:', pro_feat.shape)

diff_feat_dim = pro_feat.shape[1] - 1
embed_dim = 64
hidden_dim = 64
dropout = 0.5
lr = 0.001
batch_size = 256
epochs = 200

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# 定义模型
class PEGB(torch.nn.Module):
    def __init__(self, pro_num, skill_num, diff_feat_dim, embed_dim):
        super(PEGB, self).__init__()

        self.pro_embeddings = torch.nn.Embedding(pro_num, embed_dim)
        self.skill_embeddings = torch.nn.Embedding(skill_num, embed_dim)
        self.diff_embeddings = torch.nn.Linear(diff_feat_dim, embed_dim)
        
        self.final_embedding = torch.zeros(pro_num, embed_dim)
    
    def final_pro_embedding(self):
        batch_pro = torch.arange(pro_num).long().to(device)
        batch_pro_skill = torch.Tensor(pro_skill).to(device)
        batch_pro_pro = torch.Tensor(pro_pro).to(device)
        batch_diff_feat = torch.Tensor(pro_feat[:,:-1]).to(device)
        batch_skill_skill = torch.arange(skill_num).to(device)
        pro_embed = self.pro_embeddings(batch_pro)
        skill_embed = self.skill_embeddings(batch_skill_skill)
        diff_feat_embed = self.diff_embeddings(batch_diff_feat)
        print(pro_embed.shape, skill_embed.shape, diff_feat_embed.shape)
        skill_embed = batch_pro_skill @ skill_embed / batch_pro_skill.sum(1, keepdim=True)
        h,p= pnn1([pro_embed, skill_embed, diff_feat_embed],embed_dim, hidden_dim, 0.5)
        return h



    def forward(self, pro, diff_feat, pro_skill, pro_pro, skill_skill):
        pro_embed = self.pro_embeddings(pro)
        skill_embed = self.skill_embeddings(skill_skill)
        diff_feat_embed = self.diff_embeddings(diff_feat)

        # pro-skill
        pro_skill_logits = (pro_embed @ skill_embed.t()).view(-1)
        pro_skill_loss = F.binary_cross_entropy_with_logits(pro_skill_logits, pro_skill.view(-1))

        # pro-pro
        pro_pro_logits = (pro_embed @ pro_embed.t()).view(-1)
        # print(pro_pro_logits.shape,pro_pro.shape)
        pro_pro_loss = F.binary_cross_entropy_with_logits(pro_pro_logits, pro_pro.contiguous().view(-1))

        # skill-skill
        skill_skill_logits = (skill_embed @ skill_embed.t()).view(-1)
        skill_skill_loss = F.binary_cross_entropy_with_logits(skill_skill_logits, skill_skill_tensor.view(-1))

        # 特征融合
        skill_embed = pro_skill @ skill_embed / pro_skill.sum(1, keepdim=True)
#         print(pro_embed.shape,skill_embed.shape,diff_feat_embed.shape)
        h,p= pnn1([pro_embed, skill_embed, diff_feat_embed],embed_dim, hidden_dim, 1.0)
#         h,p = self.pnn([pro_embed, skill_embed, diff_feat_embed])
        # pro_final_embed = None
#         print(p.shape,pro_feat_tensor.shape)
        self.final_embedding[pro] = h
        mse = ((p-diff_feat[:,-1])**2).mean()

        return pro_skill_loss, pro_pro_loss, skill_skill_loss,mse,h


model = PEGB(pro_num, skill_num, diff_feat_dim, embed_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)




问题数目17751, 技能数目123
问题特征形状: (17751, 7)


In [10]:
def final_pro_embedding(self):
    batch_pro = torch.arange(pro_num).long().to(device)
    batch_pro_skill = torch.Tensor(pro_skill).to(device)
    batch_pro_pro = torch.Tensor(pro_pro).to(device)
    batch_diff_feat = torch.Tensor(pro_feat[:,:-1]).to(device)
    batch_skill_skill = torch.arange(skill_num).to(device)
    pro_embed = self.pro_embeddings(batch_pro)
    skill_embed = self.skill_embeddings(batch_skill_skill)
    diff_feat_embed = self.diff_embeddings(batch_diff_feat)
    print(pro_embed.shape, skill_embed.shape, diff_feat_embed.shape)
    skill_embed = batch_pro_skill @ skill_embed / batch_pro_skill.sum(1, keepdim=True)
    h,p= pnn1([pro_embed, skill_embed, diff_feat_embed],embed_dim, hidden_dim, 1.0)
    return h

In [11]:
final_pro_embedding(model)[0]

torch.Size([17751, 64]) torch.Size([123, 64]) torch.Size([17751, 64])


tensor([0.0000, 0.0398, 0.0000, 0.4969, 0.6608, 0.0000, 0.0000, 0.0338, 0.0000,
        0.1085, 0.0000, 0.0000, 0.1481, 0.2564, 0.1821, 0.0000, 0.1928, 0.0000,
        0.5993, 0.0000, 0.0000, 0.6060, 0.0000, 0.0000, 0.2857, 0.0000, 0.0000,
        0.0019, 0.0000, 0.2350, 0.0000, 0.0000, 0.0000, 0.3801, 0.0000, 0.6563,
        0.0807, 0.4795, 0.0000, 0.5401, 0.6200, 0.0000, 0.1211, 1.2164, 0.0000,
        0.0000, 0.3931, 0.7050, 0.0790, 0.0000, 0.3559, 0.3488, 0.5475, 0.6672,
        0.4781, 0.0800, 0.0000, 0.0710, 0.0000, 0.0000, 0.3134, 0.0000, 0.0000,
        0.1495], grad_fn=<SelectBackward0>)

In [12]:
for epoch in range(1000):
    model.train()
    train_loss = 0

    for i in range(0, pro_num, batch_size):
        batch_pro = torch.arange(i, min(i + batch_size, pro_num)).long().to(device)
        batch_pro_skill = torch.Tensor(pro_skill[i:i + batch_size]).to(device)
        batch_pro_pro = torch.Tensor(pro_pro[i:i + batch_size,i:i + batch_size]).to(device)
        batch_diff_feat = torch.Tensor(pro_feat[i:i + batch_size, :-1]).to(device)
        batch_skill_skill = torch.arange(skill_num).to(device)

        pro_skill_loss, pro_pro_loss, skill_skill_loss, mse,h = model(batch_pro,
                                                                  batch_diff_feat,
                                                                  batch_pro_skill,
                                                                  batch_pro_pro,
                                                                  batch_skill_skill)
        loss = pro_skill_loss + pro_pro_loss + skill_skill_loss + mse

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= math.ceil(pro_num / batch_size)
    print(f'Epoch {epoch}, Loss {train_loss:.4f}')

Epoch 0, Loss 9.4805
Epoch 1, Loss 8.9006
Epoch 2, Loss 8.4040
Epoch 3, Loss 7.9765
Epoch 4, Loss 7.5813
Epoch 5, Loss 7.2292
Epoch 6, Loss 6.9107
Epoch 7, Loss 6.6242
Epoch 8, Loss 6.3530
Epoch 9, Loss 6.1120
Epoch 10, Loss 5.8798
Epoch 11, Loss 5.6716
Epoch 12, Loss 5.4690
Epoch 13, Loss 5.2847
Epoch 14, Loss 5.1116
Epoch 15, Loss 4.9459
Epoch 16, Loss 4.7953
Epoch 17, Loss 4.6460
Epoch 18, Loss 4.5092
Epoch 19, Loss 4.3758
Epoch 20, Loss 4.2489
Epoch 21, Loss 4.1232
Epoch 22, Loss 4.0065
Epoch 23, Loss 3.8964
Epoch 24, Loss 3.7821
Epoch 25, Loss 3.6770
Epoch 26, Loss 3.5752
Epoch 27, Loss 3.4722
Epoch 28, Loss 3.3750
Epoch 29, Loss 3.2859
Epoch 30, Loss 3.1959
Epoch 31, Loss 3.1062
Epoch 32, Loss 3.0202
Epoch 33, Loss 2.9380
Epoch 34, Loss 2.8558
Epoch 35, Loss 2.7833
Epoch 36, Loss 2.7122
Epoch 37, Loss 2.6422
Epoch 38, Loss 2.5709
Epoch 39, Loss 2.5053
Epoch 40, Loss 2.4512
Epoch 41, Loss 2.3878
Epoch 42, Loss 2.3311
Epoch 43, Loss 2.2819
Epoch 44, Loss 2.2340
Epoch 45, Loss 2.184

Epoch 361, Loss 0.8357
Epoch 362, Loss 0.8232
Epoch 363, Loss 0.8196
Epoch 364, Loss 0.8259
Epoch 365, Loss 0.8366
Epoch 366, Loss 0.8241
Epoch 367, Loss 0.8215
Epoch 368, Loss 0.8240
Epoch 369, Loss 0.8340
Epoch 370, Loss 0.8226
Epoch 371, Loss 0.8249
Epoch 372, Loss 0.8338
Epoch 373, Loss 0.8166
Epoch 374, Loss 0.8198
Epoch 375, Loss 0.8233
Epoch 376, Loss 0.8202
Epoch 377, Loss 0.8110
Epoch 378, Loss 0.8293
Epoch 379, Loss 0.8267
Epoch 380, Loss 0.8257
Epoch 381, Loss 0.8097
Epoch 382, Loss 0.8169
Epoch 383, Loss 0.8102
Epoch 384, Loss 0.8142
Epoch 385, Loss 0.8084
Epoch 386, Loss 0.8079
Epoch 387, Loss 0.8182
Epoch 388, Loss 0.8097
Epoch 389, Loss 0.8232
Epoch 390, Loss 0.8070
Epoch 391, Loss 0.8119
Epoch 392, Loss 0.8052
Epoch 393, Loss 0.8083
Epoch 394, Loss 0.8175
Epoch 395, Loss 0.8069
Epoch 396, Loss 0.7962
Epoch 397, Loss 0.8149
Epoch 398, Loss 0.8066
Epoch 399, Loss 0.8053
Epoch 400, Loss 0.8086
Epoch 401, Loss 0.8016
Epoch 402, Loss 0.8028
Epoch 403, Loss 0.8044
Epoch 404, 

Epoch 718, Loss 0.6758
Epoch 719, Loss 0.6800
Epoch 720, Loss 0.6979
Epoch 721, Loss 0.6844
Epoch 722, Loss 0.6811
Epoch 723, Loss 0.6766
Epoch 724, Loss 0.6784
Epoch 725, Loss 0.6687
Epoch 726, Loss 0.6770
Epoch 727, Loss 0.6892
Epoch 728, Loss 0.6777
Epoch 729, Loss 0.6787
Epoch 730, Loss 0.6675
Epoch 731, Loss 0.6840
Epoch 732, Loss 0.6911
Epoch 733, Loss 0.6873
Epoch 734, Loss 0.6883
Epoch 735, Loss 0.6980
Epoch 736, Loss 0.6914
Epoch 737, Loss 0.6817
Epoch 738, Loss 0.6653
Epoch 739, Loss 0.6780
Epoch 740, Loss 0.6804
Epoch 741, Loss 0.6759
Epoch 742, Loss 0.6882
Epoch 743, Loss 0.6860
Epoch 744, Loss 0.6787
Epoch 745, Loss 0.6811
Epoch 746, Loss 0.6839
Epoch 747, Loss 0.6797
Epoch 748, Loss 0.6917
Epoch 749, Loss 0.6908
Epoch 750, Loss 0.6818
Epoch 751, Loss 0.6812
Epoch 752, Loss 0.6742
Epoch 753, Loss 0.6608
Epoch 754, Loss 0.6828
Epoch 755, Loss 0.6656
Epoch 756, Loss 0.6783
Epoch 757, Loss 0.6614
Epoch 758, Loss 0.6854
Epoch 759, Loss 0.6851
Epoch 760, Loss 0.6751
Epoch 761, 

In [13]:
model.pro_embeddings.weight.shape, model.skill_embeddings.weight.shape, model.diff_embeddings.weight.shape

(torch.Size([17751, 64]), torch.Size([123, 64]), torch.Size([64, 6]))

In [14]:
x=model.final_pro_embedding()

torch.Size([17751, 64]) torch.Size([123, 64]) torch.Size([17751, 64])


In [None]:
model.pro_embeddings.weight.shape

In [None]:
# 保存训练好的embedding
model.eval()
with torch.no_grad():
    pro_embeddings = model.pro_embeddings.weight.cpu().numpy()
    skill_embeddings = model.skill_embeddings.weight.cpu().numpy()

    batch_pro = torch.arange(pro_num).long()
    batch_pro_skill = torch.Tensor(pro_skill)
    batch_diff_feat = torch.Tensor(pro_feat[:, :-1])
    batch_skill_skill = torch.arange(skill_num).to(device)

# 处理技能embedding
with open(os.path.join(data_folder, 'skill_id_dict.txt'), 'r') as f:
    skill_id_dict = eval(f.read())

joint_skill_num = len(skill_id_dict)
skill_embeddings_new = np.zeros((joint_skill_num, skill_embeddings.shape[1]))
skill_embeddings_new[:skill_num] = skill_embeddings

for s in skill_id_dict:
    if con_sym in s:
        tmp_skill_id = skill_id_dict[s]
        tmp_skills = [skill_id_dict[t] for t in s.split(con_sym)]
        skill_embeddings_new[tmp_skill_id] = np.mean(skill_embeddings[tmp_skills], axis=0)

In [None]:
pro_embeddings.shape

In [None]:
skill_embeddings.shape

In [None]:
pro_skill_tensor

In [None]:
torch.sigmoid(model.pro_embeddings.weight @ model.skill_embeddings.weight.t())

In [None]:
pro_skill_tensor

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# 准备数据
# data_tensor = model.pro_embeddings.weight
data_tensor = model.pro_embeddings.weight

# 初始化t-SNE模型
tsne = TSNE(n_components=2, perplexity=30, learning_rate=100, n_iter=1000)

# 执行t-SNE降维
embedded_data = tsne.fit_transform(data_tensor.detach().numpy())

# 可视化结果
# plt.scatter(embedded_data[:, 0], embedded_data[:, 1])
for label in range(skill_num):
    pids = pro_skill_tensor[:,label].nonzero().view(-1).detach().numpy()
#     print(pids)
    red =  np.random.randint(0, 256)
    green = np.random.randint(0, 256)
    blue = np.random.randint(0, 256)
    color = (red / 255, green / 255, blue / 255)
    class_data = embedded_data[pids]
    plt.scatter(class_data[:, 0], class_data[:, 1],c=color, label=str(label))
plt.show()

In [None]:
pro_skill_tensor

In [None]:
torch.sigmoid(model.pro_embeddings.weight @ model.skill_embeddings.weight.t())

In [None]:
pro_skill_tensor

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# 准备数据
# data_tensor = model.pro_embeddings.weight
data_tensor = model.pro_embeddings.weight

# 初始化t-SNE模型
tsne = TSNE(n_components=2, perplexity=30, learning_rate=100, n_iter=1000)

# 执行t-SNE降维
embedded_data = tsne.fit_transform(data_tensor.detach().numpy())

# 可视化结果
# plt.scatter(embedded_data[:, 0], embedded_data[:, 1])
for label in range(skill_num):
    pids = pro_skill_tensor[:,label].nonzero().view(-1).detach().numpy()
#     print(pids)
    red =  np.random.randint(0, 256)
    green = np.random.randint(0, 256)
    blue = np.random.randint(0, 256)
    color = (red / 255, green / 255, blue / 255)
    class_data = embedded_data[pids]
    plt.scatter(class_data[:, 0], class_data[:, 1],c=color, label=str(label))
plt.show()

In [None]:
import numpy as np

In [None]:
model.pro_embeddings.weight.detach().numpy()

In [None]:
np.save('problem_embedding.npy',model.pro_embeddings.weight.detach().numpy())

In [None]:
np.load('pro_feat.npy')

In [31]:
model.pro_embeddings.weight.detach().numpy()

array([[ 2.2617402 , -3.276963  ,  0.1759356 , ..., -0.5237568 ,
        -0.7671713 ,  0.05471067],
       [ 1.716306  , -3.1644986 ,  0.9045877 , ..., -0.48114157,
        -1.5587529 ,  0.04089681],
       [ 2.093425  , -3.4318895 ,  0.5193685 , ..., -0.49782342,
        -1.0824597 , -0.0611878 ],
       ...,
       [ 3.2362974 , -3.3381224 ,  2.6229885 , ..., -0.07946296,
        -1.4938825 , -0.3286222 ],
       [ 1.7086983 , -1.4899223 ,  2.1529396 , ...,  0.58268046,
        -1.406912  , -0.28759634],
       [ 2.503716  , -2.0899796 ,  2.4648428 , ...,  0.251701  ,
        -0.4089658 , -0.05650741]], dtype=float32)

In [40]:
np.save('problem_embedding.npy',model.pro_embeddings.weight.detach().numpy())

In [41]:
np.load('pro_feat.npy')

array([[ 2.2617402 , -3.276963  ,  0.1759356 , ..., -0.5237568 ,
        -0.7671713 ,  0.05471067],
       [ 1.716306  , -3.1644986 ,  0.9045877 , ..., -0.48114157,
        -1.5587529 ,  0.04089681],
       [ 2.093425  , -3.4318895 ,  0.5193685 , ..., -0.49782342,
        -1.0824597 , -0.0611878 ],
       ...,
       [ 3.2362974 , -3.3381224 ,  2.6229885 , ..., -0.07946296,
        -1.4938825 , -0.3286222 ],
       [ 1.7086983 , -1.4899223 ,  2.1529396 , ...,  0.58268046,
        -1.406912  , -0.28759634],
       [ 2.503716  , -2.0899796 ,  2.4648428 , ...,  0.251701  ,
        -0.4089658 , -0.05650741]], dtype=float32)