In [None]:
## import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import random
import time
import numpy as np
import math
import jieba
import numpy as np
from sklearn.metrics import f1_score
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
# Mini-batch size shared by the training and test iterators.
BATCH_SIZE = 32
# Dataset to run on; one of: CR | MPQA | MR | SUBJ | SST.
DATA_SET = 'CR'

# Corpus files live under ../DataSet/<name>/; the pre-filtered word vectors for
# the same dataset live under ../sub_word_vector/.
train_path = f"../DataSet/{DATA_SET}/train.tsv"
test_path = f"../DataSet/{DATA_SET}/test.tsv"
word_vector_path = f"../sub_word_vector/{DATA_SET}_word_vector.txt"

print('Hello')

## 加载词向量

In [None]:
import paddle
word_dim = 100  # dimensionality of every word vector in the file

# Build the word -> index table and the matching list of embedding rows from the
# filtered word-vector file.
# Both special tokens deliberately share index 1 (lookup_table and the padding
# code also use 1), so row 0 of the matrix is an unused all-zero row and real
# words start at index 2.
word_to_index = {'<unknown>': 1, '<padded>': 1}
zero_ls = [0.0] * word_dim
# Rows 0 and 1 are all-zero vectors; the second is a copy so the two rows do not
# alias the same list object.
ls = [zero_ls, list(zero_ls)]
index_to_word = {}
with open(word_vector_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line looks like "the -0.038194 -0.24487 0.72812 ...":
        # the token followed by the components of its vector.
        word_vector = line.split()
        if not word_vector:
            # A blank line carries no token; skipping it avoids an IndexError.
            continue
        # Use the row the vector is about to occupy as the word's index, so the
        # table and the matrix stay aligned even when a line was skipped.
        word_to_index[word_vector[0]] = len(ls)
        ls.append([float(component) for component in word_vector[1:]])
for key, value in word_to_index.items():
    # Index 1 is shared by both special tokens, so the later key wins here.
    index_to_word[value] = key

word_vector_weight_matrix = paddle.to_tensor(ls)  # embedding rows as a Tensor
# The vocabulary size is the number of embedding rows. Deriving it from
# len(word_to_index) would double-count the special tokens and could be too
# small when the file contains duplicate words.
VOCAB_SIZE = len(ls)
print(word_vector_weight_matrix.shape[0])
print(word_vector_weight_matrix.shape[1])
print(len(word_to_index))


In [None]:
# Scratch check: build a small 2x3 tensor from a nested list and print the size
# of its first axis.
# NOTE(review): this rebinds the global `ls` that the word-vector cell also uses.
ls = [[1, 2, 3], [2, 3, 4]]
ls_to_tensor = paddle.to_tensor(ls)
print(ls_to_tensor.shape[0])

## 构造数据迭代器

In [None]:
def get_sentences(path):
    """Read a tab-separated corpus file and tokenise every sentence.

    The first line of the file is treated as a header and skipped. Every other
    line is expected to be ``<text>\\t<label>``; the text is segmented with
    ``jieba.cut`` and the label is parsed as a float.

    Args:
        path: Path of the corpus file (UTF-8).

    Returns:
        A list of ``(words, length, label)`` tuples, e.g.
        ``[(["haha"], 1, 1.0), (["I", "love", "China"], 3, 0.0)]``.

    Lines that have no label column or whose label is not numeric are printed
    and skipped, so one bad line can neither abort the load nor be recorded
    with the previous line's data.
    """
    sentences = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Skip the "text label" header on the first line.
            if i == 0:
                continue
            fields = line.split('\t')
            try:
                words = list(jieba.cut(fields[0], cut_all=False))
                label = float(fields[1].strip())
            except (IndexError, ValueError):
                # Missing label column or non-numeric label: report and skip.
                print(fields)
                continue
            sentences.append((words, len(words), label))
    return sentences

def lookup_table(array_ls):
    """Replace the words of every sentence with their vocabulary indices.

    Args:
        array_ls: List of ``(words, length, label)`` tuples, where ``words`` is
            a list of tokens.

    Returns:
        A new list of ``(indices, length, label)`` tuples. Words missing from
        the module-level ``word_to_index`` table are mapped to 1, the index
        shared by the unknown and padding tokens.
    """
    encoded_sentences = []
    for sentence in array_ls:
        # sentence[0] holds the tokens; length and label are carried over as-is.
        word_ids = [word_to_index.get(token, 1) for token in sentence[0]]
        encoded_sentences.append((word_ids, sentence[1], sentence[2]))
    return encoded_sentences

def pad_sentence_plus(array_ls, batch_size, pad_idx=1):
    """Pad the sentences of every batch to that batch's longest sentence.

    The input is cut into consecutive groups of ``batch_size`` sentences (the
    last group may be shorter). Within each group, every index list shorter
    than the largest recorded length is extended with ``pad_idx``. The recorded
    length of each sentence is left untouched.

    Example with ``batch_size = 2``:
        ``[([1, 2, 3], 3, 0), ([2, 3], 2, 1)]`` becomes
        ``[([1, 2, 3], 3, 0), ([2, 3, 1], 2, 1)]``.

    Args:
        array_ls: List of ``(indices, length, label)`` tuples. The index lists
            are padded in place.
        batch_size: Number of sentences per batch; must be positive.
        pad_idx: Index appended as padding (defaults to 1).

    Returns:
        The same ``array_ls`` object, now padded. An empty input is returned
        unchanged.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(array_ls), batch_size):
        batch = array_ls[start:start + batch_size]
        # The target length is the longest recorded sentence length in the batch.
        target_len = max(sentence[1] for sentence in batch)
        for sentence in batch:
            shortfall = target_len - len(sentence[0])
            if shortfall > 0:
                sentence[0].extend([pad_idx] * shortfall)
    return array_ls

def iterator(array_ls, batch_size, shuffle=True, batch_first=False):
    """Group padded sentences into batches of tensors.

    Args:
        array_ls: Padded ``(indices, length, label)`` tuples, e.g.
            ``[([1, 2, 3], 3, 1.0), ([2, 3, 1], 2, 0.0), ...]``.
        batch_size: Number of sentences per batch; a shorter final batch is
            kept.
        shuffle: When true, the order of the batches (not of the sentences
            inside a batch) is shuffled with ``random.shuffle``.
        batch_first: When ``False`` the text tensor of each batch is transposed
            to ``(sentence_len, batch_size)``; otherwise it stays
            ``(batch_size, sentence_len)``. The label tensor is the same in
            both cases.

    Returns:
        A list of ``(text_tensor, label_tensor)`` pairs, one per batch.
    """
    batches = []
    texts, labels = [], []
    for position, sentence in enumerate(array_ls, start=1):
        texts.append(sentence[0])
        labels.append(sentence[2])
        if position % batch_size == 0:
            # The batch is full: store it and start collecting the next one.
            batches.append((texts, labels))
            texts, labels = [], []
    # Leftover sentences that did not fill a whole batch.
    if texts:
        batches.append((texts, labels))
    if shuffle:
        random.shuffle(batches)
    # The comparison is kept as `== False` on purpose so that only False-equal
    # values (not e.g. None) select the transposed layout.
    if batch_first == False:
        return [(paddle.to_tensor(batch_texts).t(), paddle.to_tensor(batch_labels))
                for batch_texts, batch_labels in batches]
    return [(paddle.to_tensor(batch_texts), paddle.to_tensor(batch_labels))
            for batch_texts, batch_labels in batches]

def get_iterator(path, batch_size):
    """Run the whole data pipeline for one corpus file.

    Reads and tokenises the file, maps words to indices, pads every batch and
    finally groups the sentences into (text, label) tensor batches using the
    defaults of ``iterator`` (shuffled batches, time-major text tensors).

    Args:
        path: Path of the corpus file.
        batch_size: Number of sentences per batch.

    Returns:
        The list of batches produced by ``iterator``.
    """
    indexed_sentences = lookup_table(get_sentences(path))
    padded_sentences = pad_sentence_plus(indexed_sentences, batch_size)
    return iterator(padded_sentences, batch_size)

print("开始构造迭代器:")
# Build the batch lists once, up front. get_iterator uses iterator's default
# shuffle=True, so the batch order is shuffled a single time here.
train_iterator = get_iterator(train_path, BATCH_SIZE)
test_iterator = get_iterator(test_path, BATCH_SIZE)
# Debug aid: uncomment to inspect the label tensor of every test batch.
# for batch in test_iterator:
#     print(batch[1])
# print(test_iterator)

print("迭代器构造完毕，开始将数据投入到网络当中:")

## 投影测量

In [None]:
class projection(nn.Layer):
    """Combine amplitude and phase into a complex-valued matrix and project it.

    Args:
        Embedding_dim: Size of the feature axis of the amplitude input.
    """

    def __init__(self, Embedding_dim):
        super(projection, self).__init__()
        # Random projector with a real part ([0]) and an imaginary part ([1]),
        # each of shape (Embedding_dim, 1).
        # NOTE(review): this is a plain tensor, not a registered parameter, so it
        # is not returned by parameters() and will not be updated by the
        # optimizer - confirm that a fixed random projector is intended.
        self.projector = paddle.standard_normal((2, Embedding_dim, 1))

    def forward(self, inputs):
        """Compute the projected real and imaginary matrices.

        Args:
            inputs: ``[amplitude, phase]``. ``amplitude`` is used as-is, while
                the first two axes of ``phase`` are swapped before use, so
                ``amplitude`` must already be in that swapped layout
                (presumably batch-major - TODO confirm against the caller).

        Returns:
            ``[Pv_real, Pv_imag]``, each with an extra axis inserted at
            position 1.
        """
        amplitude = inputs[0]
        phase = inputs[1]

        # L2-normalise every feature vector (axis 2) of the amplitude.
        amplitude_norm = F.normalize(amplitude, 2, 2)
        # Bring the phase into the same axis order as the amplitude.
        phase_permute = paddle.transpose(phase, perm=[1, 0, 2])

        # Polar form -> real and imaginary parts.
        real_part = amplitude_norm * paddle.cos(phase_permute)
        imag_part = amplitude_norm * paddle.sin(phase_permute)

        # Column vectors (..., dim, 1) and their row-vector transposes (..., 1, dim).
        real_part_expand = paddle.unsqueeze(real_part, axis=3)
        imag_part_expand = paddle.unsqueeze(imag_part, axis=3)
        real_part_expand_transpose = paddle.transpose(real_part_expand, perm=[0, 1, 3, 2])
        imag_part_expand_transpose = paddle.transpose(imag_part_expand, perm=[0, 1, 3, 2])

        # Outer product of each complex vector with its (unconjugated) transpose.
        v_real = paddle.matmul(real_part_expand, real_part_expand_transpose) - paddle.matmul(imag_part_expand, imag_part_expand_transpose)
        v_imag = paddle.matmul(imag_part_expand, real_part_expand_transpose) + paddle.matmul(real_part_expand, imag_part_expand_transpose)

        # Average the per-position matrices over axis 1. According to the
        # original author's note the result is (batch_size, embedding_dim, embedding_dim).
        v_real_avg = paddle.mean(v_real, axis=1, keepdim=False)
        v_imag_avg = paddle.mean(v_imag, axis=1, keepdim=False)

        p_real = self.projector[0]
        p_imag = self.projector[1]

        # Normalise each projector column, then form the matrices p p^T.
        p_real_norm = p_real / paddle.norm(p_real, axis=0)
        p_imag_norm = p_imag / paddle.norm(p_imag, axis=0)
        p_real_mat = paddle.matmul(p_real_norm, paddle.transpose(p_real_norm, perm=[1, 0]))
        p_imag_mat = paddle.matmul(p_imag_norm, paddle.transpose(p_imag_norm, perm=[1, 0]))

        # Complex matrix product (v_real + i v_imag)(p_real_mat + i p_imag_mat).
        Pv_real = paddle.matmul(v_real_avg, p_real_mat) - paddle.matmul(v_imag_avg, p_imag_mat)
        Pv_imag = paddle.matmul(v_real_avg, p_imag_mat) + paddle.matmul(v_imag_avg, p_real_mat)

        return [paddle.unsqueeze(Pv_real, axis=1), paddle.unsqueeze(Pv_imag, axis=1)]
        

In [None]:
class self_attention(nn.Layer):
    """Self-attention that maps the input to separate query, key and value vectors.

    Args:
        Embedding_dim: Size of the feature axis of the input.
    """

    def __init__(self, Embedding_dim):
        super(self_attention, self).__init__()
        # NOTE(review): these mappings are plain tensors, not registered
        # parameters, so they are not returned by parameters() and will not be
        # updated by the optimizer - confirm that fixed random maps are intended.
        self.mapping_query = paddle.standard_normal((Embedding_dim, Embedding_dim))
        self.mapping_key = paddle.standard_normal((Embedding_dim, Embedding_dim))
        self.mapping_value = paddle.standard_normal((Embedding_dim, Embedding_dim))

    @staticmethod
    def _unit_rows(vectors):
        """Scale every vector along axis 2 to unit length."""
        return vectors / paddle.norm(vectors, axis=2, keepdim=True)

    def forward(self, inputs):
        """Apply attention to a 3-D input whose first two axes are swapped first.

        Args:
            inputs: 3-D tensor; the model passes the time-major GRU output
                ``(sentence_len, batch, dim)``.

        Returns:
            The attended values, with the first two axes swapped relative to
            ``inputs``.
        """
        # Swap the first two axes once instead of once per mapping.
        swapped = paddle.transpose(inputs, perm=[1, 0, 2])
        query = self._unit_rows(paddle.matmul(swapped, self.mapping_query))
        key = self._unit_rows(paddle.matmul(swapped, self.mapping_key))
        value = self._unit_rows(paddle.matmul(swapped, self.mapping_value))

        # Scale by the feature dimension. The previous scale, inputs.shape[1],
        # is the batch size for time-major input, which made the attention
        # weights of a sentence depend on how many sentences share its batch.
        scores = paddle.matmul(query, paddle.transpose(key, perm=[0, 2, 1])) / np.sqrt(key.shape[-1])
        return paddle.matmul(F.softmax(scores, axis=2), value)

## 定义模型

In [None]:
# Parameter attribute for the amplitude embedding: the weight is initialised
# from the pre-trained word-vector matrix and stays trainable.
Pretrained_Attr = paddle.ParamAttr(name='amplitude_embedding',
                                   initializer=paddle.nn.initializer.Assign(word_vector_weight_matrix),
                                   trainable=True)
class CICWEQNN(nn.Layer):
    """Sentence classifier built from amplitude/phase embeddings and a projection.

    Pipeline: amplitude embedding -> GRU -> self-attention; together with a
    one-dimensional phase embedding this feeds the projection layer, whose real
    and imaginary outputs each go through a 3x3 convolution, a sigmoid and a
    max-pool, before two linear layers produce a sigmoid score.

    Args:
        weight_matrix: Pre-trained word-vector matrix ``(vocab, dim)``; its
            shape sizes both embedding tables.
        embedding_dim: Feature size used by the GRU, attention, projection and
            pooling layers; must equal ``weight_matrix.shape[1]``.
        hidden_dim: Unused; kept so existing callers keep working.
        output_dim: Number of output units.
        pad_idx: Index of the padding token in both embedding tables.
        pretrained_attr: ``paddle.ParamAttr`` that initialises the amplitude
            embedding weight.
    """

    def __init__(self, weight_matrix, embedding_dim, hidden_dim, output_dim, pad_idx, pretrained_attr):
        super(CICWEQNN, self).__init__()

        # Size the tables from the matrix that was passed in rather than from
        # module-level globals, so the constructor honours its own arguments.
        vocab_size = weight_matrix.shape[0]
        self.amplitude_embedding = paddle.nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=weight_matrix.shape[1],
                                      padding_idx=pad_idx,
                                      weight_attr=pretrained_attr)
        self.phase_embedding = nn.Embedding(vocab_size, 1, padding_idx=pad_idx)
        # The batches are time-major (sentence_len, batch). Paddle's GRU is
        # batch-major by default, which would run the recurrence across the
        # sentences of a batch instead of across the words of a sentence.
        self.gru = nn.GRU(embedding_dim, embedding_dim, time_major=True)
        self.Conv2dOne = nn.Conv2D(1, 1, 3)
        self.Conv2dTwo = nn.Conv2D(1, 1, 3)

        # A 3x3 convolution without padding shrinks each side to
        # embedding_dim - 2; the pooling window spans that whole height.
        self.MaxPool1 = nn.MaxPool2D((embedding_dim-2, 1), 1)
        self.MaxPool2 = nn.MaxPool2D((embedding_dim-2, 1), 1)
        self.attention = self_attention(embedding_dim)
        self.projection_measurement = projection(embedding_dim)

        # Real and imaginary pooled features are concatenated on the last axis.
        self.fc1 = nn.Linear(2*(embedding_dim-2), 10)
        self.fc2 = nn.Linear(10, output_dim)

    def forward(self, text):
        """Score a batch of sentences.

        Args:
            text: Integer index tensor; the data pipeline supplies it
                time-major, ``(sentence_len, batch)``.

        Returns:
            Sigmoid scores; callers flatten the result to one value per sentence.
        """
        amplitude = self.amplitude_embedding(text)
        phase = self.phase_embedding(text)

        # nn.GRU returns (outputs, final_state); only the outputs are used.
        gru_outputs = self.gru(amplitude)[0]
        attended = self.attention(gru_outputs)

        project = self.projection_measurement([attended, phase])

        Conv_real = self.Conv2dOne(project[0])
        Conv_imag = self.Conv2dTwo(project[1])

        Max_real = self.MaxPool1(F.sigmoid(Conv_real))
        Max_imag = self.MaxPool2(F.sigmoid(Conv_imag))

        fc1 = self.fc1(paddle.concat(x=[Max_real, Max_imag], axis=3))

        return F.sigmoid(self.fc2(F.sigmoid(fc1)))

## 超参数

In [None]:
# The feature size must match the loaded word vectors, so read it from the
# matrix instead of repeating the literal 100.
EMBEDDING_DIM = word_vector_weight_matrix.shape[1]
HIDDEN_DIM = 3   # passed to the model constructor, which does not use it
OUTPUT_DIM = 1   # a single sigmoid output for binary classification

# The padding index comes from the vocabulary table, so it cannot drift from
# the value the data pipeline pads with.
PAD_IDX = word_to_index['<padded>']

In [None]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds

## 模型训练和预测


In [None]:

# dataset与mnist的定义与第一部分内容一致

# 用 DataLoader 实现数据加载
#train_loader = paddle.io.DataLoader(train_dataset, batch_size=64, shuffle=True)

def binary_accuracy(preds, y):
    """Return the fraction of predictions that round to their label.

    Args:
        preds: Tensor of predicted probabilities.
        y: Tensor of 0/1 labels with the same shape as ``preds``; any numeric
            dtype is accepted.

    Returns:
        A tensor holding the mean accuracy over all elements.
    """
    # Round each probability to the nearest integer class.
    rounded_preds = paddle.round(preds)
    # Cast the labels to the prediction dtype so the comparison never mixes
    # dtypes, then turn the boolean matches into floats so that averaging is a
    # true floating-point division.
    matches = paddle.cast(rounded_preds == paddle.cast(y, rounded_preds.dtype), 'float32')
    return matches.mean()
    
model = CICWEQNN(weight_matrix=word_vector_weight_matrix,
                 embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM,
                 output_dim=OUTPUT_DIM,
                 pad_idx=PAD_IDX,
                 pretrained_attr=Pretrained_Attr)

# Number of passes over the training data.
epochs = 50

# Adam with its default settings over the model's parameters.
optim = paddle.optimizer.Adam(parameters=model.parameters())
# Binary cross-entropy on the model's sigmoid outputs.
loss_fn = nn.BCELoss()
def train(model, train_iterator):
    """Train the model for one pass over the batches.

    Uses the module-level ``optim`` optimizer and ``loss_fn`` loss.

    Args:
        model: The network to train.
        train_iterator: List of ``(text_tensor, label_tensor)`` batches.

    Returns:
        ``(mean_loss, mean_accuracy)``, each the unweighted average of the
        per-batch values.

    Raises:
        ValueError: If ``train_iterator`` contains no batches.
    """
    num_batches = len(train_iterator)
    if num_batches == 0:
        raise ValueError("train_iterator contains no batches")
    epoch_loss = 0.0
    epoch_acc = 0.0
    model.train()
    for data in train_iterator:
        x_data = data[0]  # input indices
        y_data = data[1]  # labels

        # Flatten to one score per sentence. Unlike squeeze, this keeps a
        # length-1 axis for a single-sentence batch, so the shape always
        # matches the labels.
        predicts = paddle.reshape(model(x_data), [-1])

        loss = loss_fn(predicts, y_data)
        acc = binary_accuracy(predicts, y_data)

        epoch_loss += loss.numpy().item()
        epoch_acc += acc.numpy().item()

        # Back-propagate, apply the update, then reset the gradients.
        loss.backward()
        optim.step()
        optim.clear_grad()
    return epoch_loss / num_batches, epoch_acc / num_batches
def test(model, test_iterator):
    """Evaluate the model on the given batches without updating it.

    Uses the module-level ``loss_fn`` loss.

    Args:
        model: The network to evaluate.
        test_iterator: List of ``(text_tensor, label_tensor)`` batches.

    Returns:
        ``(mean_loss, mean_accuracy)``, each the unweighted average of the
        per-batch values.

    Raises:
        ValueError: If ``test_iterator`` contains no batches.
    """
    num_batches = len(test_iterator)
    if num_batches == 0:
        raise ValueError("test_iterator contains no batches")
    epoch_loss = 0.0
    epoch_acc = 0.0
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with paddle.no_grad():
        for data in test_iterator:
            x_data = data[0]  # input indices
            y_data = data[1]  # labels
            # Flatten to one score per sentence so a single-sentence batch
            # still matches the label shape.
            predicts = paddle.reshape(model(x_data), [-1])
            loss = loss_fn(predicts, y_data)
            acc = binary_accuracy(predicts, y_data)
            epoch_loss += loss.numpy().item()
            epoch_acc += acc.numpy().item()

    return epoch_loss / num_batches, epoch_acc / num_batches

# Best test accuracy seen so far; starts below any reachable accuracy.
best_acc = -1
for epoch in range(epochs):
    # Time one full train + evaluate cycle.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator)
    test_loss, test_acc = test(model, test_iterator)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Keep the running maximum of the test accuracy.
    best_acc = max(best_acc, test_acc)

    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {round(train_loss,3)} | Train Acc: {round(train_acc * 100, 2)}% ')
    print(f'\t test. Loss: {round(test_loss,3)} |  test. Acc: {round(test_acc * 100, 2)}% |  best_acc: {round(best_acc * 100, 2)}%')

In [None]:
# Scratch check: print a 3x2 float32 tensor filled with zeros.
print(paddle.zeros(shape=[3, 2], dtype='float32'))

In [None]:
# Leftover reference lines written against the torch API (presumably from an
# earlier PyTorch version of the projection layer):
#   v_real_avg = torch.mean(v_real, dim=1, keepdim=False)
#   v_imag_avg = torch.mean(v_imag, dim=1, keepdim=False)

# Scratch check: swap the last two axes of a 2x2x3 tensor and print the result.
key = paddle.to_tensor([[[1, 2, 3], [4, 5, 6]],[[7, 8, 9], [10, 11, 12]]])
key_trans = paddle.transpose(key, perm=[0, 2, 1])
print(key_trans)


In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that round to their label.

    NOTE(review): this cell redefines the function of the same name from the
    training cell; running it replaces that definition in the kernel.

    Args:
        preds: Tensor of predicted probabilities.
        y: Tensor of 0/1 labels with the same shape as ``preds``; any numeric
            dtype is accepted.

    Returns:
        A tensor holding the mean accuracy over all elements.
    """
    # Round each probability to the nearest integer class.
    rounded_preds = paddle.round(preds)
    # Cast the labels to the prediction dtype so the comparison never mixes
    # dtypes, then turn the boolean matches into floats so that averaging is a
    # true floating-point division.
    matches = paddle.cast(rounded_preds == paddle.cast(y, rounded_preds.dtype), 'float32')
    acc = matches.mean()
    return acc
# Scratch check: four predictions against integer labels. Only the second
# prediction (0.2 vs label 1) rounds to the wrong class.
a = paddle.to_tensor([0.6, 0.2, 0.2, 0.1])
b = paddle.to_tensor([1, 1, 0, 0])
binary_accuracy(a, b)


In [None]:
import paddle

def binary_accuracy(preds, y):
    """Return the fraction of predictions that round to their label.

    This debugging variant also prints the rounded predictions.
    NOTE(review): this cell redefines the function of the same name from the
    training cell; running it replaces that definition in the kernel.

    Args:
        preds: Tensor of predicted probabilities.
        y: Tensor of 0/1 labels with the same shape as ``preds``; any numeric
            dtype is accepted.

    Returns:
        A tensor holding the mean accuracy over all elements.
    """
    # Round each probability to the nearest integer class.
    rounded_preds = paddle.round(preds)
    print(rounded_preds)
    # Cast the labels to the prediction dtype so the comparison never mixes
    # dtypes, then turn the boolean matches into floats so that averaging is a
    # true floating-point division.
    matches = paddle.cast(rounded_preds == paddle.cast(y, rounded_preds.dtype), 'float32')
    return matches.mean()
# Scratch check: sixteen predictions that all sit just above 0.5, compared
# against a mix of 0 and 1 labels.
a = paddle.to_tensor(
       [0.50367379, 0.50367212, 0.50367010, 0.50367481, 0.50366598, 0.50366402,
        0.50366652, 0.50366569, 0.50366539, 0.50366944, 0.50366831, 0.50366378,
        0.50366819, 0.50366312, 0.50366414, 0.50366485])
b = paddle.to_tensor(
       [0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1.])

print(binary_accuracy(a, b))

In [None]:
# Rewrite the integer labels at the end of every line of a split file as
# float literals ("0" -> "0.0", anything else -> "1.0") into a new file.
dataset = 'weibo_senti_100k'
yourSet = 'dev.tsv'
yourSetTmp = 'devTmp.tsv'
with open('../DataSet/'+ dataset +'/'+ yourSetTmp, 'w', encoding='utf-8') as f1:
    with open('../DataSet/'+ dataset +'/'+ yourSet, 'r', encoding='utf-8') as f2:
        for i, line in enumerate(f2):
            if i == 0:
                # The header line is copied unchanged.
                f1.write(line)
                continue
            # Strip the newline before looking at the label, so a final line
            # without a trailing newline is handled like every other line
            # instead of losing a character and getting the wrong label.
            record = line.rstrip('\n')
            if not record:
                # A blank line has no label to convert; copy it through.
                f1.write(line)
                continue
            label = '0.0' if record[-1] == '0' else '1.0'
            f1.write(record[:-1] + label + '\n')