In [95]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. 
# This directory will be recovered automatically after resetting environment. 
!ls /home/aistudio/data

In [96]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. 
# All changes under this directory will be kept even after reset. 
# Please clean unnecessary files in time to speed up environment loading. 
!ls /home/aistudio/work

In [98]:
# 同时添加如下代码, 这样每次环境(kernel)启动的时候只要运行下方代码即可: 
# Also add the following code, 
# so that every time the environment (kernel) starts, 
# just run the following code: 
import sys 
import io
import os
import re
import six
import requests
import string
import tarfile
import hashlib
import math
from collections import OrderedDict
import random
import numpy as np
import paddle 
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Embedding 
sys.path.append('/home/aistudio/external-libraries')

In [99]:
def download():
    corpus_url="https://dataset.bj.bcebos.com/imdb%2FaclImdb_v1.tar.gz"
    web_request=requests.get(corpus_url)
    corpus=web_request.content
    with open("./aclImdb_v1.tar.gz","wb") as f:
        f.write(corpus)
    f.close()

download()

In [100]:
def load_imdb(is_training):
    data_set=[]

    for label in ["pos","neg"]:
        with tarfile.open("./aclImdb_v1.tar.gz") as tarf:
            path_pattern="aclImdb/train/"+label+"/.*\.txt$" if is_training \
                else "aclImdb/test/"+label+"/.*\.txt$"
            path_pattern=re.compile(path_pattern)
            tf=tarf.next()
            while tf!=None:
                if bool(path_pattern.match(tf.name)):
                    sentence=tarf.extractfile(tf).read().decode()
                    sentence_label=0 if label=="neg" else 1
                    data_set.append((sentence,sentence_label))
                tf=tarf.next()
    return data_set


train_corpus=load_imdb(True)
test_corpus=load_imdb(False)

for i in range(5):
    print("sentence %d,%s"%(i,train_corpus[i][0]))
    print("sentence %d, label %d"%(i,train_corpus[i][1]))
#二元组 句子加情感

sentence 0,Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn't really understand, and whose naivety is all the more striking in contrast with the natives.<br /><br />But I'd have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. <br /><br />This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.
sentence 0, label 1
sentence 1,Zentropa is the most original movie I've seen in years. If you like unique thrillers that are influenced by film noir, then this is just the right cure for all of those Hollywood summer blockbusters clogging the theaters these da

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 

In [101]:
def data_preprocess(corpus):
    data_set=[]
    for sentence,sentence_label in corpus:
        sentence=sentence.strip().lower()
        sentence=sentence.split(" ")

        data_set.append((sentence,sentence_label))
    
    return data_set

train_corpus=data_preprocess(train_corpus)
test_corpus=data_preprocess(test_corpus)

print(train_corpus[:5])

[(['zentropa', 'has', 'much', 'in', 'common', 'with', 'the', 'third', 'man,', 'another', 'noir-like', 'film', 'set', 'among', 'the', 'rubble', 'of', 'postwar', 'europe.', 'like', 'ttm,', 'there', 'is', 'much', 'inventive', 'camera', 'work.', 'there', 'is', 'an', 'innocent', 'american', 'who', 'gets', 'emotionally', 'involved', 'with', 'a', 'woman', 'he', "doesn't", 'really', 'understand,', 'and', 'whose', 'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast', 'with', 'the', 'natives.<br', '/><br', '/>but', "i'd", 'have', 'to', 'say', 'that', 'the', 'third', 'man', 'has', 'a', 'more', 'well-crafted', 'storyline.', 'zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this', 'respect.', 'perhaps', 'this', 'is', 'intentional:', 'it', 'is', 'presented', 'as', 'a', 'dream/nightmare,', 'and', 'making', 'it', 'too', 'coherent', 'would', 'spoil', 'the', 'effect.', '<br', '/><br', '/>this', 'movie', 'is', 'unrelentingly', 'grim--"noir"', 'in', 'more', 'than', 'one', 'sense;', 'one', 

In [102]:
def build_dict(corpus):
    word_freq_dict=dict()

    for sentence,_ in corpus:
        for word in sentence:
            if word not in word_freq_dict:
                word_freq_dict[word]=0
            word_freq_dict[word]+=1
    
    word_fre_dict=sorted(word_freq_dict.items(),key= lambda x:x[1],reverse=True)

    word2id_dict=dict()
    word2id_freq=dict()

    word2id_dict['[oov]']=0
    word2id_freq[0]=1e10

    word2id_dict["[pad]"]=1
    word2id_freq[1]=1e10

    for word,freq in word_fre_dict:
        word2id_dict[word]=len(word2id_dict)
        word2id_freq[word2id_dict[word]]=freq

    return word2id_freq,word2id_dict

word2id_freq,word2id_dict=build_dict(train_corpus)
vocab_size=len(word2id_freq)
print("there are totally %d different words in this corpus" % vocab_size)
for _,(word,word_id) in zip(range(50),word2id_dict.items()):
    print("word %s,its id %d,its word freq %d" %(word,word_id,word2id_freq[word_id]))

there are totally 252173 different words in this corpus
word [oov],its id 0,its word freq 10000000000
word [pad],its id 1,its word freq 10000000000
word the,its id 2,its word freq 322174
word a,its id 3,its word freq 159949
word and,its id 4,its word freq 158556
word of,its id 5,its word freq 144459
word to,its id 6,its word freq 133965
word is,its id 7,its word freq 104170
word in,its id 8,its word freq 90521
word i,its id 9,its word freq 70477
word this,its id 10,its word freq 69711
word that,its id 11,its word freq 66288
word it,its id 12,its word freq 65490
word /><br,its id 13,its word freq 50935
word was,its id 14,its word freq 47023
word as,its id 15,its word freq 45098
word for,its id 16,its word freq 42840
word with,its id 17,its word freq 42725
word but,its id 18,its word freq 39757
word on,its id 19,its word freq 31618
word movie,its id 20,its word freq 30885
word his,its id 21,its word freq 29058
word are,its id 22,its word freq 28742
word not,its id 23,its word freq 28591


In [103]:
def convert_corpus_to_id(corpus,word2id_dict):
    data_set=[]
    for sentence,sentence_label in corpus:

        sentence=[word2id_dict[word] if word in word2id_dict\
        else word2id_dict['[oov]']for word in sentence]

        data_set.append((sentence,sentence_label))
    return data_set

train_corpus=convert_corpus_to_id(train_corpus,word2id_dict)
test_corpus=convert_corpus_to_id(test_corpus,word2id_dict)
print("%d tokens in the corpus" % len(train_corpus))
print(train_corpus[:5])
print(test_corpus[:5])


25000 tokens in the corpus
[([22216, 41, 76, 8, 1136, 17, 2, 874, 979, 167, 69425, 24, 283, 707, 2, 19881, 5, 16628, 11952, 37, 100421, 52, 7, 76, 5733, 415, 912, 52, 7, 32, 1426, 299, 36, 195, 2299, 644, 17, 3, 282, 27, 141, 61, 7447, 4, 555, 25364, 7, 35, 2, 51, 3590, 8, 2691, 17, 2, 69426, 13, 688, 428, 26, 6, 142, 11, 2, 874, 160, 41, 3, 51, 14841, 4458, 22216, 7, 3, 218, 6262, 8, 10, 6919, 382, 10, 7, 100422, 12, 7, 1394, 15, 3, 100423, 4, 242, 12, 104, 5041, 54, 2368, 2, 4828, 109, 13, 255, 20, 7, 32280, 100424, 8, 51, 68, 30, 29571, 30, 102, 1010, 2, 4142, 18952, 11069, 18, 11636, 4, 12644], 1), ([22216, 7, 2, 78, 225, 20, 190, 119, 8, 1043, 46, 25, 37, 1008, 4578, 11, 22, 4379, 31, 24, 9244, 96, 10, 7, 39, 2, 246, 5601, 16, 35, 5, 136, 385, 1901, 11953, 69427, 2, 3689, 124, 2351, 2666, 17339, 100425, 37, 2732, 2, 6821, 26, 1702, 51, 35630, 18, 10, 7, 61, 21, 116, 912, 12, 7, 7006, 191, 99, 6263, 4, 1485, 2, 439, 2239, 5, 1221, 4, 513, 2598, 44, 104, 97, 27, 761, 32281, 5417, 66

In [104]:
def build_batch(word2id_dict,corpus,batch_size,epoch_num,max_seq_len,shuffle=True):
    sentence_batch=[]
    sentence_label_batch=[]

    for _ in range(epoch_num):
        if shuffle:
            random.shuffle(corpus)
        #pad填充操作
        for sentence,sentence_label in corpus:
            sentence_sample=sentence[:min(max_seq_len,len(sentence))]
            if len(sentence_sample) <max_seq_len:
                for _ in range(max_seq_len- len(sentence_sample)):
                    sentence_sample.append(word2id_dict["[pad]"])
            sentence_sample=[[word_id] for word_id in sentence_sample]

            sentence_batch.append(sentence_sample)
            sentence_label_batch.append([sentence_label])

            if len(sentence_batch)==batch_size:
                yield np.array(sentence_batch).astype("int64"),
                np.array(sentence_label_batch).astype("int64")
                sentence_batch=[]
                sentence_label_batch=[]
        if len(sentence_batch) == batch_size:
            yield np.array(sentence_batch).astype("int64"),
            np.array(sentence_label_batch).astype("int64")
                
for _,batch in zip(range(10),build_batch(word2id_dict,train_corpus,batch_size=64,epoch_num=3,max_seq_len=100)):
    print(batch)


(array([[[  4162],
        [   924],
        [    47],
        ...,
        [    59],
        [ 68693],
        [    16]],

       [[   852],
        [  3034],
        [    70],
        ...,
        [    17],
        [   501],
        [  3795]],

       [[     3],
        [    49],
        [  9124],
        ...,
        [     1],
        [     1],
        [     1]],

       ...,

       [[  2330],
        [  1622],
        [    18],
        ...,
        [   500],
        [    40],
        [   127]],

       [[  1195],
        [   289],
        [  3640],
        ...,
        [     5],
        [   234],
        [   408]],

       [[176006],
        [ 90944],
        [     7],
        ...,
        [   290],
        [    36],
        [    81]]]),)
(array([[[    10],
        [    24],
        [     7],
        ...,
        [    41],
        [     6],
        [    28]],

       [[  4037],
        [     2],
        [   413],
        ...,
        [    78],
        [218428],
        [ 12126]],


In [105]:
import paddle.fluid as fluid

class SimpleLSTMRNN(fluid.Layer):
    def __init__(self,hidden_size,num_steps,num_layers=1,init_scale=0.1,dropout=None):
        super(SimpleLSTMRNN,self).__init__()
        self._hidden_size=hidden_size
        self._num_layers=num_layers
        self._init_scale=init_scale
        self._dropout=dropout
        self._input=None 
        self._num_steps=num_steps
        self.cell_array=[]
        self.hidden_array=[]

        self.weight_1_arr=[]
        self.bias_arr=[]


        for i in range(self._num_layers):
            weight_1=self.create_parameter(
                attr=fluid.ParamAttr(initializer=fluid.initializer.UniformInitializer(low=-self._init_scale,high=self._init_scale)),
                shape=[self._hidden_size*2,self._hidden_size*4],
                dtype="float32",
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-self.__init__scale,high=self._init_scale
                ))

            self.weight_1_arr.append(self.add_parameter("w_%d"%i,weight_1))
            bias_1=self.create_parameter(attr=fluid.ParamAttr(
                initializer=fluid.initializer.UniformInitializer(low=-self._init_scale,high=self._init_scale)),
                shape=[self._hidden_size*4],
                dtype="float32",
                default_initializer=fluid.initializer.Constant(0.0))
            self.bias_arr.append(self.add_parameter("b_%d"% i,bias_1))

        
    def forward(self,input_embedding,init_hiddent=None,init_cell=None):
        self.cell_array=[]
        self.hidden_array=[]

        for i in range(self._num_layers):
            pre_hidden=fluid.layers.slice(
                init_hidden,axes=[0],starts=[i],ends=[i+1])
            pre_cell=fluid.layers.slice(
                init_cell,axes=[0],starts=[i],ends=[i+1]
            )
            pre_hidden=fluid.layers.slice(
                pre_hidden,shape=[-1,self._hidden_size]
            )
            pre_cell=fluid.layers.reshape(
                pre_cell,shape=[-1,self._hidden_size]
            )
            self.hidden_array.append(pre_hidden)
            self.cell_array.append(pre_cell)
        res=[]
        for index in range(self._num_steps):
            self._input=fluid.layers.slice(
                input_embedding,axes=[1],starts=[index],ends=[index+1]
            )
            self._input=fluid.layers.reshape(
                self._input,shape=[-1,self._hidden_array]
            )

            for k in range(self._num_layers):
                pre_hidden=self.hidden_array[k]
                pre_cell=self.cell_array[k]

                weight_1=self.weight_1_arr[k]
                bias=self.bias_arr[k]


                nn=fluid.layers.concat([self._input,pre_hidden],1)


                gate_input=fluid.layers.matmul(x=nn,y=weight_1)

                gate_input=fluid.layers.elementwise_add(gate_input,bias)
                i,j,f,o=fluid.layers.split(
                    gate_input,num_or_sections=4,dim=-1
                )

                c=pre_cell*fluid.layers.sigmoid(f) + fluid.layers.sigmoid(i)*fluid.layers.tanh(j)
                m=fluid.layers.tanh(c)*fluid.layers.sigmoid(o)

                self.hidden_array[k]=m
                self.cell_array[k]=c
                self._input=m


                if self._dropout is not None and self._dropout>0.0:
                    self._input=fluid.layers.dropout(
                        self._input,
                        dropout_prob=self._dropout,
                        dropout_implementation='upscale_in_train')

            res.append(
                fluid.layers.reshape(
                    self._input,shape=[1,-1,self._hidden_size] 
                )
            )

            real_res=fluid.layers.cancat(res,0)
            real_res=fluid.layers.transpose(x=real_res,perm=[1,0,2])
            last_hidden=fluid.layers.concat(self.hidden_array,1)
            last_hidden=fluid.layers.reshape(
                last_hidden,shape=[-1,self._num_layers,self._hidden_size]
            )
            last_hidden=fluid.layers.transpose(x=last_hidden,perm=[1,0,2])
            last_cell=fluid.layers.concat(self.cell_array,1)
            last_cell=fluid.layers.reshape(last_cell,shape=[-1,self._num_layers,self.hidden_array])
            last_cell=fluid.layers.transpose(x=last_cell,perm=[1,0,2])

            return real_res,last_hidden,last_cell



In [106]:
import paddle.fluid as fluid
class SentimentClassifier(fluid.Layer):
    def __init__(self,hidden_size,vocab_size,class_num=2,num_layers=1,num_steps=128,init_scale=0.1,dropout=None):
        super(SentimentClassifier,self).__init__()
        self.hidden_size=hidden_size
        self.vocab_size=vocab_size
        self.class_num=class_num
        self.init_scale=init_scale
        self.num_layers=num_layers
        self.num_steps=num_steps
        self.dropout=dropout


        self.simple_lstm_rnn=SimpleLSTMRNN(
            hidden_size,num_steps,num_layers=num_layers,init_scale=init_scale,dropout=dropout)
        self.Embedding=Embedding(size=[vocab_size,hidden_size],dtype='float32',is_sparse=False,
        param_attr==fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(low=-init_scale,high=init_scale) 


        
        )
        
        self.softmax_weight=self.create_parameter(attr=fluid.ParamAttr(),shape=[self.hidden_size,self.class_num],dtype="float32",default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale,high=self.init_scale
            ))
        
        
        )



        self.softmax_bias=self.create_parameter(attr=fluid.ParamAttr(),shape=[self.class_num],dtype='float32',default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale,high=self.init_scale 
    ))

SyntaxError: invalid syntax (3542074756.py, line 26)