In [1]:
import os
import sys
import re
module_path = os.path.abspath(os.path.join('C:/Users/user/metin özütleme/hphaos summarunner/models'))
import json
import models

In [2]:
import torch
from torch.autograd import Variable
class BasicModule(torch.nn.Module):

    def __init__(self, args):
        
        super(BasicModule,self).__init__()
        self.args = args
        print(self.args.device)
        print()
        self.model_name = str(type(self))

    def pad_doc(self,words_out,doc_lens):
        pad_dim = words_out.size(1)
        max_doc_len = max(doc_lens)
        sent_input = []
        start = 0
        for doc_len in doc_lens:
            stop = start + doc_len
            valid = words_out[start:stop]                                       # (doc_len,2*H)
            start = stop
            if doc_len == max_doc_len:
                sent_input.append(valid.unsqueeze(0))
            else:
                pad = Variable(torch.zeros(max_doc_len-doc_len,pad_dim))
                if self.args.device is not None:
                    pad = pad.cuda()
                sent_input.append(torch.cat([valid,pad]).unsqueeze(0))          # (1,max_len,2*H)
        sent_input = torch.cat(sent_input,dim=0)                                # (B,max_len,2*H)
        return sent_input
    
    def save(self):
        checkpoint = {'model':self.state_dict(), 'args': self.args}
        best_path = '%s%s_seed_%d.pt' % (self.args.save_dir,self.model_name,self.args.seed)
        torch.save(checkpoint,best_path)

        return best_path

    def load(self, best_path):
        if self.args.device is not None:
            print(self.args.device)
            data = torch.load(best_path)['model']
        else:
            data = torch.load(best_path, map_location=lambda storage, loc: storage)['model']
        self.load_state_dict(data)
        if self.args.device is not None:
            return self.cuda()
        else:
            return self


In [3]:
from BasicModule import BasicModule
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class CNN_RNN(BasicModule):
    def __init__(self, args, embed=None):
        super(CNN_RNN,self).__init__(args)
        self.model_name = 'CNN_RNN'
        self.args = args
        
        Ks = args.kernel_sizes
        Ci = args.embed_dim
        Co = args.kernel_num
        V = args.embed_num
        D = args.embed_dim
        H = args.hidden_size
        S = args.seg_num
        P_V = args.pos_num
        P_D = args.pos_dim
        self.abs_pos_embed = nn.Embedding(P_V,P_D)
        self.rel_pos_embed = nn.Embedding(S,P_D)
        self.embed = nn.Embedding(V,D,padding_idx=0)
        if embed is not None:
            self.embed.weight.data.copy_(embed)

        self.convs = nn.ModuleList([ nn.Sequential(
                                            nn.Conv1d(Ci,Co,K),
                                            nn.BatchNorm1d(Co),
                                            nn.LeakyReLU(inplace=True),

                                            nn.Conv1d(Co,Co,K),
                                            nn.BatchNorm1d(Co),
                                            nn.LeakyReLU(inplace=True)
                                     )
                                    for K in Ks])
        self.sent_RNN = nn.GRU(
                        input_size = Co * len(Ks),
                        hidden_size = H,
                        batch_first = True,
                        bidirectional = True
                        )
        self.fc = nn.Sequential(
                nn.Linear(2*H,2*H),
                nn.BatchNorm1d(2*H),
                nn.Tanh()
                )
        # Parameters of Classification Layer
        self.content = nn.Linear(2*H,1,bias=False)
        self.salience = nn.Bilinear(2*H,2*H,1,bias=False)
        self.novelty = nn.Bilinear(2*H,2*H,1,bias=False)
        self.abs_pos = nn.Linear(P_D,1,bias=False)
        self.rel_pos = nn.Linear(P_D,1,bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1,0.1))

    def max_pool1d(self,x,seq_lens):
        # x:[N,L,O_in]
        out = []
        for index,t in enumerate(x):
            t = t[:seq_lens[index],:]
            t = torch.t(t).unsqueeze(0)
            out.append(F.max_pool1d(t,t.size(2)))
        
        out = torch.cat(out).squeeze(2)
        return out
    def avg_pool1d(self,x,seq_lens):
        # x:[N,L,O_in]
        out = []
        for index,t in enumerate(x):
            t = t[:seq_lens[index],:]
            t = torch.t(t).unsqueeze(0)
            out.append(F.avg_pool1d(t,t.size(2)))
        
        out = torch.cat(out).squeeze(2)
        return out
    def forward(self,x,doc_lens):
        sent_lens = torch.sum(torch.sign(x),dim=1).data 
        H = self.args.hidden_size
        x = self.embed(x)                                                       # (N,L,D)
        # word level GRU
        x = [conv(x.permute(0,2,1)) for conv in self.convs]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x,1)
        # make sent features(pad with zeros)
        x = self.pad_doc(x,doc_lens)

        # sent level GRU
        sent_out = self.sent_RNN(x)[0]                                           # (B,max_doc_len,2*H)
        docs = self.max_pool1d(sent_out,doc_lens)                                # (B,2*H)
        docs = self.fc(docs)
        probs = []
        for index,doc_len in enumerate(doc_lens):
            valid_hidden = sent_out[index,:doc_len,:]                            # (doc_len,2*H)
            doc = docs[index].unsqueeze(0)
            s = Variable(torch.zeros(1,2*H))
            if self.args.device is not None:
                s = s.cuda()
            for position, h in enumerate(valid_hidden):
                h = h.view(1, -1)                                                # (1,2*H)
                # get position embeddings
                abs_index = Variable(torch.LongTensor([[position]]))
                if self.args.device is not None:
                    abs_index = abs_index.cuda()
                abs_features = self.abs_pos_embed(abs_index).squeeze(0)
                
                rel_index = int(round((position + 1) * 9.0 / doc_len))
                rel_index = Variable(torch.LongTensor([[rel_index]]))
                if self.args.device is not None:
                    rel_index = rel_index.cuda()
                rel_features = self.rel_pos_embed(rel_index).squeeze(0)
                
                # classification layer
                content = self.content(h) 
                salience = self.salience(h,doc)
                novelty = -1 * self.novelty(h,F.tanh(s))
                abs_p = self.abs_pos(abs_features)
                rel_p = self.rel_pos(rel_features)
                prob = F.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
                s = s + torch.mm(prob,h)
                probs.append(prob)
        return torch.cat(probs).squeeze()

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Attention(nn.Module):
    r"""
    Applies an attention mechanism on the query features from the decoder.
    .. math::
            \begin{array}{ll}
            x = context*query \\
            attn_scores = exp(x_i) / sum_j exp(x_j) \\
            attn_out = attn * context
            \end{array}
    Args:
        dim(int): The number of expected features in the query
    Inputs: query, context
        - **query** (batch, query_len, dimensions): tensor containing the query features from the decoder.
        - **context** (batch, input_len, dimensions): tensor containing features of the encoded input sequence.
    Outputs: query, attn
        - **query** (batch, query_len, dimensions): tensor containing the attended query features from the decoder.
        - **attn** (batch, query_len, input_len): tensor containing attention weights.
    Attributes:
        mask (torch.Tensor, optional): applies a :math:`-inf` to the indices specified in the `Tensor`.
    """
    def __init__(self):
        super(Attention, self).__init__()
        self.mask = None

    def set_mask(self, mask):
        """
        Sets indices to be masked
        Args:
            mask (torch.Tensor): tensor containing indices to be masked
        """
        self.mask = mask
    
    """
        - query   (batch, query_len, dimensions): tensor containing the query features from the decoder.
        - context (batch, input_len, dimensions): tensor containing features of the encoded input sequence.
    """
    def forward(self, query, context):
        batch_size = query.size(0)
        dim = query.size(2)
        in_len = context.size(1)
        # (batch, query_len, dim) * (batch, in_len, dim) -> (batch, query_len, in_len)
        attn = torch.bmm(query, context.transpose(1, 2))
        if self.mask is not None:
            attn.data.masked_fill_(self.mask, -float('inf'))
        attn_scores = F.softmax(attn.view(-1, in_len),dim=1).view(batch_size, -1, in_len)

        # (batch, query_len, in_len) * (batch, in_len, dim) -> (batch, query_len, dim)
        attn_out = torch.bmm(attn_scores, context)

        return attn_out, attn_scores

if __name__ == '__main__':
    torch.manual_seed(1)
    attention = Attention()
    context = Variable(torch.randn(10, 20, 4))
    query = Variable(torch.randn(10, 1, 4))
    query, attn = attention(query, context)
    print(query)

tensor([[[ 1.4918, -0.8075, -3.1827, -0.7470]],

        [[-0.6181, -0.9139, -0.2974,  0.3185]],

        [[ 0.6534,  1.3140,  0.5316,  1.4086]],

        [[-0.5043,  1.8300, -0.3793, -2.1729]],

        [[-1.3261, -0.2732, -0.0319, -0.5234]],

        [[ 0.3076, -1.0698, -0.6218,  0.6029]],

        [[-1.8304,  0.3810, -0.0749, -2.1429]],

        [[ 0.4508, -0.1929,  0.0575, -0.6249]],

        [[ 0.7716, -0.4401,  0.3739,  0.7881]],

        [[ 0.8114, -0.2754,  0.1713, -0.2009]]])


In [5]:
#!/usr/bin/env python
#coding:utf8
from BasicModule import BasicModule
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
#from .Attention import Attention
from torch.autograd import Variable

class AttnRNN(BasicModule):
    def __init__(self, args, embed=None):
        super(AttnRNN,self).__init__(args)
        self.model_name = 'AttnRNN'
        self.args = args
        
        V = args.embed_num
        D = args.embed_dim
        H = args.hidden_size
        S = args.seg_num

        P_V = args.pos_num
        P_D = args.pos_dim
        self.abs_pos_embed = nn.Embedding(P_V,P_D)
        self.rel_pos_embed = nn.Embedding(S,P_D)
        self.embed = nn.Embedding(V,D,padding_idx=0)
        if embed is not None:
            self.embed.weight.data.copy_(embed)

        self.attn = Attention()
        self.word_query = nn.Parameter(torch.randn(1,1,2*H))
        self.sent_query = nn.Parameter(torch.randn(1,1,2*H))

        self.word_RNN = nn.GRU(
                        input_size = D,
                        hidden_size = H,
                        batch_first = True,
                        bidirectional = True
                        )
        self.sent_RNN = nn.GRU(
                        input_size = 2*H,
                        hidden_size = H,
                        batch_first = True,
                        bidirectional = True
                        )
               
        self.fc = nn.Linear(2*H,2*H)

        # Parameters of Classification Layer
        self.content = nn.Linear(2*H,1,bias=False)
        self.salience = nn.Bilinear(2*H,2*H,1,bias=False)
        self.novelty = nn.Bilinear(2*H,2*H,1,bias=False)
        self.abs_pos = nn.Linear(P_D,1,bias=False)
        self.rel_pos = nn.Linear(P_D,1,bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1,0.1))
    def forward(self,x,doc_lens):
        N = x.size(0)
        L = x.size(1)
        B = len(doc_lens)
        H = self.args.hidden_size
        word_mask = torch.ones_like(x) - torch.sign(x)
        word_mask = word_mask.data.type(torch.cuda.ByteTensor).view(N,1,L)
        
        x = self.embed(x)                                # (N,L,D)
        x,_ = self.word_RNN(x)
        
        # attention
        query = self.word_query.expand(N,-1,-1).contiguous()
        self.attn.set_mask(word_mask)
        word_out = self.attn(query,x)[0].squeeze(1)      # (N,2*H)

        x = self.pad_doc(word_out,doc_lens)
        # sent level GRU
        sent_out = self.sent_RNN(x)[0]                                           # (B,max_doc_len,2*H)
        #docs = self.avg_pool1d(sent_out,doc_lens)                               # (B,2*H)
        max_doc_len = max(doc_lens)
        mask = torch.ones(B,max_doc_len)
        for i in range(B):
            for j in range(doc_lens[i]):
                mask[i][j] = 0
        sent_mask = mask.type(torch.cuda.ByteTensor).view(B,1,max_doc_len)
        
        # attention
        query = self.sent_query.expand(B,-1,-1).contiguous()
        self.attn.set_mask(sent_mask)
        docs = self.attn(query,x)[0].squeeze(1)      # (B,2*H)
        probs = []
        for index,doc_len in enumerate(doc_lens):
            valid_hidden = sent_out[index,:doc_len,:]                            # (doc_len,2*H)
            doc = F.tanh(self.fc(docs[index])).unsqueeze(0)
            s = Variable(torch.zeros(1,2*H))
            if self.args.device is not None:
                s = s.cuda()
            for position, h in enumerate(valid_hidden):
                h = h.view(1, -1)                                                # (1,2*H)
                # get position embeddings
                abs_index = Variable(torch.LongTensor([[position]]))
                if self.args.device is not None:
                    abs_index = abs_index.cuda()
                abs_features = self.abs_pos_embed(abs_index).squeeze(0)
                
                rel_index = int(round((position + 1) * 9.0 / doc_len))
                rel_index = Variable(torch.LongTensor([[rel_index]]))
                if self.args.device is not None:
                    rel_index = rel_index.cuda()
                rel_features = self.rel_pos_embed(rel_index).squeeze(0)
                
                # classification layer
                content = self.content(h) 
                salience = self.salience(h,doc)
                novelty = -1 * self.novelty(h,F.tanh(s))
                abs_p = self.abs_pos(abs_features)
                rel_p = self.rel_pos(rel_features)
                prob = F.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
                s = s + torch.mm(prob,h)
                #print position,F.sigmoid(abs_p + rel_p)
                probs.append(prob)
        return torch.cat(probs).squeeze()

In [6]:
import torch

class Vocab():
    def __init__(self,embed,word2id):
        self.embed = embed
        self.word2id = word2id
        self.id2word = {v:k for k,v in word2id.items()}
        assert len(self.word2id) == len(self.id2word)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'
    
    def __len__(self):
        return len(word2id)

    def i2w(self,idx):
        return self.id2word[idx]
    def w2i(self,w):
        if w in self.word2id:
            return self.word2id[w]
        else:
            return self.UNK_IDX
    
    def make_features(self,batch,sent_trunc=25,doc_trunc=50,split_token='\n'):
        sents_list,targets,doc_lens = [],[],[]
        # trunc document
        for doc,label in zip(batch['doc'],batch['labels']):
            sents = doc.split(split_token)
            labels = label.split(split_token)
            labels = [int(l) for l in labels]
            max_sent_num = min(doc_trunc,len(sents))##doküman 50 den az cümle içerebilir.
            sents = sents[:max_sent_num]
            labels = labels[:max_sent_num]
            sents_list += sents
            targets += labels
            doc_lens.append(len(sents)) ##doküman uzunluğu maksimum cümle kadar içerir
        # trunc or pad sent
        max_sent_len = 0
        batch_sents = []
        for sent in sents_list:
            words = sent.split()
            if len(words) > sent_trunc:##cümle içindeki maksimum kelime sayısı
                words = words[:sent_trunc]
            max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
            batch_sents.append(words)
        
        features = []
        for sent in batch_sents:
            feature = [self.w2i(w) for w in sent] + [self.PAD_IDX for _ in range(max_sent_len-len(sent))]
            features.append(feature)
        
        features = torch.LongTensor(features)    
        targets = torch.LongTensor(targets)
        summaries = batch['summaries']

        return features,targets,summaries,doc_lens



In [7]:
import csv
import torch
import torch.utils.data as data
from torch.autograd import Variable
#from .Vocab import Vocab
import numpy as np

class Dataset(data.Dataset):
    def __init__(self, examples):
        super(Dataset,self).__init__()
        # data: {'sents':xxxx,'labels':'xxxx', 'summaries':[1,0]}
        self.examples = examples 
        self.training = False
    def train(self):
        self.training = True
        return self
    def test(self):
        self.training = False
        return self
    def shuffle(self,words):
        np.random.shuffle(words)
        return ' '.join(words)
    def dropout(self,words,p=0.3):
        l = len(words)
        drop_index = np.random.choice(l,int(l*p))
        keep_words = [words[i] for i in range(l) if i not in drop_index]
        return ' '.join(keep_words)
    def __getitem__(self, idx):
        ex = self.examples[idx]
        return ex,
    def __len__(self):
        return len(self.examples)

In [8]:
from BasicModule import BasicModule
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN_RNN(BasicModule):
    def __init__(self, args, embed=None):
        super(RNN_RNN, self).__init__(args)
        self.model_name = 'RNN_RNN'
        self.args = args
        
        V = args.embed_num
        D = args.embed_dim
        H = args.hidden_size
        S = args.seg_num
        P_V = args.pos_num
        P_D = args.pos_dim
        self.abs_pos_embed = nn.Embedding(P_V,P_D)
        self.rel_pos_embed = nn.Embedding(S,P_D)
        self.embed = nn.Embedding(V,D,padding_idx=0)
        if embed is not None:
            self.embed.weight.data.copy_(embed)

        self.word_RNN = nn.GRU(
                        input_size = D,
                        hidden_size = H,
                        batch_first = True,
                        bidirectional = True
                        )
        self.sent_RNN = nn.GRU(
                        input_size = 2*H,
                        hidden_size = H,
                        batch_first = True,
                        bidirectional = True
                        )
        self.fc = nn.Linear(2*H,2*H)

        # Parameters of Classification Layer
        self.content = nn.Linear(2*H,1,bias=False)
        self.salience = nn.Bilinear(2*H,2*H,1,bias=False)
        self.novelty = nn.Bilinear(2*H,2*H,1,bias=False)
        self.abs_pos = nn.Linear(P_D,1,bias=False)
        self.rel_pos = nn.Linear(P_D,1,bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1,0.1))

    def max_pool1d(self,x,seq_lens):
        # x:[N,L,O_in]
        out = []
        for index,t in enumerate(x):
            t = t[:seq_lens[index],:]
            t = torch.t(t).unsqueeze(0)
            out.append(F.max_pool1d(t,t.size(2)))
        
        out = torch.cat(out).squeeze(2)
        return out
    def avg_pool1d(self,x,seq_lens):
        # x:[N,L,O_in]
        out = []
        for index,t in enumerate(x):
            t = t[:seq_lens[index],:]
            t = torch.t(t).unsqueeze(0)
            out.append(F.avg_pool1d(t,t.size(2)))
        
        out = torch.cat(out).squeeze(2)
        return out
    def forward(self,x,doc_lens):
        sent_lens = torch.sum(torch.sign(x),dim=1).data 
        x = self.embed(x)                                                      # (N,L,D)
        # word level GRU
        H = self.args.hidden_size
        x = self.word_RNN(x)[0]                                                 # (N,2*H,L)
        #word_out = self.avg_pool1d(x,sent_lens)
        word_out = self.max_pool1d(x,sent_lens)
        # make sent features(pad with zeros)
        x = self.pad_doc(word_out,doc_lens)

        # sent level GRU
        sent_out = self.sent_RNN(x)[0]                                           # (B,max_doc_len,2*H)
        #docs = self.avg_pool1d(sent_out,doc_lens)                               # (B,2*H)
        docs = self.max_pool1d(sent_out,doc_lens)                                # (B,2*H)
        probs = []
        for index,doc_len in enumerate(doc_lens):
            valid_hidden = sent_out[index,:doc_len,:]                            # (doc_len,2*H)
            doc = F.tanh(self.fc(docs[index])).unsqueeze(0)
            s = Variable(torch.zeros(1,2*H))
            if self.args.device is not None:
                s = s.cuda()
            for position, h in enumerate(valid_hidden):
                h = h.view(1, -1)                                                # (1,2*H)
                # get position embeddings
                abs_index = Variable(torch.LongTensor([[position]]))
                if self.args.device is not None:
                    abs_index = abs_index.cuda()
                abs_features = self.abs_pos_embed(abs_index).squeeze(0)
                
                rel_index = int(round((position + 1) * 9.0 / doc_len))
                rel_index = Variable(torch.LongTensor([[rel_index]]))
                if self.args.device is not None:
                    rel_index = rel_index.cuda()
                rel_features = self.rel_pos_embed(rel_index).squeeze(0)
                
                # classification layer
                content = self.content(h) 
                salience = self.salience(h,doc)
                novelty = -1 * self.novelty(h,F.tanh(s))
                abs_p = self.abs_pos(abs_features)
                rel_p = self.rel_pos(rel_features)
                prob = F.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
                s = s + torch.mm(prob,h)
                probs.append(prob)
        return torch.cat(probs).squeeze()

In [21]:

    def sigirdata():
    #with open('sigirtrain.json', 'w') as outfile:
        filecount=0
        abslinecount=0
        dir = "C:/Users/user/Desktop/sondönem/bitirme/sigir/SIGIR2018_Extracts/SIGIR_Sessions" 
        finaldoc=[]
        mydict={}
        finalstr=""
        newlinecount=0
        labels="1"
        keys=['doc', 'labels','summaries']
        for i in keys:
            mydict[i] = ""
        for dirPath, foldersInDir,fileName in os.walk(dir):
            if fileName is not []:
                for file in fileName:
                    if file.endswith('t.txt'):
                        loc = os.sep.join([dirPath,file])
                        abstract=open(loc,encoding="utf8")
                        abst=abstract.read()

                        lines = abst.split("\n")
                        #print(lines)
                        #print("--------------------------")
                        m = re.findall('1">(.+?).</', str(lines))
                        if m:
                            for t in m:
                                abslinecount=abslinecount+1
                                t = t.replace("',", "")
                                t = t.replace("'", "")
                                t=t.lower()
                                t= t.translate(string.maketrans('""','""'), string.punctuation)
                                finalstr=finalstr+t+"\n"
                                mydict["summaries"]=finalstr[:-2].replace("'",'"')
                    #abssizes.append(abslinecount)
                    abslinecount=0
                    finalstr=""

                    if file.endswith('o.txt'):

                        #print("readin intro")
                        loc = os.sep.join([dirPath,file])
                        doc=open(loc,encoding="utf8")
                        doc=doc.read()
                        #print(doc)
                        #print()
                        lines = doc.split("\n")
                        #print(lines)
                        #print("--------------------------")
                        #text = 'gfgfdAAA1234ZZZuijjk'
                        #while m!=[]:
                        m = re.findall('"[0-9]">(.+?).</', str(lines))
                        if m:
                            for t in m:
                                #print(t)
                                #labels=labels+""


                                #labels=labels.replace(" ","")
                                newlinecount=newlinecount+1
                                t = t.replace("',", "").lower()
                                t = t.replace("'", "")
                                finalstr=finalstr+t+"\n"
                                labels=labels+"k\n"+"1"
                                labels=labels.replace("k", "")
                                mydict["doc"]=finalstr[:-2].replace("'",'"')
                                lb=str(labels)
                                mydict["labels"]=str(lb).replace("'",'"')
                        
                        #json.dump(mydict, outfile)
                        #print("-------------")
                        finaldoc.append(dict(mydict))
                        #print(finaldoc[filecount])
                        #print("----------")
                        filecount+=1
                        finalstr=""
                        labels="1"
                        newlinecount=0
                        #print(finaldoc)
                    
        print("SIGIR DATA GENERATED")

        """  with open("sigirtrain.json", 'w') as file:
            json_string = json.dumps(finaldoc, default=lambda o: o.__dict__, sort_keys=True, indent=2)
            file.write(json_string)
        #with open('sigirtrain.json', 'w') as outfile:
            #for i in range(len(finaldoc)):
            #json.dump(finaldoc, outfile,indent=2)
        with open('sigirval.json', 'w') as outfile:
            json.dump(finaldoc[-10:], outfile)"""
    
        return finaldoc

In [22]:
#!/usr/bin/env python3

import json
import models
#import utils
import argparse,random,logging,numpy,os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm
from time import time,sleep
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
parser = argparse.ArgumentParser(description='extractive summary')
# model
parser.add_argument('-save_dir',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/sigir/sigircheckpoints/')
parser.add_argument('-embed_dim',type=int,default=100)
parser.add_argument('-embed_num',type=int,default=100)
parser.add_argument('-pos_dim',type=int,default=50)
parser.add_argument('-pos_num',type=int,default=100)
parser.add_argument('-seg_num',type=int,default=10)
parser.add_argument('-kernel_num',type=int,default=100)
parser.add_argument('-kernel_sizes',type=str,default='3,4,5')
parser.add_argument('-model',type=str,default='RNN_RNN')
parser.add_argument('-hidden_size',type=int,default=200)
# train
parser.add_argument('-lr',type=float,default=1e-3)
parser.add_argument('-batch_size',type=int,default=32)
parser.add_argument('-epochs',type=int,default=3)
parser.add_argument('-seed',type=int,default=1)
parser.add_argument('-train_dir',type=str,default='data/train.json')
parser.add_argument('-val_dir',type=str,default='data/val.json')
parser.add_argument('-embedding',type=str,default='data/embedding.npz')
parser.add_argument('-word2id',type=str,default='data/word2id.json')
parser.add_argument('-report_every',type=int,default=1500)
parser.add_argument('-seq_trunc',type=int,default=50)
parser.add_argument('-max_norm',type=float,default=1.0)
# test
parser.add_argument('-load_dir',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/checkpoints/CNN_RNN_seed_1.pt')
parser.add_argument('-sigir_dir',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/sigir/sigircheckpoints/RNN_RNN_seed_1.pt')
parser.add_argument('-test_dir',type=str,default='C:/Users/user/metin özütleme/hphaos summarunner/data/test.json')
parser.add_argument('-ref',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/outputs/ref')
parser.add_argument('-sigirref',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/sigir/sigirref')
parser.add_argument('-sigirhyp',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/sigir/sigirhyp')
parser.add_argument('-hyp',type=str,default='C:/Users/user/Desktop/sondönem/bitirme/outputs/hyp')
parser.add_argument('-filename',type=str,default='x.txt') # TextFile to be summarized
parser.add_argument('-topk',type=int,default=7)
# device
parser.add_argument('-device',type=int)
# option
parser.add_argument('-test',action='store_true')
parser.add_argument('-debug',action='store_true')
parser.add_argument('-predict',action='store_true')
args = parser.parse_args()
use_gpu = args.device is not None

if torch.cuda.is_available() and not use_gpu:
    print("WARNING: You have a CUDA device, should run with -device 0")

# set cuda device and seed
if use_gpu:
    torch.cuda.set_device(args.device)
torch.cuda.manual_seed(args.seed)
torch.manual_seed(args.seed)
random.seed(args.seed)
numpy.random.seed(args.seed) 
    
def eval(net,vocab,data_iter,criterion):
    net.eval()
    total_loss = 0
    batch_num = 0
    for batch in data_iter:
        features,targets,_,doc_lens = vocab.make_features(batch)
        features,targets = Variable(features), Variable(targets.float())
        if use_gpu:
            features = features.cuda()
            targets = targets.cuda()
        probs = net(features,doc_lens)
        loss = criterion(probs,targets)
        total_loss += loss.data
        batch_num += 1
    loss = total_loss / batch_num
    net.train()
    return loss

def trainsigir():
    logging.info('Loading vocab,train and val dataset.Wait a second,please')
    
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)

    with open("C:/Users/user/metin özütleme/hphaos summarunner/sigirtrain.json",encoding="utf8") as f:
        examples = json.load(f)
    train_dataset = Dataset(examples)
    with open("C:/Users/user/metin özütleme/hphaos summarunner/sigirval.json",encoding="utf8") as f:
        examples = json.load(f)
    val_dataset = Dataset(examples)

    #with open("C:/Users/user/metin özütleme/hphaos summarunner/sigirval.json",encoding="utf8") as f:
     #   examples = [json.loads(line) for line in f]
    #val_dataset = Dataset(examples)
    #with open(args.val_dir,encoding="utf8") as f:
        #examples = [json.loads(line) for line in f]
    
    
    # update args
    args.embed_num = embed.size(0)
    args.embed_dim = embed.size(1)
    args.kernel_sizes = ['3,4,5']
    # build model
    net = getattr(models,args.model)(args,embed)
    if use_gpu:
        net.cuda()
    # load dataset
    train_iter = DataLoader(dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True)
    val_iter = DataLoader(dataset=val_dataset,
            batch_size=args.batch_size,
            shuffle=False)
    # loss function
    criterion = nn.BCELoss()
    # model info
    print(net)
    params = sum(p.numel() for p in list(net.parameters())) / 1e6
    print('#Params: %.1fM' % (params))
    
    min_loss = float('inf')
    optimizer = torch.optim.Adam(net.parameters(),lr=args.lr)
    net.train()
    
    t1 = time() 
    for epoch in range(1,args.epochs+1):
        for i,batch in enumerate(train_iter):
            print(batch)
            features,targets,_,doc_lens = vocab.make_features(batch)
            features,targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features,doc_lens)
            
            loss = criterion(probs,targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
            if args.debug:
                print('Batch ID:%d Loss:%f' %(i,loss.data[0]))
                continue
            if i % args.report_every == 0:
                cur_loss = eval(net,vocab,val_iter,criterion)
                if cur_loss < min_loss:
                    min_loss = cur_loss
                    best_path = net.save()
                logging.info('Epoch: %2d Min_Val_Loss: %f Cur_Val_Loss: %f'
                        % (epoch,min_loss,cur_loss))
    t2 = time()
    logging.info('Total Cost:%f h'%((t2-t1)/3600))

def testsigir():
     
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)
    with open("C:/Users/user/metin özütleme/hphaos summarunner/sigirtest.json",encoding="utf8") as f:
        examples = json.load(f)
    test_dataset = Dataset(examples)

    test_iter = DataLoader(dataset=test_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    if use_gpu:
        checkpoint = torch.load(args.sigir_dir)
    else:
        checkpoint = torch.load(args.sigir_dir, map_location=lambda storage, loc: storage)

    # checkpoint['args']['device'] saves the device used as train time
    # if at test time, we are using a CPU, we must override device to None
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()
    
    doc_num = len(test_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(test_iter):
        features,_,summaries,doc_lens = vocab.make_features(batch)
        t1 = time()
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        t2 = time()
        time_cost += t2 - t1
        start = 0
        for doc_id,doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]
            topk = min(args.topk,doc_len)
            #topk=min(abssizes[file_id-1],doc_len)
            topk_indices = prob.topk(topk)[1].cpu().data.numpy()
            topk_indices.sort()
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            ref = summaries[doc_id]
            with open(os.path.join(args.sigirref,str(file_id)+'.txt'), 'w',encoding="utf8") as f:
                f.write(ref)
            with open(os.path.join(args.sigirhyp,str(file_id)+'.txt'), 'w',encoding="utf8") as f:
                f.write('\n'.join(hyp))
            start = stop
            file_id = file_id + 1
    print('Speed: %.2f docs / s' % (doc_num / time_cost))
    print("test ended")

"""
if __name__=='__main__':
    if args.test:
        test()
    elif args.predict:
        with open(args.filename) as file:
            bod = [file.read()]
        predict(bod)
    else:
        train()"""



"\nif __name__=='__main__':\n    if args.test:\n        test()\n    elif args.predict:\n        with open(args.filename) as file:\n            bod = [file.read()]\n        predict(bod)\n    else:\n        train()"

In [23]:

import os
from pyrouge import Rouge155
def remove_broken_files():
    error_id = []
    for f in os.listdir('C:/Users/user/Desktop/sondönem/bitirme/outputs/ref'):
        try:
            open('ref/' + f).read()
        except:
            error_id.append(f)
    for f in os.listdir('C:/Users/user/Desktop/sondönem/bitirme/outputs/hyp'):
        try:
            open('hyp/' + f).read()
        except:
            error_id.append(f)
    error_set = set(error_id)
    for f in error_set:
        #os.remove('ref/' + f)
        os.remove('hyp/' + f)

def rouge():
    r = Rouge155()
    r.home_dir = '.'
    r.system_dir = 'C:/Users/user/Desktop/sondönem/bitirme/outputs/hyp'
    r.model_dir =  'C:/Users/user/Desktop/sondönem/bitirme/outputs/ref'
    
    r.system_filename_pattern = '(\d+).txt'
    r.model_filename_pattern = '#ID#.txt'

    command = '-e C:/ROUGE-1.5.5/data -a -c 95 -m -n 2 -b 75'
    output = r.convert_and_evaluate(rouge_args=command)
    print("we done")
    print(output)
def rougesigir():
    r = Rouge155()
    r.home_dir = '.'
    r.system_dir = 'C:/Users/user/Desktop/sondönem/bitirme/sigir/sigirhyp'
    r.model_dir =  'C:/Users/user/Desktop/sondönem/bitirme/sigir/sigirref'
    
    r.system_filename_pattern = '(\d+).txt'
    r.model_filename_pattern = '#ID#.txt'

    command = '-e C:/ROUGE-1.5.5/data -a -c 95 -m -n 2 -b 75'
    output = r.convert_and_evaluate(rouge_args=command)
    print("we done")
    print(output)

if __name__ == '__main__':
    #remove_broken_files()
    #rouge()
    print()

SyntaxError: invalid syntax (Rouge155.py, line 335)

In [24]:
import sklearn
import string

from sklearn.model_selection import KFold
#print(len(sigirdata()))
kfold = KFold(5, True, 1)
# enumerate splits
counter=0
trainx=[]
testx=[]
data=sigirdata()

for train, test in kfold.split(data):
    for t in range (0,len(train)):
        val=data[train[t]]
        
        trainx.append(val)
        with open('sigirtrain.json', 'w') as fp:
            fp.write(
                '[' +
                ',\n'.join(json.dumps(i) for i in trainx) +
                ']\n')
        with open('sigirval.json', 'w') as fp:
            fp.write(
                '[' +
                ',\n'.join(json.dumps(i) for i in trainx[90:100]) +
                ']\n')
    for i in range (0,len(test)):
        val=data[test[i]]
        testx.append(val)
        with open('sigirtest.json', 'w') as fp:
            fp.write(
                '[' +
                ',\n'.join(json.dumps(i) for i in testx) +
                ']\n')
    print(len(testx),"sended to test")
    print(len(trainx),"sended to train")
    #print(type(trianx[]))
    
    trainsigir()
    testsigir()
    try:
        rougesigir()
    except Exception:
        pass
    sleep(30)
    





AttributeError: module 'string' has no attribute 'maketrans'

In [None]:
for i in range(0,10):
    

In [None]:
#x=sigirdata()
print("dsad")

In [17]:
with open("C:/Users/user/metin özütleme/hphaos summarunner/sigirtest.json",encoding="utf8") as f:
        examples = json.load(f)
        for i in range(0,10):
            print(examples[i])
            print("\n")

{'doc': 'the transition to web 2.0 transformed the business models of online marketing from a global ad approach based to individual opinions and targeted campaigns [2, 23, 35, 40]\nweb 2.0 not only took traditional marketing strategies to the extreme via viral marketing campaigns [31, 36, 43], but it also gave rise to new techniques of brand building and audience targeting via influencer marketing [12, 44].\nin fact, the use of micro-influencers, trusted individuals within their communities, has been seen as a more effective way to build a brand in terms of audience reception and return on investment [9, 25, 32]\ninstagram, which is a visual content sharing online social network (osn), has become a focal point for influencer marketing\nwith power users and micro-influencers publishing sponsored content companies need to rate these influencers and determine their value[17, 18, 38].\nmost of today’s scoring themes rely on graphbased algorithms of a known network graph.\nsuch graphs are 