In [57]:
import glob

from word2vec_doc import Word2VecModel, Word2VecDoc
from doc import Doc
from scorer import Scorer

def filecontent(filename, encoding="utf-8"):
    """Return the whole text content of ``filename``.

    The file is decoded with an explicit ``encoding`` (UTF-8 by default)
    rather than the platform default, so characters such as the pound sign
    are read the same way on every machine instead of turning into mojibake.
    Bytes that are not valid in ``encoding`` are replaced with U+FFFD instead
    of raising, so a single badly encoded article does not abort a whole run.

    Args:
        filename: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        The file content as a single string ('' for an empty file).
    """
    with open(filename, encoding=encoding, errors="replace") as f:
        return f.read()

#TODO extract summary types?
#TODO build drop_punct and drop_stop_words - DONE
#TODO build with_article_text
#TODO run against train, val, test sets - DONE
#TODO - split building summaries and saving to file  THEN run validation on saved summaries
#TODO - work out divide by zero issue / nan cosine

def basic_summaries(model, path, drop_punct=False, drop_common=False, max_chars=1600):
    """Summarise every article matching ``path`` and score it against its reference.

    For each file matched by the glob pattern ``path`` the article is turned
    into a ``Doc`` (optionally stripped of punctuation and/or common words),
    every tokenised sentence is ranked with ``Word2VecDoc.similarity`` and the
    highest-ranked sentences are joined with spaces until the summary is at
    least ``max_chars`` characters long or the sentences run out. The summary
    is printed and then handed to a ``Scorer`` together with the reference
    text and ``wv_doc.similar_to(...)`` of the reference's document vector.

    The reference file is the article path with its first ``News Article`` or
    ``News Articles`` occurrence replaced by ``Summaries``.

    Args:
        model: word-vector model passed through to ``Word2VecDoc.build``.
        path: glob pattern selecting the article files.
        drop_punct: call ``doc.strip_punctuation()`` before ranking.
        drop_common: call ``doc.strip_common_words()`` before ranking.
        max_chars: stop adding sentences once the summary reaches this length.

    Returns:
        The value of ``Scorer.averages()`` over all scored articles.

    Raises:
        ValueError: if the reference path cannot be derived from an article
            path (otherwise the article would be scored against itself).
    """
    import re  # local import: only needed to derive the reference path

    scorer = Scorer()

    for article in glob.glob(path):
        # Accept both spellings of the articles directory used in this notebook.
        ref_name = re.sub(r"News Articles?", "Summaries", article, count=1)
        if ref_name == article:
            raise ValueError(
                "cannot derive the reference summary path from %r: expected a "
                "'News Article' or 'News Articles' directory in the path" % article
            )

        c = filecontent(article)
        doc = Doc.build(c)
        if drop_punct:
            doc.strip_punctuation()
        if drop_common:
            doc.strip_common_words()

        wv_doc = Word2VecDoc.build(model, doc)

        # pairs of [sentence number, similarity of that sentence]
        sentence_rank = [
            [i, wv_doc.similarity(ts)]
            for i, ts in enumerate(doc.tokenised_sentences)
        ]
        if not sentence_rank:
            # No sentences to pick from: skip instead of failing on index 0.
            continue

        # Highest similarity first. A NaN score (x != x) compares false against
        # everything and would leave the order undefined, so it is ranked last.
        ordered_sentences = sorted(
            sentence_rank,
            key=lambda pair: float("-inf") if pair[1] != pair[1] else pair[1],
            reverse=True,
        )

        # build the resulting summary: always the best sentence, then more
        # sentences while the text is still shorter than max_chars
        result = doc.sentences[ordered_sentences[0][0]]
        for index, _score in ordered_sentences[1:]:
            if len(result) >= max_chars:
                break
            result += ' ' + doc.sentences[index]
        print(result)

        # validate against reference file
        reference = filecontent(ref_name)
        ref_doc = Doc.build(reference)
        wv_ref_doc = Word2VecDoc.build(model, ref_doc)

        scorer.add(result, reference, wv_doc.similar_to(wv_ref_doc.doc_vector))

    return scorer.averages()


# Brown Model

In [4]:
# Model from build_brown_model(), evaluated on 001.txt of every
# */Training directory under ./News Articles.
model = Word2VecModel.build_brown_model()

print("Average scores for single training data")
print(
    basic_summaries(
        model=model,
        path="./News Articles/*/Training/001.txt",
    )
)


Average scores for single training data
{'rouge-1': {'f': 0.6339626676410022, 'p': 0.5290867872744981, 'r': 0.8172959038687939}, 'rouge-2': {'f': 0.5364965543545195, 'p': 0.43682566859663635, 'r': 0.7338738533140157}, 'rouge-l': {'f': 0.5734875978404441, 'p': 0.5267544059335487, 'r': 0.8137633797480344}, 'cosine': 0.9772376094227511}


All data...

{'rouge-1': {'f': 0.6543733394817078, 'p': 0.5412527577174195, 'r': 0.8686513312834784}, 'rouge-2': {'f': 0.56886931959749, 'p': 0.4602217950850871, 'r': 0.7975526243518252}, 'rouge-l': {'f': 0.5818416467919256, 'p': 0.5378988721498956, 'r': 0.864464556732853}}

In [None]:
# Model from build_brown_model(), evaluated on every Training article.
model = Word2VecModel.build_brown_model()

print("Average scores for all training data")
print(
    basic_summaries(
        model=model,
        path="./News Articles/*/Training/*.txt",
    )
)


In [None]:
# Model from build_brown_model(), evaluated on every Validation article.
model = Word2VecModel.build_brown_model()

print("Average scores for all validation data")
print(
    basic_summaries(
        model=model,
        path="./News Articles/*/Validation/*.txt",
    )
)


In [None]:
# Model from build_brown_model(), evaluated on every Testing article.
model = Word2VecModel.build_brown_model()

print("Average scores for final test data")
print(
    basic_summaries(
        model=model,
        path="./News Articles/*/Testing/*.txt",
    )
)


# Reuters Model

In [2]:
# Model from build_reuters_model(), evaluated on every Training article.
model = Word2VecModel.build_reuters_model()

print("Average scores for all training data")
print(
    basic_summaries(
        model=model,
        path="./News Articles/*/Training/*.txt",
    )
)

Average scores for all training data


  s_vector = s_vector / s_count


{'rouge-1': {'f': 0.6525187391119968, 'p': 0.539012385733829, 'r': 0.868348460068383}, 'rouge-2': {'f': 0.5672608635815085, 'p': 0.4582970071062302, 'r': 0.7975077448662234}, 'rouge-l': {'f': 0.5794827074299134, 'p': 0.5357427628124779, 'r': 0.864229339908405}, 'cosine': nan}


# Augmenting existing model with training article sentences

In [None]:
def add_sentences_to_model(model, path):
    """Call ``model.train`` on the sentences of every article matching ``path``.

    Each file matched by the glob pattern ``path`` is read, turned into a
    ``Doc``, and its tokenised sentences are passed to ``model.train`` with
    ``epochs=model.iter``. The result of ``train`` is discarded and the
    function itself returns ``None``.
    """
    # NOTE(review): if `model` is a gensim Word2Vec, its train() also expects
    # `total_examples` (and new words need a vocabulary update first) —
    # confirm against the Word2VecModel implementation.
    for article_path in glob.glob(path):
        article_doc = Doc.build(filecontent(article_path))
        model.train(article_doc.tokenised_sentences, epochs=model.iter)



# Dropping punctuation, common words or both

In [4]:
# Model from build_reuters_model(), evaluated on one article of the Testing split.
model = Word2VecModel.build_reuters_model()

# The label names what is actually evaluated: a single Testing file,
# not the whole training set.
print("Average scores for a single testing article")
print(
    basic_summaries(
        model=model,
        path="../News Article/Business/Testing/015.txt",
    )
)

Average scores for all training data
{'rouge-1': {'f': 0.7085020197163534, 'p': 1.0, 'r': 0.54858934169279}, 'rouge-2': {'f': 0.6277372218911503, 'p': 0.9662921348314607, 'r': 0.4648648648648649}, 'rouge-l': {'f': 0.6125556679501356, 'p': 1.0, 'r': 0.54858934169279}, 'cosine': 1.0}


In [27]:
# Model from build_brown_model(), evaluated on one article of the Testing split.
model = Word2VecModel.build_brown_model()

# The label names what is actually evaluated: a single Testing file,
# not the whole training set.
print("Average scores for a single testing article")
print(
    basic_summaries(
        model=model,
        path="../News Article/Business/Testing/015.txt",
    )
)

Average scores for all training data
Andy Clarke, ERAA director of air transport, said that the EC advice misleads customers as it leads them to believe that airlines could be liable for payouts if flights are delayed because of bad weather. The ERAA's Mr Clarke also warned that while airlines would comply with the new rules, the extra costs would be passed onto passengers. Airlines fear that "extraordinary circumstances" may not include bad weather, security alerts or strikes - events which are outside of their control. In addition, if a flight is cancelled or delayed for more than two hours through the fault of the airline, all passengers must be paid compensation. "The boom in air travel needs to be accompanied by proper protection of passengers' right." "We reckon it's going to cost European air passengers - not the airlines, the airlines have no money, it has to be paid by passengers - 1.5bn euros, that's over Â£1bn a year loaded onto European passengers," Mr Clarke said. New tech

In [29]:
# Model from build_brown_model(), evaluated on one article of the Testing split.
model = Word2VecModel.build_brown_model()

# The label names what is actually evaluated: a single Testing file,
# not the whole training set.
print("Average scores for a single testing article")
print(
    basic_summaries(
        model=model,
        path="../News Article/Business/Testing/015.txt",
    )
)

Average scores for all training data
Andy Clarke, ERAA director of air transport, said that the EC advice misleads customers as it leads them to believe that airlines could be liable for payouts if flights are delayed because of bad weather. The ERAA's Mr Clarke also warned that while airlines would comply with the new rules, the extra costs would be passed onto passengers. Airlines fear that "extraordinary circumstances" may not include bad weather, security alerts or strikes - events which are outside of their control. In addition, if a flight is cancelled or delayed for more than two hours through the fault of the airline, all passengers must be paid compensation. "The boom in air travel needs to be accompanied by proper protection of passengers' right." "We reckon it's going to cost European air passengers - not the airlines, the airlines have no money, it has to be paid by passengers - 1.5bn euros, that's over Â£1bn a year loaded onto European passengers," Mr Clarke said. New tech

In [31]:
# Model from build_brown_model(); one Testing article summarised with both
# punctuation and common words stripped before ranking.
model = Word2VecModel.build_brown_model()

# The label names what is actually evaluated: a single Testing file,
# not the whole training set.
print("Average scores for a single testing article (drop_punct=True, drop_common=True)")
print(
    basic_summaries2(
        model=model,
        path="../News Article/Business/Testing/015.txt",
        drop_punct=True,
        drop_common=True,
    )
)

Average scores for all training data
Andy Clarke, ERAA director of air transport, said that the EC advice misleads customers as it leads them to believe that airlines could be liable for payouts if flights are delayed because of bad weather. However, only scheduled flight operators were obliged to offer compensation in cases of overbooking and they did not have to offer compensation for flight cancellations. The ERAA's Mr Clarke also warned that while airlines would comply with the new rules, the extra costs would be passed onto passengers. Airlines fear that "extraordinary circumstances" may not include bad weather, security alerts or strikes - events which are outside of their control. "The boom in air travel needs to be accompanied by proper protection of passengers' right." Air passengers who are unable to board their flights because of overbooking, cancellations or flight delays can now demand greater compensation. "That's basically a transfer of money from passengers whose journe

In [12]:
# Model from build_brown_model(); one Testing article summarised with
# punctuation stripped before ranking.
model = Word2VecModel.build_brown_model()

# The label names what is actually evaluated: a single Testing file,
# not the whole training set.
print("Average scores for a single testing article (drop_punct=True, drop_common=False)")
print(
    basic_summaries(
        model=model,
        path="../News Article/Business/Testing/015.txt",
        drop_punct=True,
        drop_common=False,
    )
)

Average scores for all training data
Andy Clarke, ERAA director of air transport, said that the EC advice misleads customers as it leads them to believe that airlines could be liable for payouts if flights are delayed because of bad weather. The ERAA's Mr Clarke also warned that while airlines would comply with the new rules, the extra costs would be passed onto passengers. "The boom in air travel needs to be accompanied by proper protection of passengers' right." On Wednesday, Jacques Barrot, vice president of the European Commission and also Commissioner for Transport, said that the changes were necessary. New technology means it is easier for airlines to take off and land in bad weather, she added. However, only scheduled flight operators were obliged to offer compensation in cases of overbooking and they did not have to offer compensation for flight cancellations. Airlines fear that "extraordinary circumstances" may not include bad weather, security alerts or strikes - events which

In [13]:
# Model from build_brown_model(); one Testing article summarised with
# common words stripped before ranking.
model = Word2VecModel.build_brown_model()

# The label names what is actually evaluated: a single Testing file,
# not the whole training set.
print("Average scores for a single testing article (drop_punct=False, drop_common=True)")
print(
    basic_summaries(
        model=model,
        path="../News Article/Business/Testing/015.txt",
        drop_punct=False,
        drop_common=True,
    )
)

Average scores for all training data
Andy Clarke, ERAA director of air transport, said that the EC advice misleads customers as it leads them to believe that airlines could be liable for payouts if flights are delayed because of bad weather. The ERAA's Mr Clarke also warned that while airlines would comply with the new rules, the extra costs would be passed onto passengers. Airlines fear that "extraordinary circumstances" may not include bad weather, security alerts or strikes - events which are outside of their control. However, only scheduled flight operators were obliged to offer compensation in cases of overbooking and they did not have to offer compensation for flight cancellations. Air passengers who are unable to board their flights because of overbooking, cancellations or flight delays can now demand greater compensation. "That's basically a transfer of money from passengers whose journeys are not disrupted to passengers whose journeys are disrupted." "We reckon it's going to c

In [30]:
import glob

from word2vec_doc import Word2VecModel, Word2VecDoc
from doc import Doc
from scorer import Scorer

def filecontent(filename, encoding="utf-8"):
    """Return the whole text content of ``filename``.

    The file is decoded with an explicit ``encoding`` (UTF-8 by default)
    rather than the platform default, so characters such as the pound sign
    are read the same way on every machine instead of turning into mojibake.
    Bytes that are not valid in ``encoding`` are replaced with U+FFFD instead
    of raising, so a single badly encoded article does not abort a whole run.

    Args:
        filename: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        The file content as a single string ('' for an empty file).
    """
    with open(filename, encoding=encoding, errors="replace") as f:
        return f.read()

#TODO extract summary types?
#TODO build drop_punct and drop_stop_words - DONE
#TODO build with_article_text
#TODO run against train, val, test sets - DONE
#TODO - split building summaries and saving to file  THEN run validation on saved summaries
#TODO - work out divide by zero issue / nan cosine

def basic_summaries2(model, path, drop_punct=False, drop_common=False, compression=2.8):
    """Summarise every article matching ``path`` to a fraction of its length and score it.

    Works like a length-relative variant of the fixed-size summariser: each
    article is turned into a ``Doc`` (optionally stripped of punctuation
    and/or common words), its tokenised sentences are ranked with
    ``Word2VecDoc.similarity`` and the highest-ranked sentences are joined
    with spaces until the summary is at least ``len(article) / compression``
    characters long or the sentences run out. The summary is printed and then
    handed to a ``Scorer`` together with the reference text and
    ``wv_doc.similar_to(...)`` of the reference's document vector.

    The reference file is the article path with its first ``News Article`` or
    ``News Articles`` occurrence replaced by ``Summaries``.

    Args:
        model: word-vector model passed through to ``Word2VecDoc.build``.
        path: glob pattern selecting the article files.
        drop_punct: call ``doc.strip_punctuation()`` before ranking.
        drop_common: call ``doc.strip_common_words()`` before ranking.
        compression: divisor applied to the article's character count to get
            the target summary length.

    Returns:
        The value of ``Scorer.averages()`` over all scored articles.

    Raises:
        ValueError: if the reference path cannot be derived from an article
            path (otherwise the article would be scored against itself).
    """
    import re  # local import: only needed to derive the reference path

    scorer = Scorer()

    for article in glob.glob(path):
        # Accept both spellings of the articles directory used in this notebook.
        ref_name = re.sub(r"News Articles?", "Summaries", article, count=1)
        if ref_name == article:
            raise ValueError(
                "cannot derive the reference summary path from %r: expected a "
                "'News Article' or 'News Articles' directory in the path" % article
            )

        c = filecontent(article)
        doc = Doc.build(c)
        if drop_punct:
            doc.strip_punctuation()
        if drop_common:
            doc.strip_common_words()

        wv_doc = Word2VecDoc.build(model, doc)

        # pairs of [sentence number, similarity of that sentence]
        sentence_rank = [
            [i, wv_doc.similarity(ts)]
            for i, ts in enumerate(doc.tokenised_sentences)
        ]
        if not sentence_rank:
            # No sentences to pick from: skip instead of failing on index 0.
            continue

        # Highest similarity first. A NaN score (x != x) compares false against
        # everything and would leave the order undefined, so it is ranked last.
        ordered_sentences = sorted(
            sentence_rank,
            key=lambda pair: float("-inf") if pair[1] != pair[1] else pair[1],
            reverse=True,
        )

        # build the resulting summary: always the best sentence, then more
        # sentences while the text is still shorter than the target length
        target_len = len(c) / compression
        result = doc.sentences[ordered_sentences[0][0]]
        for index, _score in ordered_sentences[1:]:
            if len(result) >= target_len:
                break
            result += ' ' + doc.sentences[index]
        print(result)

        # validate against reference file
        reference = filecontent(ref_name)
        ref_doc = Doc.build(reference)
        wv_ref_doc = Word2VecDoc.build(model, ref_doc)

        scorer.add(result, reference, wv_doc.similar_to(wv_ref_doc.doc_vector))

    return scorer.averages()


In [24]:
# One article: character count, then the number of sentences Doc finds in it.
c = filecontent("../News Article/Tech/Training/003.txt")
print(len(c))
doc = Doc.build(c)
print(len(doc.sentences))

# The same two numbers for its reference summary. A space is inserted after
# every full stop before the text is split
# (presumably the summary file has no whitespace between sentences — confirm).
c1 = filecontent("../Summaries/Tech/Training/003.txt").replace('.', '. ')
print(len(c1))
doc = Doc.build(c1)
print(len(doc.sentences))
    

1330
11
469
4


In [58]:
import glob

from word2vec_doc import Word2VecModel, Word2VecDoc
from doc import Doc
from scorer import Scorer

def filecontent(filename, encoding="utf-8"):
    """Announce and return the whole text content of ``filename``.

    Prints an "opening:" line naming the file, then reads it. The file is
    decoded with an explicit ``encoding`` (UTF-8 by default) rather than the
    platform default, so characters such as the pound sign are read the same
    way on every machine instead of turning into mojibake. Bytes that are not
    valid in ``encoding`` are replaced with U+FFFD instead of raising.

    Args:
        filename: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        The file content as a single string ('' for an empty file).
    """
    print("opening: ", filename,"\n")
    with open(filename, encoding=encoding, errors="replace") as f:
        return f.read()

#TODO extract summary types?
#TODO build drop_punct and drop_stop_words - DONE
#TODO build with_article_text
#TODO run against train, val, test sets - DONE
#TODO - split building summaries and saving to file  THEN run validation on saved summaries
#TODO - work out divide by zero issue / nan cosine

def basic_summaries3(model, path, drop_punct=False, drop_common=False, max_chars=1600):
    """Summarise every article matching ``path`` in source order and score it.

    Sentences are selected exactly as in the rank-ordered summariser: each
    article is turned into a ``Doc`` (optionally stripped of punctuation
    and/or common words), its tokenised sentences are ranked with
    ``Word2VecDoc.similarity`` and the highest-ranked ones are taken until
    their space-joined length reaches ``max_chars`` or the sentences run out.
    The selected sentences are then re-sorted into their original order and
    emitted as ``doc.sentences[0]``, a blank line, and the selected sentences
    joined with spaces. ``doc.sentences[0]`` is always prepended, so it
    appears twice when sentence 0 is itself selected; it is not counted
    towards ``max_chars``.

    The summary and the reference text are printed, and the summary is handed
    to a ``Scorer`` together with the reference text and
    ``wv_doc.similar_to(...)`` of the reference's document vector. The
    reference file is the article path with its first ``News Article`` or
    ``News Articles`` occurrence replaced by ``Summaries``.

    Args:
        model: word-vector model passed through to ``Word2VecDoc.build``.
        path: glob pattern selecting the article files.
        drop_punct: call ``doc.strip_punctuation()`` before ranking.
        drop_common: call ``doc.strip_common_words()`` before ranking.
        max_chars: stop selecting sentences once their joined length reaches
            this many characters.

    Returns:
        The value of ``Scorer.averages()`` over all scored articles.

    Raises:
        ValueError: if the reference path cannot be derived from an article
            path (otherwise the article would be scored against itself).
    """
    import re  # local import: only needed to derive the reference path

    scorer = Scorer()

    for article in glob.glob(path):
        # Accept both spellings of the articles directory used in this notebook.
        ref_name = re.sub(r"News Articles?", "Summaries", article, count=1)
        if ref_name == article:
            raise ValueError(
                "cannot derive the reference summary path from %r: expected a "
                "'News Article' or 'News Articles' directory in the path" % article
            )

        c = filecontent(article)
        doc = Doc.build(c)
        if drop_punct:
            doc.strip_punctuation()
        if drop_common:
            doc.strip_common_words()

        wv_doc = Word2VecDoc.build(model, doc)

        # pairs of [sentence number, similarity of that sentence]
        sentence_rank = [
            [i, wv_doc.similarity(ts)]
            for i, ts in enumerate(doc.tokenised_sentences)
        ]
        if not sentence_rank:
            # No sentences to pick from: skip instead of failing on index 0.
            continue

        # Highest similarity first. A NaN score (x != x) compares false against
        # everything and would leave the order undefined, so it is ranked last.
        ordered_sentences = sorted(
            sentence_rank,
            key=lambda pair: float("-inf") if pair[1] != pair[1] else pair[1],
            reverse=True,
        )

        # Select sentence numbers: always the best one, then more while the
        # space-joined text of the selection is still shorter than max_chars.
        result_order = [ordered_sentences[0][0]]
        selected_len = len(doc.sentences[result_order[0]])
        for index, _score in ordered_sentences[1:]:
            if selected_len >= max_chars:
                break
            result_order.append(index)
            selected_len += 1 + len(doc.sentences[index])  # +1 for the joining space

        #Ordering the sentences in the result according to the source text
        result_order.sort()

        # first sentence of the article, a blank line, then the selection
        body = ' '.join(doc.sentences[index] for index in result_order)
        result = doc.sentences[0] + "\n" + "\n" + body
        print(result,"\n")

        # validate against reference file
        reference = filecontent(ref_name)
        ref_doc = Doc.build(reference)
        wv_ref_doc = Word2VecDoc.build(model, ref_doc)

        scorer.add(result, reference, wv_doc.similar_to(wv_ref_doc.doc_vector))

        print(reference)

    return scorer.averages()


In [60]:
# Model from build_reuters_model(); source-ordered summary of one Training article.
model = Word2VecModel.build_reuters_model()

# The scores are kept in `x` as well as printed.
x = basic_summaries3(
    model=model,
    path="../News Article/Business/Training/020.txt",
)

#print("Average scores for all training data")
print(x)

{'rouge-1': {'f': 0.5626740897550454, 'p': 0.5126903553299492, 'r': 0.6234567901234568}, 'rouge-2': {'f': 0.430426711167179, 'p': 0.4013840830449827, 'r': 0.464}, 'rouge-l': {'f': 0.5413382022757234, 'p': 0.5025380710659898, 'r': 0.6111111111111112}, 'cosine': 0.9896161300923307}

opening:  ../News Article/Business/Training/020.txt 

Call centre users 'lose patience'.

"Customers are getting used to the idea of an 'always available' society," says Cara Diemont of IT firm Dimension Data, which commissioned the survey. When automated phone message systems are taken out of the equation, where customers have to pick their way through multiple options and messages, the number of abandoned calls is even higher - a sixth of all callers give up rather than wait. One possible reason for the lack in patience, Ms Diemont says, is the fact that more customers are calling 'on the move' using their mobile phones. The surge in customers trying to get through to call centres is also a reflection of the centres' growing range of tasks. In what Dimension Data calls an "alarming development", the average induction time for a call centre worker fell last year from 36 to just 21 days, leaving "agents not equipped to deal with customers". As a result, call centres have a high "churn 

{'rouge-1': {'f': 0.5626740897550454,
  'p': 0.5126903553299492,
  'r': 0.6234567901234568},
 'rouge-2': {'f': 0.430426711167179, 'p': 0.4013840830449827, 'r': 0.464},
 'rouge-l': {'f': 0.5413382022757234,
  'p': 0.5025380710659898,
  'r': 0.6111111111111112},
 'cosine': 0.9896161300923307}