# Libraries

In [3]:
import re
import requests
import json
import numpy as np
import math
import nltk
import csv
import statistics as st
from rouge_score import rouge_scorer
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aldo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# General Functions

In [4]:
#Counts the documents in a category (in the entire collection if category not specified)
def count_docs(collection, category="*"):
    r = requests.get('http://localhost:8984/solr/'+collection+'/select?q=category:'+category+'&wt=json').json()
    return r['response']['numFound']

#Returns a document given its docId
def get_doc(collection, docId):
    r = requests.get('http://localhost:8984/solr/'+collection+'/select?q=docId:'+str(docId)+'&wt=json').json()
    return r['response']['docs'][0]

#Runs the custom analyzer on a sentence and returns a list of tokens
def analyze(sentence):
    sentenceT = re.sub('[\"\'&#]', ' ', sentence)
    r = requests.get('http://localhost:8984/solr/myDocs/stream?expr=analyze(\"'+sentenceT+'\", full_text)').json()
    return r['result-set']['docs'][0]['return-value']

#Extracts a list of sentences from a document
def extract_sentences(document):
    sentences = re.split(r' *[\.\?!][\'"\)\]]* *', document)
    sentences = [x.replace('"', "").replace("'", "") for x in sentences if x]
    return list(filter(None, sentences))

#Computes the cosine similarity of two documents' weighted vectors
def cosine_similarity(wvA, wvB):
    num = 0
    den1 = 0
    den2 = 0
    for i in range(0, len(wvA)):
        num += wvA[i]*wvB[i]
        den1 += wvA[i]*wvA[i]
        den2 += wvB[i]*wvB[i]
    den = math.sqrt(den1)*math.sqrt(den2)
    try:
        res = num/den
    except ZeroDivisionError:
        res = 0.0
    return res

# TF-IDF

In [5]:
#Returns the idf value of a term, based on the specified field 
def idf(field, term, docId):
    r = requests.get('http://localhost:8984/solr/myDocs/select?q=docId:'+str(docId)+'&fl=idf('+field+','+term+'),*&wt=json').json()
    return r['response']['docs'][0]['idf('+field+','+term+')']

#Returns the tf values of a term, in document with docId
def tf(field, term, docId):
    r = requests.get('http://localhost:8984/solr/myDocs/select?q=docId:'+str(docId)+'&fl=tf('+field+','+term+'),*&wt=json').json()
    return r['response']['docs'][0]['tf('+field+','+term+')']

def generate_summary(docId):
    doc = get_doc("myDocs", docId)
    sentences = extract_sentences(doc['full_text'])
    scores = []
    for sentence in sentences:
        score = 0
        analyzedSentence = analyze(sentence)
        for elem in analyzedSentence:
            score+= tf("full_text", elem, docId)*idf("full_text", elem, docId)   
        scores.append(score/len(analyzedSentence))
    
    scores = np.array(scores)
    topIdx = np.sort(np.argsort(scores)[-math.ceil(len(sentences)/3):])
    topValues = [sentences[i] for i in topIdx]
    return (". ".join(topValues)+".")
    

#### Summary Generation

In [6]:
summaries = []
for i in range(0, 100):
    summaries.append({"docId":i, "hypotesis":generate_summary(i), "reference":get_doc("myDocs", i)['summary'] })

#### Evaluation

In [7]:
def mean_score(summaries, rouge, scope):
    scorer = rouge_scorer.RougeScorer([rouge])
    scores = []
    for i in range(100):
        hyp = summaries[i]['hypotesis']
        ref = summaries[i]['reference']
        if scope == "precision":
            scores.append(scorer.score(hyp, ref)[rouge].precision)
        elif scope == "recall":
            scores.append(scorer.score(hyp, ref)[rouge].recall)
        elif scope == "fmeasure":
            scores.append(scorer.score(hyp, ref)[rouge].fmeasure)
    return st.mean(scores)

0.5776129097369671
0.761491938020721
0.6537314611777362

0.49015608237983693
0.6470373099065739
0.5551377897721256

0.4562320433604605
0.6036472285303791
0.5172375287688392

0.4019874712308498
0.5301989321686369
0.4549990602243374


# Feature Extraction

In [8]:
def get_article_info(docId):
    expression= "let(echo=\"a, b, c, d\", a=select(search(mySentences,\
    q=\"docId:"+str(docId)+"\", fl=\"sentence\", rows=1000), \
    analyze(sentence, sentence) as terms),\
    b=termVectors(a, minTermLength=0, minDocFreq=0, maxDocFreq=1),\
    c=getColumnLabels(b), \
    d=search(mySentences, q=\"docId:"+str(docId)+"\", fl=\"sentence, title, summary\", rows=1000))"

    response = requests.get('http://localhost:8984/solr/mySentences/stream?expr='+expression).json()

    dictionary = response['result-set']['docs'][0]['c']            
    sentenceTerms = response['result-set']['docs'][0]['a']          
    weightedVectors = response['result-set']['docs'][0]['b']
    title = response['result-set']['docs'][0]['d'][0]['title']
    
    sentences = []
    classes = []
    for elem in response['result-set']['docs'][0]['d']:    
        sentences.append(elem['sentence'])
        classes.append(elem['summary'])
    
    return {"title":title, "sentences":sentences, "classes": classes, "sentenceTerms":sentenceTerms,
            "weightedVectors":weightedVectors, "dictionary":dictionary}

#### Article-Independent Features

In [9]:
def sentence_lengths(sentenceTerms):
    weights = []
    for terms in sentenceTerms:
        weights.append(len(terms['terms']))
    return weights

def pos_tagging_features(sentence):
    sentenceT = re.sub('[\"\'&#]', ' ', sentence)
    r = requests.get('http://localhost:8984/solr/mySentences/stream?expr=analyze(\"'+sentenceT+'\", simpleTokenizer)').json()
    tagged = nltk.pos_tag(r['result-set']['docs'][0]['return-value'])

    properNouns = [word for (word, pos) in tagged if (pos == 'NNP' or pos == 'NNPS')]
    nouns = [word for (word, pos) in tagged if (pos == 'NN' or pos == 'NNS' or pos == 'NNP' or pos == 'NNPS')]
    verbs = [word for (word, pos) in tagged if (pos == 'VB' or pos == 'VBG' or pos == 'VBD' or pos == 'VBN' or pos == 'VBP' or pos == 'VBZ')]
    adjectives = [word for (word, pos) in tagged if (pos == 'JJ' or pos == 'JJR' or pos == 'JJS')]
    adverbs = [word for (word, pos) in tagged if (pos == 'RB' or pos == 'RBR' or pos == 'RBS')]
    
    if not properNouns:
        proper = False
    else: proper = True
    
    if tagged:
        nounRatio = len(nouns)/len(tagged)
        verbRatio = len(verbs)/len(tagged)
        adjectiveRatio = len(adjectives)/len(tagged)
        adverbRatio = len(adverbs)/len(tagged)
    else: nounRatio = verbRatio = adjectiveRatio = adverbRatio = 0.0
    
    return {"proper":proper, "nounRatio":nounRatio, 
            "verbRatio":verbRatio, "adjectiveRatio":adjectiveRatio, 
            "adverbRatio":adverbRatio}


#### Article-Dependent Features

In [10]:
def sentence_positions(sentences):
    nSent = len(sentences)
    return [x/nSent for x in list(range(0, nSent))]

def title_similarities(sentences, title):
    weights = []
    titleTerms = analyze(title)
    for sentence in sentences:
        sentenceTerms = analyze(sentence)
        try:
            commonTerms = len([x for x in sentenceTerms if x in titleTerms])/len(sentenceTerms)
        except ZeroDivisionError:
            commonTerms = 0.0
        weights.append(commonTerms)
    return weights

def sent_to_sent_cohesion(weightedVectors):
    rawValues = []
    for i in range(0, len(weightedVectors)):
        score = 0
        for j in range(0, len(weightedVectors)):
            if j!=i:
                score += cosine_similarity(weightedVectors[i], weightedVectors[j])
        rawValues.append(score)
    scores = [element/max(rawValues) for element in rawValues]
    return scores

def sent_to_centroid_cohesion(weightedVectors):
    centroid = np.zeros(len(weightedVectors[0]))
    for vector in weightedVectors:
        centroid += np.array(vector)
    centroid = (centroid/len(weightedVectors)).tolist()
    rawValues = []
    for vector in weightedVectors:
        score = cosine_similarity(vector, centroid)
        rawValues.append(score)

    scores = [element/max(rawValues) for element in rawValues]
    return scores  

def tf_isf(weightedVectors):
    #doc = get_doc_info(docId)
    weights = []
    for elem in weightedVectors:
        temp = [x for x in elem if x !=0]
        weights.append(sum(temp)/len(temp))
    return weights

#### Feature Matrix

In [11]:
def extract_features(docId):
    doc = get_article_info(docId)
    sentences = doc['sentences']
    classes = doc['classes']
    sentPos = sentence_positions(sentences)
    titleSim = title_similarities(sentences, doc['title'])
    sentLen = sentence_lengths(doc['sentenceTerms'])
    stsCoh = sent_to_sent_cohesion(doc['weightedVectors'])
    stcCoh = sent_to_centroid_cohesion(doc['weightedVectors'])
    
    features = []
    for i in range(0, len(sentences)):
        generalFeatures = {"sentence":sentences[i], "docId":docId, "sentPos":sentPos[i], 
                            "titleSim":titleSim[i], "sentLen":sentLen[i],
                            "stsCoh":stsCoh[i], "stcCoh":stcCoh[i]}
        posFeatures = pos_tagging_features(sentences[i])
        classSumm = {"class": classes[i]}
        features.append({**generalFeatures, **posFeatures, **classSumm})
    
    return features

#### Export Features to CSV

In [12]:
feats = []
length = count_docs("myDocs")
for i in range(0, length):
    feats.extend(extract_features(i))
    
csv_columns = ['sentence','docId','sentPos','titleSim',
               'sentLen', 'stsCoh', 'stcCoh', 'proper', 'nounRatio', 'verbRatio',
              'adjectiveRatio', 'adverbRatio', 'class']
dict_data = feats
csv_file = "features.csv"
try:
    with open(csv_file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in dict_data:
            writer.writerow(data)
except IOError:
    print("I/O error")