# Libraries

In [2]:
import csv
from sklearn import svm, metrics, naive_bayes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from rouge_score import rouge_scorer
import requests
import statistics as st

#A function from the other file that is also needed here
def get_doc(collection, docId):
    """Return the first Solr document whose ``docId`` field matches ``docId``.

    Issues a GET against the ``/select`` handler of ``collection`` on the
    Solr server at ``localhost:8984``, asks for a JSON response and returns
    the first entry of its ``response -> docs`` list.

    Args:
        collection: name of the Solr collection, spliced into the URL path.
        docId: identifier to look up; converted with ``str`` before use.

    Raises:
        IndexError: if the query matches no document.
    """
    # NOTE(review): the query is concatenated into the URL without escaping
    # and the HTTP status is not checked; acceptable only for simple ids on a
    # trusted local server.
    base_url = 'http://localhost:8984/solr/' + collection
    query = '/select?q=docId:' + str(docId) + '&wt=json'
    payload = requests.get(base_url + query).json()
    matching_docs = payload['response']['docs']
    return matching_docs[0]

# Feature Matrix Import
The feature matrix must first be pre-processed into a structure the classifiers can consume. Moreover, as stated in the paper, the dataset is split into a test set (the first 100 documents) and a training set (all remaining documents). Both operations are implemented in the following code:

In [3]:
# Load the feature matrix and split it into test rows (docId < 100) and
# training rows (every other document).
#
# Column layout as used below: row[0] is the sentence text, row[1] the
# document id, row[1:12] the feature vector and row[12] the class label.
# NOTE(review): the document id (row[1]) is part of the feature vector handed
# to the classifiers; later cells read it back from dataTest[i][0], so it is
# kept here -- confirm that training on it is intended.
#
# newline='' lets the csv module handle line endings itself, so quoted fields
# containing line breaks (possible in the sentence column) are parsed correctly.
with open('features.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    sentences, data, classes = [], [], []
    sentencesTest, dataTest, classesTest = [], [], []

    # The first row holds the column names; None when the file is empty.
    features = next(csv_reader, None)
    # line_count keeps counting every row read, header included.
    line_count = 0 if features is None else 1

    for row in csv_reader:
        if int(row[1]) < 100:
            sentencesTest.append(row[0])
            dataTest.append(row[1:12])
            classesTest.append(row[12])
        else:
            sentences.append(row[0])
            data.append(row[1:12])
            classes.append(row[12])
        line_count += 1

# Convert the string values read from the CSV into numbers for the classifiers.
def _labels_to_int(labels):
    """Map the string 'True' to 1 and every other value to 0."""
    return [1 if label == 'True' else 0 for label in labels]


def _rows_to_float(rows):
    """Convert every feature row to floats, in place.

    Column 6 holds a 'True'/'False' string, so it is mapped to 1/0 first;
    afterwards every cell of the row (that one included) is cast to float.
    """
    for row in rows:
        row[6] = 1 if row[6] == 'True' else 0
        # Slice assignment keeps the same list object, mirroring an
        # element-by-element overwrite.
        row[:] = [float(cell) for cell in row]


target = _labels_to_int(classes)
targetTest = _labels_to_int(classesTest)

_rows_to_float(data)
_rows_to_float(dataTest)

# Conventional scikit-learn names for the train / test split.
X_train, y_train = data, target
X_test, y_test = dataTest, targetTest

### SVM

In [4]:
# Fit a Nu-SVC with its default parameters, then measure accuracy on the test split.
modelSVM = svm.NuSVC().fit(X_train, y_train)
y_predSVM = modelSVM.predict(X_test)
accuracySVM = metrics.accuracy_score(y_test, y_predSVM)
print("SVM Accuracy:", accuracySVM)

SVM Accuracy: 0.6121134020618557


### Naive Bayes

In [5]:
# Fit a Gaussian Naive Bayes model with its default parameters, then measure
# accuracy on the test split.
modelNB = naive_bayes.GaussianNB().fit(X_train, y_train)
y_predNB = modelNB.predict(X_test)
accuracyNB = metrics.accuracy_score(y_test, y_predNB)
print("NB Accuracy:", accuracyNB)

NB Accuracy: 0.7371134020618557


### Decision Tree

In [6]:
# Fit a decision tree and measure accuracy on the test split.
# The seed is fixed because scikit-learn randomly permutes the features at
# every split; without it the tree -- and therefore the accuracy printed
# below -- can change from one run to the next.
modelDT = DecisionTreeClassifier(random_state=0)
modelDT.fit(X_train, y_train)
y_predDT = modelDT.predict(X_test)
print("DT Accuracy:",metrics.accuracy_score(y_test, y_predDT))

DT Accuracy: 0.6378865979381443


### Summary Generation

In [7]:
# Number of test documents (docIds 0 .. N_TEST_DOCS - 1).
N_TEST_DOCS = 100

# One record per test sentence: its document id, its text and the label each
# of the three classifiers predicted for it.
results = [
    {'docId': int(row[0]), 'sentence': sentence,
     'SVM': pred_svm, 'NB': pred_nb, 'DT': pred_dt}
    for row, sentence, pred_svm, pred_nb, pred_dt
    in zip(dataTest, sentencesTest, y_predSVM, y_predNB, y_predDT)
]

# Group the records by document once, preserving sentence order, instead of
# rescanning the whole list for every document and every model.
records_by_doc = {}
for record in results:
    records_by_doc.setdefault(record['docId'], []).append(record)

# Fetch each reference summary a single time; it is the same for all three
# models, so there is no need to query Solr once per model.
references = [get_doc('myDocs', doc_id)['summary'] for doc_id in range(N_TEST_DOCS)]


def build_summaries(model_key):
    """Build one summary record per test document for the given classifier.

    Args:
        model_key: key of the prediction column in ``results``
            ('SVM', 'NB' or 'DT').

    Returns:
        A list of dicts with keys 'docId', 'hypotesis' (the sentences the
        model labelled 1, joined with spaces; empty string if none) and
        'reference' (the 'summary' field of the document returned by get_doc).
    """
    return [
        {'docId': doc_id,
         'hypotesis': " ".join(record['sentence']
                               for record in records_by_doc.get(doc_id, [])
                               if record[model_key] == 1),
         'reference': references[doc_id]}
        for doc_id in range(N_TEST_DOCS)
    ]


summariesSVM = build_summaries('SVM')
summariesNB = build_summaries('NB')
summariesDT = build_summaries('DT')


### Evaluation

In [8]:
def mean_score(summaries, rouge, scope):
    """Average one ROUGE statistic over a collection of generated summaries.

    Args:
        summaries: iterable of dicts holding the generated text under
            'hypotesis' and the gold summary under 'reference'.
        rouge: ROUGE variant understood by ``rouge_scorer.RougeScorer``
            (e.g. 'rouge1', 'rouge2', 'rougeL').
        scope: which statistic to average: 'precision', 'recall' or
            'fmeasure'.

    Returns:
        The mean of the chosen statistic over all summaries, rounded to
        4 decimal places.

    Raises:
        ValueError: if ``scope`` is not one of the three supported names.
        statistics.StatisticsError: if ``summaries`` is empty.
    """
    if scope not in ("precision", "recall", "fmeasure"):
        raise ValueError(
            "scope must be 'precision', 'recall' or 'fmeasure', got %r" % (scope,))

    scorer = rouge_scorer.RougeScorer([rouge])
    scores = []
    # Score every summary that was passed in rather than assuming exactly 100.
    for summary in summaries:
        # RougeScorer.score takes (target, prediction): the reference must come
        # first, otherwise precision and recall are swapped.
        result = scorer.score(summary['reference'], summary['hypotesis'])[rouge]
        scores.append(getattr(result, scope))
    return round(st.mean(scores), 4)
