## Evaluation de la présence de terms de subjectivité et leur puissance

In [1]:
import re
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
def tokenize(text):
    return [str(word) for word in nlp(str(text))]

def most_frequent(List):
    return max(set(List), key = List.count)

In [3]:
def subj_eval(file_path):
    
    subj_referenecs = pd.read_csv("./subjectivity_clues.csv")
    df_to_eval = pd.read_csv(file_path)
    summaries = list(df_to_eval["summary"])
    #summaries = list(df_to_eval["summ_3"])
    
    n_weak_scores, weak_scores = [], []
    n_strong_scores,strong_scores = [], []
    
    for i, summary in enumerate(summaries):
        n_weak_score, weak_score = 0., 0.
        n_strong_score, strong_score = 0., 0.
        token_list = tokenize(summary)
        for token in token_list:
            df_temp = subj_referenecs[subj_referenecs["word"]==token]
            if len(df_temp) > 0:
                type_ = most_frequent(list(df_temp["type"]))
                if type_ == "weaksubj":
                    weak_score += 1
                    n_weak_score += 1/len(token_list)
                if type_ == "strongsubj":
                    strong_score +=1
                    n_strong_score += 1/len(token_list)
        
        weak_scores.append(weak_score)
        n_weak_scores.append(n_weak_score)
        strong_scores.append(strong_score)
        n_strong_scores.append(n_strong_score)
    
    return weak_scores, n_weak_scores, strong_scores, n_strong_scores

In [12]:
path_meansum = "../baselines/meansum_results_new.csv"
path_gpt2 = "../baselines/gpt2_summaries.csv"
path_reference = "../baselines/reference_summaries.csv"
path_textrank = "../baselines/textrank_summaries.csv"
our_path = "../outputs/train_300epochs.baseHtilt_cosineHhatversHhat_HtiltmeanHtiltcontext_FULL.csv"

In [13]:
weak_scores, n_weak_scores, strong_scores, n_strong_scores = subj_eval(our_path)

In [14]:
print(sum(weak_scores)/len(weak_scores))
print(sum(strong_scores)/len(strong_scores))

2.6473429951690823
0.5072463768115942


In [15]:
print(sum(n_weak_scores)/len(n_weak_scores))
print(sum(n_strong_scores)/len(n_strong_scores))

0.016966197265221812
0.007764534603548347


In [8]:
count_weak = 0
count_strong = 0
count_neutre = 0
for i,_ in enumerate(weak_scores):
    if weak_scores[i] > strong_scores[i]:
        count_weak += 1
    elif weak_scores[i] < strong_scores[i]:
        count_strong += 1
    else:
        count_neutre += 1
print("nombre de document ou il y a plus de weak que de strong :", count_weak)
print("nombre de document ou il y a plus de strong que de weak :", count_strong)
print("nombre de document ou same :", count_neutre)

nombre de document ou il y a plus de weak que de strong : 91
nombre de document ou il y a plus de strong que de weak : 82
nombre de document ou same : 34


In [9]:
count_weak = 0
count_strong = 0
count_neutre = 0
for i,_ in enumerate(n_weak_scores):
    if n_weak_scores[i] > n_strong_scores[i]:
        count_weak += 1
    elif n_weak_scores[i] < n_strong_scores[i]:
        count_strong += 1
    else:
        count_neutre += 1
print("nombre de document ou il y a plus de weak que de strong :", count_weak)
print("nombre de document ou il y a plus de strong que de weak :", count_strong)
print("nombre de document ou same :", count_neutre)

nombre de document ou il y a plus de weak que de strong : 91
nombre de document ou il y a plus de strong que de weak : 82
nombre de document ou same : 34


## Evaluation VADER

In [8]:
from nltk.corpus import stopwords
stopwords_nltk = stopwords.words('english')

In [9]:
def tokenize_vader(text, filter_POS=['PUNCT', 'DET'], stopwords_n=[]):
    list_ = []
    for token in nlp(str(text)):
        if token.text in stopwords_n:
            continue
        elif token.pos_ in filter_POS:
            continue
        else:
            list_.append(token.text.lower())
    return ' '.join(list_)

In [10]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
# function to print sentiments
# of the sentence.
def sentiment_scores(file_path):
     
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
     
    df = pd.read_csv(file_path)
    summaries = list(df["summary"])
    #summaries = list(df["summ_2"])
    
    neu_dim, pos_dim, neg_dim = 0, 0, 0
    neg_comp, pos_comp, neu_comp= 0, 0, 0

    for summary in summaries : 
        summary = tokenize_vader(summary, stopwords_n=stopwords_nltk)
        sentiment_dict = sid_obj.polarity_scores(summary)
        ########### 1 -- -Getting main dimension
        neg_score = sentiment_dict['neg']
        neu_score = sentiment_dict['neu']
        pos_score = sentiment_dict['pos']

        main_dim_sent = sentiment_dict.copy()
        del main_dim_sent['compound']
        dimension = (max(main_dim_sent, key=main_dim_sent.get))
        if dimension == 'neg':
            neg_dim += 1
        if dimension == 'pos':
            pos_dim += 1
        if dimension == 'neu':
            neu_dim += 1
        ################## 2 -- Getting the compound sentiment of summary
        if sentiment_dict['compound'] >= 0.6 :
            pos_comp += 1
        elif sentiment_dict['compound'] <= - 0.6 :
            neg_comp += 1
        else: 
            neu_comp += 1
            
    return neu_dim, pos_dim, neg_dim, neg_comp, pos_comp, neu_comp

In [15]:
sentiment_scores(our_path)

(150, 57, 0, 0, 96, 111)