# Dependencies

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import glob

# Load Data

In [2]:
sites = glob.glob('*.gz')

In [3]:
site_names = [site.split('.')[0] for site in sites]
site_names

['24',
 'hvg',
 'index',
 'm4sport',
 'metropol',
 'nepszava',
 'origo',
 'portfolio',
 'telex']

In [4]:
dfs = {}

for site, name in zip(sites, site_names):
    df = pd.read_json(site, lines=True, compression='gzip')
    dfs[name] = df

In [5]:
all_df = pd.concat(dfs.values())
all_df.head()

Unnamed: 0,uuid,title,lead,article,domain,url,date_of_creation,cc_date,tags
0,e8e3aee8-070d-4655-a9ba-349dc9eb50f0,"Inspiráció felsőfokon, avagy az egyediség tesz...","A lenyűgöző otthon tervezője és megálmodója, M...",Manon Valesca Maria a “The Life Traveler” blog...,24.hu,https://24.hu/otthon/2017/08/24/inspiracio-fel...,2017-08-24T09:15:00,2020-10-25T17:46:00,[Otthon]
1,cf031445-163e-444f-ba11-e7da1bd1e093,"Visszakérné férjétől a veséjét, mert a férfi e...","A megcsalt asszony azt kívánja, bárcsak valaki...","2009 őszén, a most 41 éves Samantha Lamb so...",24.hu,https://24.hu/elet-stilus/2014/01/27/visszaker...,2014-01-27T13:25:00,2020-07-10T07:38:12,[Élet-Stílus]
2,859b1db3-24ae-471e-9930-6ebc316b307a,Félévnyi alvás marad ki a kismamák és kispapák...,"Az első két évben a minimum 5 helyett van, aki...",Az újszülöttek szülei 6 hónapnyi alvási ...,24.hu,https://24.hu/elet-stilus/2010/07/30/felevnyi-...,2010-07-30T07:36:00,2021-09-22T00:36:12,[Élet-Stílus]
3,d8a6ea8b-a443-423a-8e1c-0803eb5f26ed,Neten is kampányol a Fidesz a Jobbik-alelnök l...,Szerintük Janiczak Dávid nemi erőszakkal fenye...,"A Fidesz ifjúsági szárnya, a Fidelitas onli...",24.hu,https://24.hu/belfold/2018/03/27/fidesz-jobbik...,2018-03-27T15:04:00,2018-11-16T18:33:49,[fidesz]
4,17a0a40b-7025-45f7-a412-1b3b1571ad79,A drónverseny már létezik és leszakítja az arcod,Nagyon magasak a részvételi adatok az időközi ...,Közelebb hoz egy lépéssel a Csillagok hábo...,24.hu,https://24.hu/tech/2017/06/24/a-dronverseny-ma...,2017-06-24T17:25:00,2018-02-25T18:06:49,[drón]


In [27]:
all_df.shape

(1148247, 9)

## Article Count

In [28]:
article_count = all_df.groupby('domain')['domain'].count().sort_values(ascending=True)
article_count

domain
telex.hu          4446
metropol.hu      11119
m4sport.hu       17915
portfolio.hu     22572
nepszava.hu      56738
index.hu        154494
hvg.hu          216755
origo.hu        304968
24.hu           359240
Name: domain, dtype: int64

## N-Grams

In [100]:
from collections import defaultdict
from nltk.corpus import stopwords
import string
punctuation = [',', ':', '!', '?', ';', '-', '_', '–', '—']

In [101]:
def generate_N_grams(text: str, ngram=1):
    words = [word for word in text.split(" ") if (word not in punctuation)] 
    temp = zip(*[words[i:] for i in range(0, ngram)])
    ans = [' '.join(ngram) for ngram in temp]
    return ans

## Novel N-gram Ratio

The percentage of n-grams in the summary that do not occur in the input article is a means of measuring abstractiveness

In [31]:
def novel_N_gram_ratio(ngram=1):
    n_grams = set()
    n_grams_not_in_article = set()
    article_words = set()
    
    for text in all_df.lead:
        for word in generate_N_grams(str(text), ngram):
            n_grams.add(word)
    
    for text in all_df.article:
        for word in generate_N_grams(str(text), ngram):
            article_words.add(word)
            
    n_grams_not_in_article = [n_gram for n_gram in n_grams if n_gram not in article_words]
    
    return len(n_grams_not_in_article) / len(n_grams)

In [32]:
unigram_ratio = novel_N_gram_ratio(ngram=1)
print(f"novel unigram ratio: {round(unigram_ratio, 4) * 100}%")

novel unigram ratio: 78.0%


In [33]:
bigram_ratio = novel_N_gram_ratio(ngram=2)
print(f"novel bigram ratio: {round(bigram_ratio, 4) * 100}%")

novel bigram ratio: 90.09%


In [34]:
trigram_ratio = novel_N_gram_ratio(ngram=3)
print(f"novel trigram ratio: {round(trigram_ratio, 4) * 100}%")

novel trigram ratio: 95.48%


## Abstractivity

$$
  1 - \frac{\sum |Fragment|}{|Summary|}
$$

In [7]:
from newsroom.analyze import Fragments

In [8]:
def abstractivity():
    results = []
    for lead, article in zip(all_df.lead, all_df.article):
        fragment = Fragments(lead, article)
        results.append(1 - fragment.coverage()) #coverage method calculates sum(|fragment|)/|summary|
    return results

In [9]:
abstractivity_data = abstractivity()



In [10]:
abstractivity = sum(abstractivity_data) / len(abstractivity_data)
print(f"abstractivity: {round(abstractivity, 4) * 100}")

abstractivity: 58.13


## Compression

$$
1 - \frac{|Summary|}{|Article|}
$$

In [35]:
def compression():
    results = []
    for lead, article in zip(all_df.lead, all_df.article):
        lead_len = 0
        article_len = 0
        for word in lead.split(" "):
            lead_len += 1
        for word in article.split(" "):
            article_len += 1
        results.append(1 - lead_len / article_len)
    return results

In [36]:
compression_data = compression()

In [37]:
compression = sum(compression_data) / len(compression_data)
print(f"compression: {round(compression, 4) * 100}")

compression: 88.58


## Redundancy

$$
  \frac{\sum (Frequency - 1)}{\sum Frequency}
$$

In [109]:
def redundancy(ngram=1):
    results = []
    
    for text in all_df.lead:
        n_grams = defaultdict(int)
        numerator = []
        for word in generate_N_grams(str(text), ngram):
            n_grams[word] += 1

        if(sum(n_grams.values()) == 0):
            break
            
        numerator = [value - 1 for value in n_grams.values()]
        results.append(sum(numerator) / sum(n_grams.values()))
        
    return results

In [110]:
unigram_redundancy_data = redundancy(ngram=1)
unigram_redundancy = sum(unigram_redundancy_data) / len(unigram_redundancy_data)
print(f"unigram redundancy: {round(unigram_redundancy, 4) * 100}")

unigram redundancy: 6.58


In [130]:
bigram_redundancy_data = redundancy(ngram=2)
bigram_redundancy = sum(bigram_redundancy_data) / len(bigram_redundancy_data)
print(f"bigram redundancy: {round(bigram_redundancy, 4) * 100}")

bigram redundancy: 0.1


## Utils

In [132]:
def rounding(metric):
    return round(metric, 4) * 100

## Evaluation

In [133]:
data = {
    'NNG-1': [rounding(unigram_ratio)],
    'NNG-2': [rounding(bigram_ratio)],
    'NNG-3': [rounding(trigram_ratio)],
    'ABS':   ['TBD'],
    'CMP':   [rounding(compression)],
    'RED-1': [rounding(unigram_redundancy)],
    'RED-2': [rounding(bigram_redundancy)],
}

pd.DataFrame(data=data)

Unnamed: 0,NNG-1,NNG-2,NNG-3,ABS,CMP,RED-1,RED-2
0,78.0,90.09,95.48,TBD,88.58,6.58,0.1


##### It is preferred of a good summary to have a high novel n-gram ratio, abstractivity, and compression; while having a low redundancy score...