In [11]:
import os
import json
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

cleansed_path = 'cleansed_data.json'
with open(cleansed_path, 'r') as f:
    cleansed_data = json.load(f)


In [12]:
# Check if there is a duplicated 'summary'

duplicated_summaries = []

for i, data in tqdm(enumerate(cleansed_data)):
    if data['summary'] in [x['summary'] for x in cleansed_data[:i]]:
        duplicated_summaries.append(data['summary'])

print(f'Number of duplicated summaries: {len(duplicated_summaries)}')

0it [00:00, ?it/s]

56216it [03:55, 239.05it/s] 

Number of duplicated summaries: 0





In [14]:
# Check if there is a empty 'summary'

empty_summaries = []

for i, data in tqdm(enumerate(cleansed_data)):
    if data['summary'] == '':
        empty_summaries.append(i)

print(f'Number of empty summaries: {len(empty_summaries)}')

56216it [00:00, 1095700.11it/s]

Number of empty summaries: 0





In [15]:
# Check if there is a duplicate within each 'raw_documents'

duplicate_raw_documents = []

for i, data in tqdm(enumerate(cleansed_data)):
    # Check if there is a duplicate in data['raw_document']
    if len(data['raw_document']) != len(set(data['raw_document'])):
        duplicate_raw_documents.append(i)

print(f'Number of duplicate raw documents: {len(duplicate_raw_documents)}')


56216it [00:00, 236251.23it/s]

Number of duplicate raw documents: 570





In [22]:
# Check if there is an empty article within each 'raw_documents'

duplicate_raw_documents = []

for i, data in tqdm(enumerate(cleansed_data)):
    # Check if there is an empty article in data['raw_document']
    if '' in data['raw_document']:
        duplicate_raw_documents.append(i)

print(f'Number of empty raw documents: {len(duplicate_raw_documents)}')


56216it [00:00, 523783.70it/s]

Number of empty raw documents: 544





In [20]:
def get_abstractivity_ratio(article_list: list, summary: str) -> float:
    """
    Get get_abstractivity_ratio ratio of summary.
    """
    summary_words = summary.split()
    article_words = [word for article in article_list for word in article.split()]

    # Get number of words in summary that are also in article
    coverage = 0
    for word in summary_words:
        if word in article_words:
            coverage += 1

    return 1 - (coverage / len(summary_words))

def get_compression_ratio(article_list: list, summary: str) -> float:
    """
    Get compression ratio of summary.
    """
    summary_words = summary.split()
    article_words = [word for article in article_list for word in article.split()]

    return 1 - (len(summary_words) / len(article_words))

In [21]:
for i, data in tqdm(enumerate(cleansed_data)):
    data['abstractivity_ratio'] = get_abstractivity_ratio(data['raw_document'], data['summary'])
    data['compression_ratio'] = get_compression_ratio(data['raw_document'], data['summary'])

# Print average abstractivity ratio and compression ratio

abstractivity_ratios = [data['abstractivity_ratio'] for data in cleansed_data]
compression_ratios = [data['compression_ratio'] for data in cleansed_data]

print(f'Average abstractivity ratio: {sum(abstractivity_ratios) / len(abstractivity_ratios)}')
print(f'Average compression ratio: {sum(compression_ratios) / len(compression_ratios)}')

56216it [00:40, 1388.79it/s]

Average abstractivity ratio: 0.41416674804480585
Average compression ratio: 0.4618661500032838





In [28]:
# Average article amount & word length

for i, data in tqdm(enumerate(cleansed_data)):
    concat_article = ' '.join(data['raw_document'])

    # Word length
    data['article_word_length'] = len(concat_article.split())

    # Sentence length
    data['article_sentence_length'] = len(concat_article.split('.'))

article_word_lengths = [data['article_word_length'] for data in cleansed_data]
article_sentence_lengths = [data['article_sentence_length'] for data in cleansed_data]

print(f'Average article word length: {sum(article_word_lengths) / len(article_word_lengths)}')
print(f'Average article sentence length: {sum(article_sentence_lengths) / len(article_sentence_lengths)}')

56216it [00:01, 37534.06it/s]


Average article word length: 433.61966344101324
Average article sentence length: 23.427102604240787


In [29]:
# Average sentence length and word length of the summary

for i, data in tqdm(enumerate(cleansed_data)):
    # Word length
    data['summary_word_length'] = len(data['summary'].split())

    # Sentence length
    data['summary_sentence_length'] = len(data['summary'].split('.'))

summary_word_lengths = [data['summary_word_length'] for data in cleansed_data]
summary_sentence_lengths = [data['summary_sentence_length'] for data in cleansed_data]

print(f'Average summary word length: {sum(summary_word_lengths) / len(summary_word_lengths)}')
print(f'Average summary sentence length: {sum(summary_sentence_lengths) / len(summary_sentence_lengths)}')

56216it [00:00, 73461.26it/s]

Average summary word length: 228.68654831364736
Average summary sentence length: 11.516916891988046





In [30]:
# Count the number of data with article sentence length < 4

article_sentence_length_less_than_4 = [data for data in cleansed_data if data['article_sentence_length'] < 4]

print(f'Number of data with article sentence length < 4: {len(article_sentence_length_less_than_4)}')

Number of data with article sentence length < 4: 45


In [31]:
# Count the number of data with article word length < 40

article_word_length_less_than_40 = [data for data in cleansed_data if data['article_word_length'] < 40]

print(f'Number of data with article word length < 40: {len(article_word_length_less_than_40)}')

Number of data with article word length < 40: 7


In [32]:
# Count the number of data with summary word length < 10

summary_word_length_less_than_10 = [data for data in cleansed_data if data['summary_word_length'] < 10]

print(f'Number of data with summary word length < 10: {len(summary_word_length_less_than_10)}')

Number of data with summary word length < 10: 0


In [33]:
# Count the number of data with compression < 50

compression_ratio_less_than_50 = [data for data in cleansed_data if data['compression_ratio'] < 0.5]

print(f'Number of data with compression ratio < 50: {len(compression_ratio_less_than_50)}')

Number of data with compression ratio < 50: 31994


In [34]:
# Count the number of data with compression > 80

compression_ratio_more_than_80 = [data for data in cleansed_data if data['compression_ratio'] > 0.8]

print(f'Number of data with compression ratio > 80: {len(compression_ratio_more_than_80)}')

Number of data with compression ratio > 80: 496


In [35]:
# Count the number of data with abstractivity < 10

abstractivity_ratio_less_than_10 = [data for data in cleansed_data if data['abstractivity_ratio'] < 0.1]

print(f'Number of data with abstractivity ratio < 10: {len(abstractivity_ratio_less_than_10)}')

Number of data with abstractivity ratio < 10: 390


In [36]:
# Count the number of data with abstractivity > 80

abstractivity_ratio_more_than_80 = [data for data in cleansed_data if data['abstractivity_ratio'] > 0.8]

print(f'Number of data with abstractivity ratio > 80: {len(abstractivity_ratio_more_than_80)}')

Number of data with abstractivity ratio > 80: 126
