# Text summarization - Cosine similarity

# Preparing the environment

In [2]:
import re
import nltk
import string
import numpy as np
import networkx as nx
from nltk.cluster.util import cosine_distance

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
def preprocess(text):
  formatted_text = text.lower()
  tokens = []
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text

In [6]:
original_text = """Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team. Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[note 3] and four European Golden Shoes, the most by a European player. He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League. Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184). He is one of the few players to have made over 1,100 professional career appearances, and has scored over 790 official senior career goals for club and country.

Born and raised in Madeira, Ronaldo began his senior club career playing for Sporting CP, before signing with Manchester United in 2003, aged 18, winning the FA Cup in his first season. He would also go onto win three consecutive Premier League titles, the Champions League and the FIFA Club World Cup; at age 23, he won his first Ballon d'Or. Ronaldo was the subject of the then-most expensive association football transfer when he signed for Real Madrid in 2009 in a transfer worth €94 million (£80 million), where he won 15 trophies, including two La Liga titles, two Copa del Rey and four Champions Leagues, and became the club's all-time top goalscorer. He also finished runner-up for the Ballon d'Or three times, behind Lionel Messi (his perceived career rival), and won back-to-back Ballons d'Or in 2013 and 2014, and again in 2016 and 2017. In 2018, he signed for Juventus in a transfer worth an initial €100 million (£88 million), the most expensive transfer for an Italian club and the most expensive transfer for a player over 30 years old. He won two Serie A titles, two Supercoppe Italiana and a Coppa Italia, before returning to Manchester United in 2021.

Ronaldo made his senior international debut for Portugal in 2003 at the age of 18 and has since earned over 180 caps, making him Portugal's most-capped player. With more than 100 goals at international level, he is also the nation's all-time top goalscorer. He has played in and scored at 11 major tournaments; he scored his first international goal at Euro 2004, where he helped Portugal reach the final. He assumed full captaincy of the national team in July 2008. In 2015, Ronaldo was named the best Portuguese player of all time by the Portuguese Football Federation. The following year, he led Portugal to their first major tournament title at Euro 2016, and received the Silver Boot as the second-highest goalscorer of the tournament. He also led them to victory in the inaugural UEFA Nations League in 2019, and later received the Golden Boot as top scorer of Euro 2020.

One of the world's most marketable and famous athletes, Ronaldo was ranked the world's highest-paid athlete by Forbes in 2016 and 2017 and the world's most famous athlete by ESPN from 2016 to 2019. Time included him on their list of the 100 most influential people in the world in 2014. He is the first footballer and the third sportsman to earn US$1 billion in his career.[8]"""
original_text = re.sub(r'\s+', ' ', original_text)
original_text

"Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team. Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[note 3] and four European Golden Shoes, the most by a European player. He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League. Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184). He is one of the few players to have made over 1,100 professional career appearances, a

# Function to calculate similarity between sentences

- Link: https://en.wikipedia.org/wiki/Cosine_similarity
- Step by step calculations: https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/

In [7]:
original_sentences = [sentence for sentence in nltk.sent_tokenize(original_text)]
original_sentences

['Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team.',
 "Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[note 3] and four European Golden Shoes, the most by a European player.",
 'He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League.',
 'Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184).',
 'He is one of the few players to have made over 1,100 professional care

In [8]:
formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
formatted_sentences

['cristiano ronaldo dos santos aveiro goih comm portuguese pronunciation kɾiʃˈtjɐnu ʁɔˈnaɫdu born 5 february 1985 portuguese professional footballer plays forward premier league club manchester united captains portugal national team',
 "often considered best player world widely regarded one greatest players time ronaldo five ballon d'or awards note 3 four european golden shoes european player",
 '32 trophies career including seven league titles five uefa champions leagues one uefa european championship one uefa nations league',
 'ronaldo holds records appearances 180 goals 139 assists 42 champions league goals european championship 14 international goals male player 115 international appearances european male 184',
 'one players made 1,100 professional career appearances scored 790 official senior career goals club country',
 'born raised madeira ronaldo began senior club career playing sporting cp signing manchester united 2003 aged 18 winning fa cup first season',
 "would also go ont

In [9]:
def calculate_sentence_similarity(sentence1, sentence2):
  words1 = [word for word in nltk.word_tokenize(sentence1)]
  words2 = [word for word in nltk.word_tokenize(sentence2)]
  #print(words1)
  #print(words2)

  all_words = list(set(words1 + words2))
  #print(all_words)

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)
  #print(vector1)
  #print(vector2)

  for word in words1: # Bag of words
    #print(word)
    vector1[all_words.index(word)] += 1
  for word in words2:
    vector2[all_words.index(word)] += 1
  
  #print(vector1)
  #print(vector2)

  return 1 - cosine_distance(vector1, vector2)

In [10]:
calculate_sentence_similarity(formatted_sentences[0], formatted_sentences[1])

0.03394221166510658

In [11]:
test = ['human', 'study', 'intelligence', 'agents', 'intelligent', 'artificial', 'like']
test.index('agents')

3

# Function to create the similarity matrix

In [12]:
# The higher the value, the greater the similarity between the sentences
# The more words in common, the greater the similarity

In [13]:
def calculate_similarity_matrix(sentences):
  similarity_matrix = np.zeros((len(sentences), len(sentences)))
  #print(similarity_matrix)
  for i in range(len(sentences)):
    for j in range(len(sentences)):
      if i == j:
        continue
      similarity_matrix[i][j] = calculate_sentence_similarity(sentences[i], sentences[j])
  return similarity_matrix

In [14]:
calculate_similarity_matrix(formatted_sentences)

array([[0.        , 0.03394221, 0.06670373, 0.05751973, 0.08712137,
        0.1914599 , 0.14980118, 0.0547791 , 0.        , 0.03279129,
        0.09600307, 0.1204829 , 0.        , 0.04356068, 0.13576885,
        0.25923792, 0.04233338, 0.04490133, 0.02839809, 0.        ,
        0.05986843, 0.        ],
       [0.03394221, 0.        , 0.1754656 , 0.21182964, 0.09166985,
        0.04029115, 0.11821656, 0.05763904, 0.12087344, 0.06900656,
        0.        , 0.12677314, 0.        , 0.        , 0.        ,
        0.27277236, 0.        , 0.04724556, 0.14940358, 0.13363062,
        0.        , 0.        ],
       [0.06670373, 0.1754656 , 0.        , 0.17841031, 0.18015094,
        0.03959038, 0.23232093, 0.14159137, 0.03959038, 0.        ,
        0.04962917, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.27854301, 0.05872202, 0.        ,
        0.06189845, 0.        ],
       [0.05751973, 0.21182964, 0.17841031, 0.        , 0.19418391,
        0.0341394

# Function to summarize the texts

- Pagerank algorithm: https://en.wikipedia.org/wiki/PageRank


In [15]:
for i, score in enumerate(original_sentences):
  print(i, score)

0 Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team.
1 Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[note 3] and four European Golden Shoes, the most by a European player.
2 He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League.
3 Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184).
4 He is one of the few players to have made over 1,100 professional career appea

In [16]:
def summarize(text, number_of_sentences, percentage = 0):
  original_sentences = [sentence for sentence in nltk.sent_tokenize(text)]
  formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
  similarity_matrix = calculate_similarity_matrix(formatted_sentences)
  #print(similarity_matrix)

  similarity_graph = nx.from_numpy_array(similarity_matrix)
  #print(similarity_graph.nodes)
  #print(similarity_graph.edges)

  scores = nx.pagerank(similarity_graph)
  #print(scores)
  ordered_scores = sorted(((scores[i], score) for i, score in enumerate(original_sentences)), reverse=True)
  #print(ordered_scores)

  if percentage > 0:
    number_of_sentences = int(len(formatted_sentences) * percentage)

  best_sentences = []
  for sentence in range(number_of_sentences):
    best_sentences.append(ordered_scores[sentence][1])
  
  return original_sentences, best_sentences, ordered_scores

In [17]:
original_sentences, best_sentences, scores = summarize(original_text, 3)

In [18]:
original_sentences

['Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team.',
 "Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[note 3] and four European Golden Shoes, the most by a European player.",
 'He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League.',
 'Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184).',
 'He is one of the few players to have made over 1,100 professional care

In [19]:
best_sentences

["He would also go onto win three consecutive Premier League titles, the Champions League and the FIFA Club World Cup; at age 23, he won his first Ballon d'Or.",
 "Ronaldo was the subject of the then-most expensive association football transfer when he signed for Real Madrid in 2009 in a transfer worth €94 million (£80 million), where he won 15 trophies, including two La Liga titles, two Copa del Rey and four Champions Leagues, and became the club's all-time top goalscorer.",
 'Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team.']

In [20]:
scores

[(0.06892459374683861,
  "He would also go onto win three consecutive Premier League titles, the Champions League and the FIFA Club World Cup; at age 23, he won his first Ballon d'Or."),
 (0.06431689340219814,
  "Ronaldo was the subject of the then-most expensive association football transfer when he signed for Real Madrid in 2009 in a transfer worth €94 million (£80 million), where he won 15 trophies, including two La Liga titles, two Copa del Rey and four Champions Leagues, and became the club's all-time top goalscorer."),
 (0.06398060624362796,
  'Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team.'),
 (0.06280695160464797,
  "Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[

In [21]:
from IPython.core.display import HTML
def visualize(title, sentence_list, best_sentences):
  text = ''

  display(HTML(f'<h1>Summary - {title}</h1>'))
  for sentence in sentence_list:
    if sentence in best_sentences:
      text += ' ' + str(sentence).replace(sentence, f"<mark>{sentence}</mark>")
    else:
      text += ' ' + sentence
  display(HTML(f""" {text} """))

In [23]:
visualize('Ronaldo', original_sentences, best_sentences)

# Extracting texts from the Internet

In [24]:
!pip install goose3

Collecting goose3
  Downloading goose3-3.1.9-py3-none-any.whl (86 kB)
[?25l[K     |███▉                            | 10 kB 21.4 MB/s eta 0:00:01[K     |███████▋                        | 20 kB 26.6 MB/s eta 0:00:01[K     |███████████▍                    | 30 kB 31.3 MB/s eta 0:00:01[K     |███████████████▏                | 40 kB 34.1 MB/s eta 0:00:01[K     |███████████████████             | 51 kB 11.3 MB/s eta 0:00:01[K     |██████████████████████▊         | 61 kB 11.0 MB/s eta 0:00:01[K     |██████████████████████████▌     | 71 kB 6.2 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 81 kB 6.8 MB/s eta 0:00:01[K     |████████████████████████████████| 86 kB 2.9 MB/s 
Collecting cssselect
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: cssselect, goose3
Successfully installed cssselect-1.1.0 goose3-3.1.9


In [25]:
from goose3 import Goose
g = Goose()
url = 'https://en.wikipedia.org/wiki/Cristiano_Ronaldo'
article = g.extract(url)

In [26]:
article.cleaned_text

'Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team. Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d\'Or awards[note 3] and four European Golden Shoes, the most by a European player. He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League. Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184). He is one of the few players to have made over 1,100 professional career appearances, 

In [27]:
original_sentences, best_sentences, scores = summarize(article.cleaned_text, 120, 0.2)

In [28]:
(120 / len(original_sentences)) * 100

22.900763358778626

In [29]:
original_sentences

['Cristiano Ronaldo dos Santos Aveiro GOIH ComM (Portuguese pronunciation: [kɾiʃˈtjɐnu ʁɔˈnaɫdu]; born 5 February 1985) is a Portuguese professional footballer who plays as a forward for Premier League club Manchester United and captains the Portugal national team.',
 "Often considered the best player in the world and widely regarded as one of the greatest players of all time, Ronaldo has won five Ballon d'Or awards[note 3] and four European Golden Shoes, the most by a European player.",
 'He has won 32 trophies in his career, including seven league titles, five UEFA Champions Leagues, one UEFA European Championship and one UEFA Nations League.',
 'Ronaldo holds the records for most appearances (180), most goals (139) and assists (42) in the Champions League, most goals in the European Championship (14), most international goals by a male player (115), and most international appearances by a European male (184).',
 'He is one of the few players to have made over 1,100 professional care

In [30]:
best_sentences

["[208] In the Champions League Final, Ronaldo scored two goals in a 4–1 victory over Juventus to take him to 12 goals for the season, making him the competition's top goalscorer for the fifth straight season (sixth overall), as well as the first player to score in three finals in the Champions League era; the second goal was the 600th of his senior career.",
 '[230] On 16 September, Ronaldo scored his first two goals for Juventus in his fourth appearance in a 2–1 home win over Sassuolo in Serie A; his second was the 400th league goal of his career.',
 "[392] In the following match against Luxembourg on 12 October, Ronaldo scored a hat-trick in a 5–0 win for Portugal, and became the first player to score 10 hat-tricks in men's international football.",
 '[217]\n\nOn 3 April, Ronaldo scored the first two goals in a 3–0 away win against Juventus in the quarter-finals of the 2017–18 UEFA Champions League, with his second goal being an acrobatic bicycle kick.',
 "[38] Ronaldo ended his fir

In [31]:
scores

[(0.004087680322671557,
  "[208] In the Champions League Final, Ronaldo scored two goals in a 4–1 victory over Juventus to take him to 12 goals for the season, making him the competition's top goalscorer for the fifth straight season (sixth overall), as well as the first player to score in three finals in the Champions League era; the second goal was the 600th of his senior career."),
 (0.0038481749258914926,
  '[230] On 16 September, Ronaldo scored his first two goals for Juventus in his fourth appearance in a 2–1 home win over Sassuolo in Serie A; his second was the 400th league goal of his career.'),
 (0.0037609376673144045,
  "[392] In the following match against Luxembourg on 12 October, Ronaldo scored a hat-trick in a 5–0 win for Portugal, and became the first player to score 10 hat-tricks in men's international football."),
 (0.003727730879965477,
  '[217]\n\nOn 3 April, Ronaldo scored the first two goals in a 3–0 away win against Juventus in the quarter-finals of the 2017–18 UE

In [32]:
visualize(article.title, original_sentences, best_sentences)