In [None]:
%pip install datasets

In [None]:
%pip install spacy
!python -m spacy download fr_core_news_sm

In [None]:
import numpy as np
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from datasets import load_dataset

dataset_train = load_dataset("orange_sum", 'abstract', split='test')
summary = dataset_train['summary']
text = dataset_train['text']

In [98]:
lines_to_add = []
for line in summary:
    line = line.strip()

    if line == "" or len(line.split()) <=1:
      continue      
    else:
      lines_to_add.append(line)

In [99]:
file_in = lines_to_add
file_out = "sample_data/orange_sum_summary.txt"

with open(file_out, 'w', encoding="utf-8") as f_out:  
  for entry in file_in:
    f_out.write(" ".join(entry) + '\n')

In [72]:
def clean_str(txt):
  txt = txt.lower()
  
  # Add a space in the beggining of the phrase
  txt = re.sub('^',' ', txt)

  # Add a space in the end of the phrase
  txt = re.sub('$',' ', txt)

  # Replace http by __url__ 
  words = []
  for word in txt.split():   
    i = word.find('http')   
    if i >=0:
      word = word[:i] + ' ' + '__url__'
    words.append(word.strip())
  txt = ' '.join(words)

  # Add space after and before a dot and get rid of triple dot
  words = []
  for word in txt.split():      
    dot = word.find('.')    
    if dot >=0:      
      if dot+1 == len(word):        
        word = word[0:dot] + ' ' + word[dot]
        
      elif word[dot+1] == word[dot]:        
        substr1 = word[0:dot]     
        substr2 = word[dot + 3:len(word)]
        word  = substr1 + ' ' + word[dot] + ' ' + substr2        
     
      else:
        substr1 = word[0:dot] 
        substr2 = word[dot + 1:len(word)]
        word  = substr1 + ' ' + word[dot] + ' ' + substr2        
    words.append(word.strip())  
  txt = ' '.join(words)
  
  # Remove markdown URL
  txt = re.sub(r'\[([^\]]*)\] \( *__url__ *\)', r'\1', txt)

  # Remove alphanumeric characters
  txt = txt.replace(')', '')
  txt = txt.replace('(', '')
  txt = txt.replace('"', '')
  txt = txt.replace('►', '')
  
  # Remove all string breaks
  txt = re.sub(r'^\s+', '', txt)
  txt = re.sub(r'\s+$', '', txt)
  txt = re.sub(r'\s+', ' ', txt)
 
  return txt

In [73]:
def hasNumbers(inputString):
  return any(char.isdigit() for char in inputString)

In [74]:
def get_tokens(input_text):
  iterations = 0
  max_line = len(input_text)
  lines_to_add = []
  tokens = []

  # split to sentences, cleaning the text
  for line in summary:
    iterations += 1
    
    if iterations >= max_line:
      break

    line = line.strip()

    if line == "" or len(line.split()) <=1:
      continue      
    else:    
      line = clean_str(line)
      lines_to_add.append(line)

  # split to tokens inside sentences
  for line in lines_to_add:
    tokenized_line = line.split()  
    tokens.append(tokenized_line)

  return tokens

In [90]:
def get_keywords(tokenised_sentences_list):

  # 1. Transform a list of tokens in a dictionary object, calculating the number of the token occurencies
  idf_dict = {}

  for sentence in tokens:
    for token in sentence:
      idf_dict[token] = idf_dict.get(token, 0) + 1 

  # 2. Penalize frequent appearence of the word  
  docs_len = float(len(tokens))

  for token in idf_dict.keys():    
    idf_dict[token] =  np.log(docs_len / idf_dict[token])    
    '''
    Before: 1 - After: 1.3862943611198906
    Before: 2 - After: 0.6931471805599453
    Before: 9 - After: -0.8109302162163288
    '''  

  # 3. Make all the values be less than a number one, in other words - normalize the values between 0 and 1  
  for key in idf_dict.keys():
    idf_dict[key] = 1.0/(idf_dict[key] + 1e-5)
    '''
    Before: 3.2358420970216457 - After: 0.30903761050155026
    Before: 2.132934794987244 - After: 0.4688353877466296
    Before: 1.8645206796443583 - After: 0.536327994447772
    '''

  # 4. Penalize stopwords occurencies, making then lower than 0.0...
  stop_words = set(stopwords.words('french') ) | set(['à', 'de', 'en', 'été', 'est', "eu", "a" '"', 'pour', 'sur', 'comme', 'avec', 'par', 'lui', 'près', 'ça', 'quel', '.', ',', '(', ')',"'", '%'])

  for token in stop_words:
    if token in idf_dict:
      idf_dict[token] = 0.01/(idf_dict[token])
    '''
    Before: 0.536327994447772 - After: 0.018645306796443585
    Before: 0.4688353877466296 - After: 0.021329447949872442    
    Before: 0.3998475440258594 - After: 0.025009532131459754
    '''

  # 5. Filter out every string, containing a digit for the input to compare with scores in a dictionary idf_dict
  input = " ".join([key for key in idf_dict.keys() if not hasNumbers(key)])


  # 6. Get pos tags for every token in the input to compare with a dictionary idf_dict
  spacy_nlp = spacy.load('fr_core_news_sm')
  input_results = [(token.text, token.tag_) for token in spacy_nlp(input)]
  '''
  [('en', 'ADP___'),
  ('janvier', 'NOUN__Gender=Masc|Number=Sing'),
  ('dernier', 'ADJ__Gender=Masc|Number=Sing|NumType=Ord'),
  (',', 'PUNCT___'), ...)]
  '''

  # 7. Filter out words of no information and pos we are not interested in
  allowed_tags = ['VERB','NOUN','ADJ_','ADV_']
  ignored_words = ['été','a','avait','aurait','serait'] + ['du','pu','fait']

  for word, tag in input_results:
    if word in idf_dict.keys():

      # Filter the tags we are interested in 
      if len(tag)>=2 and tag[:4] in allowed_tags and (word not in ignored_words):

        # Nouns and verbs are more important - their rank is augmented in four times
        if tag[:4] in ['VERB','NOUN']:
          idf_dict[word] *= 4
        
        # Andjectives and adverbs receive the augmentation in a rank in two times
        else:
          idf_dict[word] *= 2

  # 8. Filter out everything below 2 (ADV threshold)
  keywords_sentences_list = []

  for sentence in tokens:
    keywords_sentence = []
    for token in sentence:
      probability_score = idf_dict[token]
      #print(token, probability_score)
      
      if probability_score > 2.0:
        keywords_sentence.append(token)

    
    # Resque small entries
    if len(keywords_sentence) <= 3:
      keywords_sentence = []
      
      for token in sentence:
        probability_score = idf_dict[token]
        print(token, probability_score)
        
        if probability_score > 0.5:
          keywords_sentence.append(token)

    '''
    # Reduce big ones if necessary
    elif len(keywords_sentence) >= 7:
      keywords_sentence = []
      
      for token in sentence:
        probability_score = idf_dict[token]
        print(token, probability_score)
        
        if probability_score > 1:
          keywords_sentence.append(token)
    '''
    
    if len(keywords_sentence) > 0:
      keywords_sentences_list.append(keywords_sentence)

  return keywords_sentences_list

In [None]:
file_in = get_keywords(tokens)
file_in

In [92]:
tokens = get_tokens(summary)
file_in = get_keywords(tokens)
file_out = "sample_data/orange_sum_text_key.txt"

with open(file_out, 'w', encoding="utf-8") as f_out:  
  for entry in file_in:
    f_out.write(" ".join(entry) + '\n')

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
terrain 0.7013697909882405
d'entraînement 0.1609282027322205
des 0.005649769712732819
forces 0.8929866749503018
de -0.006897960481501102
l'ordre 0.21717821960715164
. -0.0038546567217020737
un 0.009460930503711598
problème 0.35068489549412024
de -0.006897960481501102
voisinage 0.13675094927510326
quelque 0.35068489549412024
peu 0.4343564392143033
insolite 0.321856405464441
. -0.0038546567217020737
à 0.0032230699760871697
saint-malo 0.13675094927510326
ille-et-vilaine, 0.13675094927510326
en 0.007657780873420739
bretagne, 0.15107072165106444
monique 0.3021414433021289
bunouf 0.2735018985502065
et 0.008402172036016972
son 0.016777738949333484
fils 3.423553842270562
guy 0.15107072165106444
habitent 0.2735018985502065
juste 0.35068489549412024
à 0.0032230699760871697
côté 0.7013697909882405
d'un 0.4166825331794204
terrain 0.7013697909882405
d'entraînement 0.1609282027322205
des 0.005649769712732819
