In [1]:
import spacy
import feedparser
import textnets as tn
import pandas as pd
import string
from pyvis.network import Network
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math
from bs4 import BeautifulSoup
import requests
import sys
from unicodedata import category
import progressbar

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
feed_urls = [
    "http://www.lemonde.fr/rss/une.xml",
    "https://www.bfmtv.com/rss/news-24-7/",
    "https://www.liberation.fr/rss/",
    "http://www.lefigaro.fr/rss/figaro_actualites.xml",
    "https://www.franceinter.fr/rss",
    "https://www.lexpress.fr/arc/outboundfeeds/rss/alaune.xml",
    "https://www.francetvinfo.fr/titres.rss",
    "https://www.la-croix.com/RSS",
    "http://tempsreel.nouvelobs.com/rss.xml",
    "http://www.lepoint.fr/rss.xml",
    "https://www.france24.com/fr/rss",
    "https://feeds.leparisien.fr/leparisien/rss",
    "https://www.ouest-france.fr/rss/une",
    "https://www.europe1.fr/rss.xml",
    "https://partner-feeds.20min.ch/rss/20minutes",
    "https://www.afp.com/fr/actus/afp_actualite/792,31,9,7,33/feed"
]

In [3]:
news_list = pd.DataFrame(columns=('title', 'summary'))

for feed_url in feed_urls:
    r = requests.get(feed_url)
    feed = BeautifulSoup(r.content, features='xml')
    #feed = feedparser.parse(feed_url)

    articles = feed.findAll('item')       
    for a in articles:
        title = BeautifulSoup(a.find('title').get_text(), "html").get_text()
        summary = ""
        if (a.find('description')):
            summary = BeautifulSoup(a.find('description').get_text(), "html").get_text()
        news_list.loc[len(news_list)] = [title, summary]



In [4]:
news_list

Unnamed: 0,title,summary
0,Grève du 31 janvier : la circulation des train...,"En Ile-de-France, le trafic sur les lignes de ..."
1,France-Danemark : les Danois douchent les espo...,"Les Scandinaves se sont offert, dimanche soir,..."
2,Le gouvernement présente son nouveau plan de l...,"Ce programme sur quatre ans, qui doit être pré..."
3,"Tennis : sacré à l’Open d’Australie, Novak Djo...","Vainqueur du Grec Stefanos Tsitsipas, le Serbe..."
4,"Pour ou contre la chasse à courre, une bataill...",Menacée par une proposition de loi sur le poin...
...,...,...
533,Lutte contre la désinformation : l’AFP dévoile...,Forte du plus grand réseau de fact-checking au...
534,Eduardo Soteras récompensé par le prix UNICEF ...,Eduardo Soteras s’est vu remettre le prix UNIC...
535,Lutter contre la désinformation avec Grégoire ...,Focus sur les formations en ligne de l'AFP ave...
536,REPLAY - L’AFP partenaire de Médias en Seine 2022,L'AFP a participé à la 5e édition de Médias en...


In [18]:
def graphnet(docs, lang='fr', min_freq=5, output_url='graph.html'):
    if (lang=='fr'):
        nlp = spacy.load('fr_core_news_sm')
    elif (lang=='en'):
        nlp = spacy.load('en_core_web_sm')

    widgets = [' [',
         progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
         '] ',
           progressbar.Bar('*'),'[',
           progressbar.Percentage(), '] (',
           progressbar.ETA(), ') ',
          ]
 
    bar = progressbar.ProgressBar(max_value=len(docs),
                                widgets=widgets).start()
    
    
    # Utility functions
    punctuation_chars =  [
        chr(i) for i in range(sys.maxunicode)
        if category(chr(i)).startswith("P")
        ]
    stemmer = PorterStemmer()
    def tokenize(text):
        #parsed_text = [ch for ch in text]
        #for i, ch in enumerate(text):
        #    if (ch == "'" or ch == "’"):
        #        parsed_text[i - 1] = ""
        #    if (ch in [*string.punctuation, *punctuation_chars]):
        #        parsed_text[i] = ""
        #    else:
        #        parsed_text[i] = ch
        #text = "".join(parsed_text)
        text = "".join([ch for ch in text if ch not in [*string.punctuation, *punctuation_chars]])
        tokens = nltk.word_tokenize(text)
        return [word.lower() for word in tokens if word not in [stopwords.words('english') + stopwords.words('french')]]

    def preprocess_text(documents):
        docs = list()
        for i, doc in enumerate(documents):
            bar.update(i)
            docs.append(tokenize(doc))  # tokenize
        return docs
    
    # Clean and tokenize docs
    tokenized_news = preprocess_text(docs)
    print('tk news')
    # Lemmanize docs
    lemma_news = [[token.lemma_ for token in nlp(" ".join(doc)) if token.lemma_ not in nlp.Defaults.stop_words] for doc in tokenized_news]
    print('lemma news')
    def get_vocabulary_frequency(documents):
        vocabulary = dict()
        for doc in documents:
            for word in doc:
                if word in vocabulary:
                    vocabulary[word] += 1
                else:
                    vocabulary[word] = 1

        return vocabulary

    voc = get_vocabulary_frequency(lemma_news)
    print('voc')
    # Filter voc with min_freq
    filtered_voc = dict()
    for item in voc.items():
        if (item[1] > min_freq):
            filtered_voc[item[0]] = item[1]
    
    print(sorted(filtered_voc.items(), key=lambda x: x[1], reverse=True)[:20])

    dict_voc_id = dict()
    for i, term in enumerate(filtered_voc):
        dict_voc_id[term] = i
    
    # List bigrams (edges)
    bar = progressbar.ProgressBar(max_value=len(lemma_news),
                                widgets=widgets).start()
    bigrams = []
    for news in lemma_news:
        bar.update(i)
        for (a, b) in zip(news[:-1], news[1:]):
            if (a in filtered_voc.keys() and b in filtered_voc.keys()):
                bigrams.append((dict_voc_id[a], dict_voc_id[b]))

    # Set sizes according to (log of doc freq)*freq
    sizes = []
    num_documents = len(lemma_news)
    for i, term in enumerate(filtered_voc):
        sizes.append(math.log(sum(term in document for document in lemma_news), math.e)*filtered_voc[term] + 1)


    # Output network
    net = Network( bgcolor="#222222", font_color="white", height="800px")

    net.add_nodes(range(0, len(filtered_voc)), label=list(filtered_voc.keys()), size=sizes, title=list(filtered_voc.keys())) # node ids and labels = ["a", "b", "c", "d"]
    net.add_edges(bigrams)

    net.toggle_physics(True)
    net.save_graph(output_url)

In [19]:
graphnet(news_list['title'], lang='fr')

 [elapsed time: 0:00:00] |                            |[  0%] (ETA:  --:--:--)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:  --:--:--)
 [elapsed time: 0:00:00] |                            |[  2%] (ETA:   0:00:08)
 [elapsed time: 0:00:00] |*                           |[  5%] (ETA:   0:00:05)
 [elapsed time: 0:00:00] |**                          |[  7%] (ETA:   0:00:04)
 [elapsed time: 0:00:00] |**                          |[ 10%] (ETA:   0:00:03)
 [elapsed time: 0:00:00] |***                         |[ 12%] (ETA:   0:00:03)
 [elapsed time: 0:00:00] |****                        |[ 15%] (ETA:   0:00:02)
 [elapsed time: 0:00:00] |****                        |[ 17%] (ETA:   0:00:02)
 [elapsed time: 0:00:00] |*****                       |[ 20%] (ETA:   0:00:02)
 [elapsed time: 0:00:00] |******                      |[ 22%] (ETA:   0:00:02)
 [elapsed time: 0:00:00] |*******                     |[ 25%] (ETA:   0:00:02)
 [elapsed time: 0:00:00] |*******                   

tk news


 [elapsed time: 0:00:00] |                            |[  0%] (ETA:  --:--:--)


lemma news
voc
[('contre', 30), ('retraite', 30), ('ukraine', 29), ('guerre', 26), ('france', 21), ('janvier', 19), ('faire', 19), ('an', 19), ('31', 18), ('mort', 17), ('réforme', 16), ('grever', 15), ('violence', 14), ('prendre', 13), ('mondial', 12), ('pari', 12), ('dun', 12), ('aller', 11), ('russe', 11), ('djokovic', 10)]


In [20]:
#data = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [21]:
#data

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


In [22]:
#graphnet(data['headline'], lang='en', min_freq=500)

 [elapsed time: 0:00:00] |                            |[  0%] (ETA:  --:--:--)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:  --:--:--)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:33:22)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:21:46)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:18:38)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:17:07)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:16:21)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:15:44)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:15:05)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:14:31)
 [elapsed time: 0:00:01] |                            |[  0%] (ETA:   0:14:15)
 [elapsed time: 0:00:01] |                            |[  0%] (ETA:   0:14:06)
 [elapsed time: 0:00:01] |                          

tk news


 [elapsed time: 0:00:00] |                            |[  0%] (ETA:  --:--:--)


lemma news
voc
[('photo', 12364), ('trump', 11489), ('new', 9084), ('I', 7000), ('video', 6192), ('day', 5339), ('good', 5175), ('donald', 4842), ('woman', 4581), ('way', 4123), ('year', 3674), ('man', 3464), ('5', 3455), ('world', 3426), ('life', 3359), ('kid', 3217), ('10', 3128), ('find', 3091), ('people', 3055), ('love', 3014)]


 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:00:47)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:01:33)
 [elapsed time: 0:00:00] |                            |[  0%] (ETA:   0:02:19)
