In [1]:
import spacy
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import requests
import sys
from unicodedata import category
import json
import os
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from datetime import date, datetime
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# RSS feed endpoints handed to scrap(); each one is fetched and its <item> entries are parsed.
feed_urls = [
    "http://www.lemonde.fr/rss/une.xml",
    "https://www.bfmtv.com/rss/news-24-7/",
    "http://www.lefigaro.fr/rss/figaro_actualites.xml",
    "https://www.lexpress.fr/arc/outboundfeeds/rss/alaune.xml",
    "https://www.francetvinfo.fr/titres.rss",
    "https://www.la-croix.com/RSS",
    "http://tempsreel.nouvelobs.com/rss.xml",
    "http://www.lepoint.fr/rss.xml",
    "https://feeds.leparisien.fr/leparisien/rss",
    "https://www.europe1.fr/rss.xml",
    "https://partner-feeds.20min.ch/rss/20minutes",
    "https://www.afp.com/fr/actus/afp_actualite/792,31,9,7,33/feed"
]

In [3]:
def scrap(feed_urls):
    """Download RSS feeds and collect their items into a single DataFrame.

    Parameters
    ----------
    feed_urls : iterable of str
        URLs of the RSS feeds to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per feed item with the columns ``title``, ``summary``,
        ``img_url`` and ``link``. Fields missing from an item are ``None``.

    Notes
    -----
    A feed that cannot be fetched is skipped with a warning instead of
    aborting the whole run, and items without a ``<title>`` are ignored.
    """
    columns = ['title', 'summary', 'img_url', 'link']
    # Collect plain dicts and build the frame once: concatenating inside the
    # loop copies the whole frame for every article.
    records = []

    for feed_url in feed_urls:
        try:
            # Without a timeout a single unresponsive server hangs the notebook.
            res = requests.get(feed_url, timeout=30)
        except requests.RequestException as err:
            warnings.warn(f"Skipping feed {feed_url}: {err}")
            continue
        feed = BeautifulSoup(res.content, features='xml')

        for article in feed.find_all('item'):
            title_tag = article.find('title')
            if title_tag is None:
                continue

            news = {
                'title': None,
                'summary': None,
                'link': None,
                'img_url': None
            }
            # Titles and descriptions may embed HTML; parse a second time to
            # keep only the text.
            news['title'] = BeautifulSoup(title_tag.get_text(), "html").get_text()

            description_tag = article.find('description')
            if description_tag is not None:
                news['summary'] = BeautifulSoup(description_tag.get_text(), "html").get_text()

            content_tag = article.find('content')
            if content_tag is not None:
                # .get() leaves None when the tag has no url attribute.
                news['img_url'] = content_tag.get('url')

            link_tag = article.find('link')
            if link_tag is not None:
                news['link'] = link_tag.get_text()

            records.append(news)

    return pd.DataFrame(records, columns=columns)

In [4]:
def process_text(docs, lang='fr'):
    """Lemmatise documents and count how often each kept lemma occurs.

    Parameters
    ----------
    docs : iterable of str
        Raw texts to process.
    lang : {'fr', 'en'}, default 'fr'
        Selects the spaCy model: ``fr_core_news_lg`` or ``en_core_web_sm``.

    Returns
    -------
    lemma_docs : list of list of str
        For every input text, the lemmas of its non-stop-word tokens tagged
        NOUN, PROPN or ADJ, excluding lemmas that are a single punctuation
        character.
    voc : dict of str to int
        Total number of occurrences of each lemma across ``lemma_docs``.

    Raises
    ------
    ValueError
        If ``lang`` is not one of the supported codes.
    """
    models = {'fr': 'fr_core_news_lg', 'en': 'en_core_web_sm'}
    if lang not in models:
        raise ValueError(f"Unsupported lang {lang!r}; expected one of {sorted(models)}")
    nlp = spacy.load(models[lang])

    # ASCII punctuation plus every Unicode code point in a "P*" category.
    # Built once as a set so the per-token membership test is O(1).
    punctuation_chars = frozenset(string.punctuation) | {
        chr(i) for i in range(sys.maxunicode + 1)
        if category(chr(i)).startswith("P")
    }
    kept_pos = {'NOUN', 'PROPN', 'ADJ'}

    lemma_docs = []
    for doc in docs:
        lemma_docs.append([
            token.lemma_ for token in nlp(doc)
            if not token.is_stop
            and token.pos_ in kept_pos
            and token.lemma_ not in punctuation_chars
        ])

    # Frequency of each lemma over the whole corpus.
    voc = {}
    for lemma_doc in lemma_docs:
        for word in lemma_doc:
            voc[word] = voc.get(word, 0) + 1

    return lemma_docs, voc

In [5]:
def graphnet(docs, voc, min_freq=5):
    """Build a term co-occurrence graph and write it to JSON files.

    Parameters
    ----------
    docs : list of list of str
        Tokenised documents; bigrams are collected within each document.
    voc : dict of str to int
        Term frequencies.
    min_freq : int, default 5
        A term becomes a node only if its frequency is strictly greater than
        this value.

    Returns
    -------
    None
        ``nodes.json`` and ``edges.json`` are written through ``output_file``.
    """
    # Keep the frequent terms and give each one a numeric node id.
    filtered_voc = {term: freq for term, freq in voc.items() if freq > min_freq}
    term_ids = {term: i for i, term in enumerate(filtered_voc)}

    # List bigrams (edges) with their relative frequency.
    finder = nltk.BigramCollocationFinder.from_documents(docs)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    scored_bigrams = list(finder.score_ngrams(bigram_measures.raw_freq))

    # Scores are divided by the smallest one so the rarest bigram weighs 1.
    # The default avoids a ValueError when no bigram exists at all.
    lowest_score = min((score for _, score in scored_bigrams), default=1.0)

    nodes = [
        {'id': i, 'label': term, 'size': filtered_voc[term]}
        for term, i in term_ids.items()
    ]

    # Keep only edges whose two ends are both retained terms.
    edges = []
    for (first, second), score in scored_bigrams:
        if first in term_ids and second in term_ids:
            edges.append({
                'id': len(edges),
                'source': term_ids[first],
                'target': term_ids[second],
                'size': score / lowest_score
            })

    # Write JSON files
    output_file(nodes, 'nodes.json')
    output_file(edges, 'edges.json')

In [6]:
def output_file(data, filename):
    """Serialise ``data`` as JSON into today's folder under ``./data``.

    Parameters
    ----------
    data : JSON-serialisable object
        Content to write.
    filename : str
        Name of the file created inside ``./data/<dd-mm-YYYY>/``. An existing
        file with the same name is overwritten.
    """
    path = f'./data/{date.today().strftime("%d-%m-%Y")}'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    with open(f'{path}/{filename}', 'w', encoding='UTF8', newline='') as f:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(data, f, ensure_ascii=False)

In [7]:
# Fetch every configured feed. find_trends() later reads this frame's img_url column.
news_list = scrap(feed_urls)

In [8]:
# Lemmatise the headlines only; docs[i] is built from row i of news_list, in order.
docs, voc = process_text(news_list['title'], lang='fr')

In [9]:
# Export nodes.json / edges.json to ./data/<today>/, keeping terms whose frequency is greater than 2.
graphnet(docs, voc, min_freq=2)

In [38]:
def find_topics(docs, criterion='leverage', level=0.01, min_support=0.005, merge_threshold=0.2):
    """Mine association rules from the documents and merge them into topics.

    Parameters
    ----------
    docs : list of list of str
        Tokenised documents, each treated as one transaction.
    criterion : str, default 'leverage'
        Column of the rules table used to rank and filter the rules.
    level : float, default 0.01
        Only rules whose ``criterion`` is strictly greater than this are kept.
    min_support : float, default 0.005
        Minimum support passed to ``apriori``.
    merge_threshold : float, default 0.2
        A rule is merged into the first existing topic for which the share of
        that topic's terms found in the rule exceeds this value.

    Returns
    -------
    topics : list of tuple of str
        Groups of terms, in order of first appearance.
    rules : pandas.DataFrame
        The filtered rules, sorted by ``criterion`` in descending order.
    """
    def overlap(topic, terms):
        # Same measure as the notebook's similarity() helper, kept local so
        # this function does not depend on a cell defined further down.
        return sum(1 for a in topic for b in terms if a == b) / len(topic)

    te = TransactionEncoder()
    te_ary = te.fit(docs).transform(docs, sparse=True)
    df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)

    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True, verbose=1)

    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    rules = rules.sort_values(criterion, ascending=False)
    rules = rules[rules[criterion] > level]

    topics = []
    for antecedents, consequents in zip(rules['antecedents'], rules['consequents']):
        # Both sides are frozensets; sorting gives the same term order on
        # every kernel run.
        terms = sorted(antecedents) + sorted(consequents)
        for idx, topic in enumerate(topics):
            if overlap(topic, terms) > merge_threshold:
                # Merge in place, de-duplicating while keeping insertion order.
                topics[idx] = tuple(dict.fromkeys([*topic, *terms]))
                break
        else:
            # No sufficiently similar topic: the rule starts a new one.
            topics.append(tuple(terms))

    return topics, rules

In [11]:
def list_dates():
    """Write ``./data/list.json`` listing the dated sub-folders of ``./data``.

    Sub-folders whose name parses as ``dd-mm-YYYY`` are listed newest first as
    ``[{"name": <folder>}, ...]``. Other folders (for example
    ``.ipynb_checkpoints``) are ignored. ``./data`` is created if missing.
    """
    os.makedirs('./data', exist_ok=True)

    dated_folders = []
    # The first tuple yielded by os.walk describes ./data itself; [1] is the
    # list of its immediate sub-directories.
    for folder in next(os.walk('./data'))[1]:
        try:
            parsed = datetime.strptime(folder, "%d-%m-%Y")
        except ValueError:
            continue
        dated_folders.append((parsed, folder))

    dated_folders.sort(key=lambda pair: pair[0], reverse=True)
    dates = [{"name": folder} for _, folder in dated_folders]

    with open('./data/list.json', 'w', encoding='UTF8', newline='') as f:
        json.dump(dates, f, ensure_ascii=False)

In [12]:
# Refresh ./data/list.json with the dated sub-folders currently present under ./data.
list_dates()

In [36]:
# Rank rules by leverage and keep those whose leverage exceeds 0.005.
topics, rules = find_topics(docs, 'leverage', 0.005)

Processing 49 combinations | Sampling itemset size 7653


In [39]:
# Inspect the merged topics (one tuple of terms per topic).
topics

[('Kiev', 'Ukraine', 'ballon', 'guerre', 'russe', 'char', 'Russie'),
 ('Pierre',
  'Palmade',
  'accident',
  'garde',
  'vue',
  'homme',
  'humoriste',
  'passager'),
 ('Pen', 'réforme', 'motion', 'retraite', 'marine', 'censure'),
 ('vue', 'humoriste', 'passager', 'garde'),
 ('dérive',
  'audit',
  'Noël',
  'Graët',
  'FFF',
  'fonction',
  'crise',
  'comportement',
  'incompatible'),
 ('Pierre',
  'Palmade',
  'pratique',
  'affaire',
  'chemsex',
  'humoriste',
  'passager'),
 ('Écosse', 'ministre', 'Nicola', 'Sturgeon', 'démission'),
 ('Palmade', 'accident', 'comédien', 'humoriste', 'Pierre'),
 ('Buffalo', 'an', 'raciste', 'tuerie', 'prison', 'vie'),
 ('russe', 'char', 'Kiev', 'Ukraine'),
 ('dérive', 'Noël', 'impossible', 'FFF', 'statu', 'comportement'),
 ('Amélie', 'Oudéa', 'Castéra'),
 ('Vuitton', 'Louis', 'Pharrell', 'Williams'),
 ('statu', 'impossible'),
 ('démembré', 'butte', 'Chaumont', 'corps'),
 ('Netflix', 'Salto', 'français'),
 ('streaming', 'plateforme'),
 ('fonction'

In [14]:
# First 15 rules; find_topics() returns them sorted by the chosen criterion, descending.
rules.head(15)[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage', 'conviction']]


Unnamed: 0,antecedents,consequents,support,confidence,lift,leverage,conviction
158,(Ukraine),(guerre),0.043071,0.676471,12.901261,0.039733,2.928839
159,(guerre),(Ukraine),0.043071,0.821429,12.901261,0.039733,5.243446
84,(Pierre),(Palmade),0.041199,1.0,21.36,0.03927,inf
85,(Palmade),(Pierre),0.041199,0.88,21.36,0.03927,7.990012
272,(réforme),(retraite),0.039326,1.0,18.413793,0.03719,inf
273,(retraite),(réforme),0.039326,0.724138,18.413793,0.03719,3.482444
233,(garde),(vue),0.029963,0.941176,29.564014,0.028949,16.458801
232,(vue),(garde),0.029963,0.941176,29.564014,0.028949,16.458801
86,(Palmade),(accident),0.02809,0.6,20.025,0.026687,2.425094
87,(accident),(Palmade),0.02809,0.9375,20.025,0.026687,15.250936


In [15]:
def similarity(x, y):
    """Return the share of ``x``'s elements that also occur in ``y``.

    Every pair ``(a, b)`` with ``a`` in ``x``, ``b`` in ``y`` and ``a == b``
    counts once, and the total is divided by ``len(x)``. The measure is not
    symmetric: ``similarity(x, y)`` and ``similarity(y, x)`` may differ.

    Parameters
    ----------
    x, y : sequence
        Collections of comparable items, e.g. tuples or lists of terms.

    Returns
    -------
    float
        ``0.0`` when ``x`` is empty (instead of dividing by zero).
    """
    if len(x) == 0:
        return 0.0
    matches = sum(1 for a in x for b in y if a == b)
    return matches / len(x)

In [16]:
def find_similarities(trend, docs, threshold=0.3):
    """Rank the documents that overlap enough with ``trend``.

    Parameters
    ----------
    trend : sequence
        Reference terms, passed as first argument to ``similarity``.
    docs : iterable of sequence
        Documents to score; their position in ``docs`` is the reported index.
    threshold : float, default 0.3
        Only scores strictly greater than this value are kept.

    Returns
    -------
    list of (int, float)
        ``(document index, score)`` pairs, highest score first. Ties keep
        their original document order.
    """
    scored = ((index, similarity(trend, doc)) for index, doc in enumerate(docs))
    matches = [pair for pair in scored if pair[1] > threshold]
    # list.sort is stable even with reverse=True, so equal scores stay in
    # document order.
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return matches

In [17]:
def find_trends(topics, docs, news=None):
    """Attach matching documents and an illustrative image to each topic.

    Parameters
    ----------
    topics : iterable of tuple of str
        Topics to describe.
    docs : list of list of str
        Tokenised documents; position ``i`` must correspond to row ``i`` of
        ``news``.
    news : pandas.DataFrame, optional
        Frame with an ``img_url`` column. Defaults to the notebook-level
        ``news_list`` so existing two-argument calls keep working.

    Returns
    -------
    list of dict
        One dict per topic with the keys ``topic``, ``docs`` (the result of
        ``find_similarities``) and ``img_url`` (the image of the best-scoring
        document that has one, otherwise ``None``).
    """
    if news is None:
        news = news_list
    img_urls = news['img_url']

    trends = []
    for topic in topics:
        similar_docs = find_similarities(topic, docs)
        img = None
        for doc_index, _score in similar_docs:
            url = img_urls.iloc[doc_index]
            # NaN is truthy, so missing values need an explicit check.
            if pd.notna(url) and url:
                img = url
                break
        trends.append({
            "topic": topic,
            "docs": similar_docs,
            "img_url": img
        })

    return trends