In [1]:
!pip install python-rake yake

Collecting python-rake
  Downloading python_rake-1.5.0-py3-none-any.whl (14 kB)
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
     |████████████████████████████████| 60 kB 243 kB/s            
[?25hCollecting segtok
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
     |████████████████████████████████| 132 kB 474 kB/s            
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: jellyfish
  Building wheel for jellyfish (setup.py) ... [?25l- \ | / - \ | / - \ | done
[?25h  Created wheel for jellyfish: filename=jellyfish-0.9.0-cp37-cp37m-linux_x86_64.whl size=101429 sha256=171ef69fa74c1e3ad30ad481b23db02305a0e757b1d7151399ec19c73ea12cae
  Stored in directory: /root/.cache/pip/wheels/fe/99/4e/646ce766df0d070b0ef04db27aa11543e2767fda3075aec31b
Successfully built jellyfish
Installing collected packages: segtok

In [2]:
import pandas as pd
import numpy as np
import os
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import string
import random
import datetime
import unicodedata
from heapq import nlargest
from string import punctuation
from string import digits
from nltk.stem import WordNetLemmatizer

from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import spacy
import yake
import RAKE
import matplotlib.pyplot as plt
import joblib
from tqdm import tnrange
import networkx as nx

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Read data

## Combine 7 dataframes together

In [3]:
df = {}
for dirname, _, filenames in os.walk('../input/herald-news'):
    for index, filename in enumerate(filenames):
        df[index] = pd.read_json(os.path.join(dirname, filename))

for key in df.keys():
    if key == 0:
        df_all = df[0]
    else:
        df_all = df_all.append(df[key])

df = df_all.rename(lambda x: x.lstrip(' '), axis='columns').sort_values('time').reset_index()
df['time'] = pd.to_datetime(df['time'])

del df_all

## 2. Preprocessing text
- replace numbers with #'s

In [4]:
contraction_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have","so's": "so as",
    "this's": "this is","that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will", 
    "you'll've": "you will have",
    "you're": "you are", "you've": "you have"}

In [5]:
# Similar word
UN_WORD = "The United Nations"
US_WORD = "The United States"
NK_WORD = "North Korea"
SK_WORD = "South Korea"

similar_words = {
    # Change to "The United States"
    "U.S.": US_WORD,
    "US": US_WORD,
    "USA": US_WORD,
    "United States": US_WORD,
    "United States'": US_WORD,
    "The United States'": US_WORD,
    
    # Change to "North Korea"
    "NK": NK_WORD,
    "NK's": NK_WORD,
    "N. Korea": NK_WORD,
    "N. Korea's": NK_WORD,
    "North Korea's": NK_WORD,
    
    # Change to "South Korea"
    "SK": SK_WORD,
    "SK's": SK_WORD,
    "S. Korea": SK_WORD,
    "S. Korea's": SK_WORD,
    "South Korea's": SK_WORD,
    
    # Change to "The United Nations"
    "United Nations": UN_WORD,
    "United Nations'": UN_WORD,
    "The United Nations'": UN_WORD,
    "UN": UN_WORD,
}

In [6]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', ' ', '█', '½', 'à', '…', 
          '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
          '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

In [7]:
### text cleaning function
def text_cleaning(s: str, lemmatize: bool = False, convert_number: bool = False, lower: bool = False):
    
    # Remove non-english characters
    def replace_strange_char(s: str):
        non_en_chars = {
            "’": "'",
            "‘": "'"
        }

        def remove_non_en_chars(txt):
            # remove non english characters
            txt = convert_latin_chars(txt)
            for char in non_en_chars.keys():
                txt = re.sub(char, non_en_chars[char], txt)
            txt = re.sub(r'[^\x00-\x7F]+', ' ', txt)
            return txt

        def convert_latin_chars(txt):
            # convert latin characters
            return ''.join(char for char in unicodedata.normalize('NFKD', txt) if unicodedata.category(char) != 'Mn')

        s = remove_non_en_chars(s)
        s = convert_latin_chars(s)
        return s
    s = replace_strange_char(s)
    
    # Remove punctuations except apostrophe
    for punct in puncts:
        if punct in s:
            s = s.replace(punct, ' ')
    
    # Replace multiple space with a single space
    s = re.sub(r'\s+', ' ', s, flags=re.I)
    
    # Replace similar terms
    for key,value in similar_words.items():
        s = re.sub(key, value, s)
        
    # Convert to lowercase
    if lower == True:
        s = s.lower()
    
    # Convert contraction
    def _get_contractions(contraction_dict):
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

    contractions, contractions_re = _get_contractions(contraction_dict)

    def replace_contractions(text):
        def replace(match):
            return contractions[match.group(0)]
        return contractions_re.sub(replace, text)
    s = replace_contractions(s)
    
    s = re.sub(r"'s\b", '', s)
    
    # Lemmatize and remove stopwords
    if lemmatize == True:
        stemmer = WordNetLemmatizer()
        s = s.split()
        stopwords = nltk.corpus.stopwords.words('english')
        tokens = [stemmer.lemmatize(word) for word in s if word not in stopwords]
        s = ' '.join(tokens)
    
    # Convert number to symbol #'s
    def convert_numbers(x):
        if bool(re.search(r'\d', x)):
            x = re.sub('[0-9]{5,}', '#####', x)
            x = re.sub('[0-9]{4}', '####', x)
            x = re.sub('[0-9]{3}', '###', x)
            x = re.sub('[0-9]{2}', '##', x)
        return x
    if convert_number == True:
        s = convert_numbers(s)
    

    return s

### Importance = 7 days 1 news is represented so many times

## How to group same topic together ex. Google AI
### Approach 1
1. Consider period of 7 days
2. Check similarity between news in that 7 days

In [8]:
# NLP Tools
nlp = spacy.load("en_core_web_lg")

def load_glove_index():
    EMBEDDING_FILE = '../input/popular-embedding/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    return embeddings_index

glove_embedding_index = load_glove_index()

def create_glove(word_index,embeddings_index):
    emb_mean,emb_std = -0.005838499,0.48782197
    all_embs = np.stack(embeddings_index.values())
    embed_size = all_embs.shape[1]
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    count_found = nb_words
    for word, i in tqdm(word_index.items()):
        if i >= max_features: 
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] =  embedding_vector
        else:
            count_found-=1
    print("Got embedding for ",count_found," words.")
    return embedding_matrix

In [9]:
def summarize_text(body: str, top_n: float = 0.3, method: str = 'word_freq'):
    
    if body == '' or body == ' ':
        return body
    
    elif method == 'word_freq':
        docs = nlp(body)
        tokens = [text_cleaning(token.text,lemmatize=True,lower=True) for token in docs]
        sent_tokens = []
        for sent in docs.sents:
            clean_sent = text_cleaning(sent.text,lemmatize=True,lower=True)
            if clean_sent != ' ' and clean_sent != '':
                sent_tokens.append(clean_sent)
        
        # Sentence score by sum of word frequency
        word_freq = {}
        for token in tokens:
            try:
                word_freq[token] += 1
            except:
                word_freq[token] = 1
        max_freq = max(word_freq.values())
        for word in word_freq.keys():
            word_freq[word] = word_freq[word]/max_freq
        
        sent_score = {}
        for index, sent in enumerate(sent_tokens):
            for word in sent.split():
                if word in word_freq:
                    try:
                        sent_score[index] += word_freq[word]
                    except:
                        sent_score[index] = word_freq[word]
        summary = nlargest(n=int(len(sent_tokens)*top_n),iterable=sent_score,key=sent_score.get) 

        top_n_sentences = [sent_tokens[index] for index in summary]
        return top_n_sentences
            
    elif method == 'text_rank':
        docs = nlp(body)
        tokens = [text_cleaning(token.text,lemmatize=True,lower=True) for token in docs]
        sent_tokens = []
        for sent in docs.sents:
            clean_sent = text_cleaning(sent.text,lemmatize=True,lower=True)
            if clean_sent != ' ' and clean_sent != '':
                sent_tokens.append(clean_sent)
        
        # Sentence score by TextRank
        sentence_vectors = []
        for sent in sent_tokens:
            if len(sent) != 0:
                v = sum([glove_embedding_index.get(w, np.zeros((300,))) for w in sent.split()])/(len(sent.split())+0.001)
            else:
                v = np.zeros((300,))
            sentence_vectors.append(v)
            
        similarity_matrix = np.zeros([len(sent_tokens), len(sent_tokens)])
        for i in range(len(sent_tokens)):
            for j in range(len(sent_tokens)):
                if i != j:
                    similarity_matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,300), sentence_vectors[j].reshape(1,300))[0,0]
        nx_graph = nx.from_numpy_array(similarity_matrix)
        
        try:
            textrank_scores = nx.pagerank(nx_graph, max_iter=500)
            ranked_sentences = sorted(((textrank_scores[i],s) for i,s in enumerate(sent_tokens)), reverse=True)
            top_n_sentences = [ ranked_sentences[i][1] for i in range(int(len(sent_tokens)*top_n)) ]
            
        except:
            ### Failed to converge error
            top_n_sentences = summarize_text(body, top_n, 'word_freq')
        return top_n_sentences
    
    else:
        raise NameError('no such {} method'.format(method))

In [10]:
def find_topics(text: str,
                preprocess: bool = False,
                number_topics: int = 1,
                number_words: int = 2
               ):
    """
    Function that takes a text as an input, and finds the two most important topics/tags
    """
    if preprocess:
        text = text_cleaning(text, lemmatize=True, lower=True)
    
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform([text])
    
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)

    words = count_vectorizer.get_feature_names()

    # Get topics from model. They are represented as a list e.g. ['military','army']
    topics = [[words[i] for i in topic.argsort()[:-number_words - 1:-1]] for (topic_idx, topic) in enumerate(lda.components_)]
    topics = np.array(topics).ravel()

    return topics

In [11]:
top_n = 0.3
df['summary'] = df['body'].apply(lambda body: ' '.join(summarize_text(body, top_n, 'text_rank')))

In [12]:
group = {}
for start_date in df['time'].dt.date.unique():
    
    end_date = start_date + datetime.timedelta(days=7)
    target_df = df[(df['time'].dt.date >= start_date) & (df['time'].dt.date < end_date)]
    
    # Find TFIDF among these news in 7 days 
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0)
    tf_vector = tf.fit_transform(target_df['summary'])
    
    # Clustering by DBSCAN
    dbscan = DBSCAN(metric='cosine', eps=0.25, min_samples=1).fit(tf_vector)
    num_labels = len(np.unique(dbscan.labels_))
    
    # Save groups of same news
    sm_group = {}
    for index, label in zip(target_df.index,dbscan.labels_):
        try:
            sm_group[label].append(index)
        except:
            sm_group[label] = [index]
    for label in sm_group.keys():
        if label == -1:
            continue
        try:
            group[start_date.year].append(sm_group[label])
        except:
            group[start_date.year] = [sm_group[label]]    


In [13]:
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.9
numOfKeywords = 5
kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

In [14]:
trend = {}
for year in [2015, 2016, 2017]:
    sorted_trend = sorted(group[year], key=lambda x:len(x), reverse=True)
    print("Size of news containing in top 10 trends in {}".format(year))
    existing_keywords = []
    j=0
    for i in range(10):
        
        while True:
            # Extract keywords/topic for each group of news
            news_indexs = sorted_trend[j]
            news_body = ' '.join(df['summary'][news_indexs])

            # Print keywords
            keywords = sorted(kw_extractor.extract_keywords(news_body), key=lambda x: x[1], reverse=True)
            keyword = keywords[0][0]
            
            j+=1
            if not keyword in existing_keywords:
                print("{}. {}".format(i, keyword))
                existing_keywords.append(keyword)
                break
    trend[year] = existing_keywords

Size of news containing in top 10 trends in 2015
0. people succumbing illness
1. discus chinese led
2. korea fatality rate
3. entity china iran
4. sister north korean
5. car plunged wanganui
6. mutual exchange people
7. korean victim japan
8. dead apartment seocho
9. expected start job
Size of news containing in top 10 trends in 2016
0. treaty talk agreement
1. bound south korea
2. ordered company run
3. point diplomatic security
4. expand merit based
5. korea joel wit
6. national security threat
7. miniaturized hydrogen bomb
8. capability virus spread
9. head national oil
Size of news containing in top 10 trends in 2017
0. state special agent
1. south korea united
2. korean radio free
3. pressing court finalizing
4. solution saenuri party
5. country south korea
6. democratic party korea
7. percent month ago
8. condemning north korea
9. north korea alive
