In [1]:
import re
import string
import numpy as np
import skipthoughts
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

### Text Corpus Processing

In [94]:
def sentence_tokenizer(text): 
    remove = string.punctuation
    remove = re.sub("[\,\-\%']", "", remove) # don't remove hyphens, commas and apostrophes
    
    pattern_end_sentence = r"[{}]$".format(remove)
    text = re.sub(pattern_end_sentence, "", text)
    
    pattern = r"[{}]\s?".format(remove)
    return re.split(pattern, text)

def remove_punct(sentences): 
    remove = string.punctuation
    remove = re.sub("[\']", "", remove) # don't remove apostrophes
    
    pattern_punct = r"[{}]".format(remove)
    pattern_apoth = r"^\'"
    pattern_first_space = r"^\s"

    return list(
        filter(lambda sent : len(sent) != 0,
                       list(
        map(lambda sentence : 
            "".join(
                re.split(pattern_punct, 
                         re.sub(pattern_first_space, 
                                "", 
                                re.sub(pattern_apoth, "", sentence))
                        )
            ), sentences)
    )
              )
    )


def data_processing(df_text_corpus, col_list_text):
    texts_tokenized = df_text_corpus[col_list_text].map(lambda text : sentence_tokenizer(text))
    texts_tokenized_cleaned = texts_tokenized.map(lambda text : remove_punct(text))
    
    return texts_tokenized_cleaned

### Modelisation

In [95]:
def skipthought_encode(list_text):
    """
    Obtains sentence embeddings for each sentence in the list_text
    """
    enc_list_text = [None]*len(list_text)
    cum_sum_sentences = [0]
    sent_count = 0
    for text in list_text:
        sent_count += len(text)
        cum_sum_sentences.append(sent_count)

    all_sentences = [sent for text in list_text for sent in text]
    print('Loading pre-trained models...')
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    print('Encoding sentences...')
    enc_sentences = encoder.encode(all_sentences, verbose=False)

    for i in range(len(list_text)):
        begin = cum_sum_sentences[i]
        end = cum_sum_sentences[i+1]
        enc_list_text[i] = enc_sentences[begin:end]
    return enc_list_text
        
    
def summarize(list_text, original_list_text):
    """
    Performs summarization of list_text
    """
    nb_text = len(list_text)
    summary = [None]*nb_text
    
    print('Starting to encode...')
    enc_list_text = skipthought_encode(list_text)
    
    print('Encoding Finished')
    
    for i in range(nb_text):
        enc_text = enc_list_text[i]
        n_clusters = int(np.ceil(len(enc_text)**0.5))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = kmeans.fit(enc_text)
        avg = []
        closest = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,\
                                                   enc_text)
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        
        idx_sentences_choosed = sorted([closest[idx] for idx in ordering])
        
        summary[i] = '. '.join([original_list_text[i][idx] for idx in idx_sentences_choosed])
        
    print('Clustering Finished')
    
    return summary
      

### APPLICATION

In [102]:
text_test = ["""This was a failure by the US men’s national basketball team, no question about it. Team USA don’t suffer too many defeats, so each one tends to feel like a disaster. To be more specific, Wednesday’s 89-78 loss to France in the quarter-finals of the Fiba World Cup was the program’s first since 2006 (Thursday’s loss to Serbia guaranteed USA its worst placing in a major international tournament). While the France defeat came earlier in the tournament than many had predicted, nobody can claim it was completely unexpected. History, however, tells us that the Americans will be back and stronger than ever. In the meantime, let these results be another example of the health of the sport around the world."""]
text_corpus = pd.DataFrame({"article_text": text_test})
text_corpus

Unnamed: 0,article_text
0,This was a failure by the US men’s national ba...


In [103]:
original_list_text = [text.split('. ') for text in text_corpus['article_text'].values]
texts_processed = data_processing(text_corpus, "article_text")

In [104]:
list_text_processed = texts_processed.values.tolist()

In [105]:
summaries = summarize(list_text_processed, original_list_text)

Starting to encode...
Loading pre-trained models...
Loading model parameters...
Compiling encoders...
Loading tables...
Packing up...
Encoding sentences...
Encoding Finished
Clustering Finished


In [106]:
summaries[0]

'This was a failure by the US men’s national basketball team, no question about it. History, however, tells us that the Americans will be back and stronger than ever. In the meantime, let these results be another example of the health of the sport around the world.'

In [107]:
text_corpus["article_text"][0]

'This was a failure by the US men’s national basketball team, no question about it. Team USA don’t suffer too many defeats, so each one tends to feel like a disaster. To be more specific, Wednesday’s 89-78 loss to France in the quarter-finals of the Fiba World Cup was the program’s first since 2006 (Thursday’s loss to Serbia guaranteed USA its worst placing in a major international tournament). While the France defeat came earlier in the tournament than many had predicted, nobody can claim it was completely unexpected. History, however, tells us that the Americans will be back and stronger than ever. In the meantime, let these results be another example of the health of the sport around the world.'