In [1]:
import os
import re
import numpy as np
import pandas as pd
import random
import pickle
import string
import spacy
from gensim.models import KeyedVectors

from nltk.corpus import stopwords

Using TensorFlow backend.


In [2]:
nlp = spacy.load('en')

### Create train and test sets

For the summary generating model, I decided to focus on only the articles associated with crime news (cluster 0).

In [3]:
# load articles
clustered_articles_dict = pickle.load(open("clustered_articles_dict.pkl", "rb"))
len(clustered_articles_dict)

251328

In [3]:
# extract articles from cluster 0
# cluster 0 is crime-related articles
crime_dict = [article_dict for article_dict in clustered_articles_dict if article_dict["cluster"]==0]
print(len(crime_dict))

27760


In [5]:
# extract articles from other clusters
other_topic_dict = [article_dict for article_dict in clustered_articles_dict if article_dict["cluster"]!=0 and article_dict["cluster"]]
random.seed(918)
random.shuffle(other_topic_dict)
print(len(other_topic_dict))

223568


In [4]:
# create train and test set from the crime-related articles
# set aside 500 articles for testing
num_training = 500

random.seed(910)
random.shuffle(crime_dict)
test_articles = crime_dict[:num_training]
train_articles = crime_dict[num_training:]

print("Number of train articles: {}".format(len(train_articles)))
print("Number of test articles: {}".format(len(test_articles)))

Number of train articles: 27260
Number of test articles: 500


In [7]:
# pickle for future use
pickle.dump(train_articles, open("train_articles_dict.pkl", "wb"))
pickle.dump(test_articles, open("test_articles_dict.pkl", "wb"))
pickle.dump(other_topic_dict[:500], open("other_topic_test_dict.pkl", "wb"))

### Prepare articles and highlights for modeling

In [6]:
train_articles_dict = pickle.load(open("train_articles_dict.pkl", "rb"))
test_articles_dict = pickle.load(open("test_articles_dict.pkl", "rb"))

In [7]:
print("Number of train articles: {}".format(len(train_articles_dict)))
print("Number of test articles: {}".format(len(test_articles_dict)))

Number of train articles: 27260
Number of test articles: 500


In [8]:
# sample of the articles and summaries
train_articles_dict[100:102]

[{'article': 'CNN reason believe little girl spotted India Madeleine McCann British girl missing years family spokesman said Thursday Reports spotting girl bearing resemblance child disappeared vacation parents Portugal sparked frenzy Twitter Clarence Mitchell spokesman parents Kate Gerry said latest reports credible tips proved incorrect years learnt reports seriously Mitchell said parents adding suggest breakthrough said aware requests DNA match girl seen Leh northern India regional police chief India said knew sighting report originated Indian newspaper recovered girl Leh said Abdul Gani Mir deputy inspector general police central Kashmir Leh question carrying DNA girl checked checked officers ground Leh categorically told recovered girl Mir told CNN Madeleine McCann years old disappeared condo resort Portugal parents dined restaurant nearby Journalist Mukhtar Ahmad Srinagar contributed report',
  'cluster': 0,
  'file': './cnn/stories/99f5e941b36b56c273fc9582c6526d310acd16e4.story'

In [9]:
# extract articles and summaries
train_articles = [article["article"] for article in train_articles_dict]
train_highlights = [article["highlights"] for article in train_articles_dict]

print(len(train_articles))
print(len(train_highlights))

27260
27260


In [10]:
# last minute cleaning
train_articles = [re.sub(r"\'", "", article) for article in train_articles]
train_highlights = [re.sub(r"\'", "", article) for article in train_highlights]

train_articles = [re.sub(r"million[s]*", " million", article) for article in train_articles]
train_highlights = [re.sub(r"million[s]*", " million", article) for article in train_highlights]

In [11]:
def count_words(count_dict, text):
    """
    Create a dictionary with the number of occurrences of each word.
    """
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [12]:
# create a vocabulary from articles and highlights (with number of times each word was used)
word_counts = {}

count_words(word_counts, train_articles)
count_words(word_counts, train_highlights)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 111409


After comparing words in the vocabulary dictionary with the words in different pre-trained word embeddings, Word2Vec Google News accounted for the most words in the vocabulary dictionary.

The Word2Vec Google News was trained on part of Google News dataset, and contains 300-dimensional vectors for 3 million words and phrases.

In [None]:
# download and unzip Word2Vec Google News
!curl -O https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin.gz

In [14]:
# load and examine Word2Vec Google News
word2vec = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
print("Number of word vectors in word2vec: {}".format(len(word2vec.vocab)))
print("Length of embedding: {}".format(len(word2vec.word_vec("jump"))))

Number of word vectors in word2vec: 3000000
Length of embedding: 300


In [17]:
# create dictionary of 
embeddings_index = {}
for word in list(word2vec.vocab):
    embeddings_index[word] = word2vec.word_vec(word)
        
print('Word embeddings:', len(embeddings_index))
print('Embedding length:', len(word2vec.word_vec(word)))

Word embeddings: 3000000
Embedding length: 300


In [18]:
# count the number of missing words (that are used more than 20 times threshold)
num_missing_words = 0
threshold = 20
missing_words = []

for word, count in word_counts.items():
    if count > threshold:
        if word not in word2vec.vocab:
            missing_words.append(word)
            num_missing_words += 1
            
missing_ratio = (num_missing_words/len(word_counts))*100
            
print("Number of words missing from CN: {}".format(num_missing_words))
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from CN: 614
Percent of words that are missing from vocabulary: 0.5499999999999999%


In [None]:
# create a dictionary for converting words to an index value
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in word2vec.vocab:
        vocab_to_int[word] = value
        value += 1

For more information about the special symbols, check out: https://medium.com/towards-data-science/sequence-to-sequence-model-introduction-and-concepts-44d9b41cd42d

In [None]:
# add special symbols to existing vocab_to_int dictionary
codes = ["<PAD>","<UNK>","<GO>"]   

for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

In [None]:
# create a dictionary for convert index value to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

In [2]:
usage_ratio = (len(vocab_to_int) / len(word_counts))*100

print("Total number of unique words:", len(word_counts))
print("Number of words included:", len(vocab_to_int))
print("Percent of words included: {}%".format(usage_ratio))

Total number of unique words: 111409
Number of words included: 87889
Percent of words included: 78.8885996643%


In [None]:
# pickle for future use
pickle.dump(vocab_to_int, open("./model_files/vocab_to_int.pkl", "wb"))
pickle.dump(int_to_vocab, open("./model_files/int_to_vocab.pkl", "wb"))

In [21]:
# create matrix of word embeddings for each word in vocab_to_int dictionary
embedding_dim = 300 # length of word embedding vectors
nb_words = len(vocab_to_int)

word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in word2vec.vocab:
        word_embedding_matrix[i] = word2vec.word_vec(word)
    else:
        # create a vector of random numbers if the word is not included in the Word2Vec
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

print(len(word_embedding_matrix))

# pickle for future use
pickle.dump(word_embedding_matrix, open("./model_files/word_embedding_matrix.pkl","wb"))

87889


In [22]:
def convert_to_ints(list_text, word_count, unk_count):
    """
    Convert each work in text into an integer, while counting the total number of words and <UNK>. Include <EOS> at the end of the text.
    """
    ints = []
    word_count = 0
    unk_count = 0
    
    for sentence in list_text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
        
    return ints, word_count, unk_count

In [23]:
# Apply convert_to_ints to clean_summaries and clean_texts
int_summaries, word_count_summaries, unk_count_summaries = convert_to_ints(train_highlights)
int_articles, word_count_articles, unk_count_articles = convert_to_ints(train_articles)

total_word_count = word_count_summaries + word_count_articles
total_unk_count = unk_count_summaries + unk_count_summaries

print("Total number of words in headlines: {}".format(total_word_count))
print("Total number of UNKs in headlines: {}".format(total_unk_count))
print("Percent of words that are UNK: {}%".format(total_unk_count/total_word_count,4)*100))

Total number of words in headlines: 5371728
Total number of UNKs in headlines: 62941
Percent of words that are UNK: 1.17%


In [26]:
def unk_counter(int_text):
    """
    Count the total number of <UNK> in text.
    int_text is vector of integers representing text (string)
    """
    unk_count = 0
    for word in int_text:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [27]:
# articles with more than 20 <UNK> are excluded
unk_limit = 20
sorted_summaries = []
sorted_articles = []

# get length of each article
len_articles = [len(int_articles) for article in int_articles]

# sort the summaries and articles by the length of the articles from shortest to longest to reduce the number of <PAD> added
for length in range(min(len_articles), max(len_articles)+1):
    for index, words in enumerate(int_summaries):
        if (unk_counter(int_summaries[index]) <= unk_limit and
            unk_counter(int_articles[index]) <= unk_limit and
            length == len(int_articles[index])):
            sorted_summaries.append(int_summaries[index])
            sorted_articles.append(int_articles[index])
        
print(len(sorted_summaries))
print(len(sorted_articles))

# pickle for future use
pickle.dump(sorted_summaries, open("./model_files/sorted_summaries.pkl","wb"))
pickle.dump(sorted_articles, open("./model_files/sorted_articles.pkl","wb"))
pickle.dump(len(sorted_articles), open("./model_files/sorted_articles_length.pkl","wb"))

12653
12653


In [28]:
def pad_text_batch(text_batch):
    """
    Add <PAD> so that each text within the same batch has the same length.
    """
    max_text = max([len(text) for text in text_batch])
    return [text + [vocab_to_int["<PAD>"]] * (max_text - len(text)) for text in text_batch]

In [29]:
def get_batches(summaries, articles, batch_size):
    """
    Create dictionaries of individual batches with articles, summaries, and the respective lengths. 
    This way all of the articles and aummaries don't have to be loaded into memory at the same time.
    """
    # create directory named "batches"
    !mkdir batches
    
    num_batches = len(articles)//batch_size

    # create and pickle a dict of relevant info (articles and summaries) for each batch
    for batch_i in range(0, num_batches):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        articles_batch = articles[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_text_batch(summaries_batch))
        pad_articles_batch = np.array(pad_text_batch(articles_batch))

        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))

        pad_articles_lengths = []
        for article in pad_articles_batch:
            pad_articles_lengths.append(len(article))

        data = {"summaries_batch":pad_summaries_batch, "articles_batch":pad_articles_batch, 
                "summaries_lengths":pad_summaries_lengths, "articles_lengths":pad_articles_lengths}
        file = "./batches/batch{}.pkl".format(batch_i)
        pickle.dump(data, open(file,"wb"))
    
    print("Number of batches created: {}".format(num_batches))
        
get_batches(sorted_summaries, sorted_articles, batch_size=64)

Number of batches created: 197
