In [1]:
import pandas as pd
import pickle

# Load the news table and extract the article summaries as a plain Python list.
# pandas reads empty CSV cells as NaN (a float); the regex clean-up applied later
# only accepts strings, so missing summaries are replaced by empty strings here.
df = pd.read_csv('data/df_news.csv')
data = df['summary'].fillna('').tolist()

In [3]:
def sent_to_words(sentences, min_len=2, max_len=15):
    """Lazily tokenize every sentence into a list of word tokens.

    Each item of ``sentences`` is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess``. ``deacc=True`` strips accent marks;
    ``min_len`` / ``max_len`` are forwarded as the token length bounds.

    Yields one token list per input sentence, in input order.
    """
    from gensim.utils import simple_preprocess

    for raw_sentence in sentences:
        text = str(raw_sentence)
        yield simple_preprocess(text, deacc=True, min_len=min_len, max_len=max_len)


def remove_stopwords(texts, default='english', extensions=None):
    """Remove stop words from every document in ``texts``.

    Parameters
    ----------
    texts : iterable
        Documents to filter. A ``list``/``tuple`` is treated as an already
        tokenized document and filtered token by token. Anything else is
        converted with ``str`` and tokenized with
        ``gensim.utils.simple_preprocess`` first.
    default : str or None
        Language of the NLTK stop-word list to start from
        (requires ``nltk.download('stopwords')``). ``None`` skips NLTK.
    extensions : iterable of str or None
        Additional words to remove.

    Returns
    -------
    list of list
        One filtered token list per input document.
    """
    # A set gives constant-time membership tests in the filtering loop.
    stop_words = set()
    if default is not None:
        # Imported lazily so that default=None works without the NLTK corpus.
        from nltk.corpus import stopwords
        stop_words.update(stopwords.words(default))
    if extensions is not None:
        stop_words.update(extensions)

    filtered = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Token lists are filtered as-is: re-tokenizing them would re-apply
            # simple_preprocess's default length bounds and silently drop
            # tokens the caller deliberately kept.
            tokens = doc
        else:
            from gensim.utils import simple_preprocess
            tokens = simple_preprocess(str(doc))
        filtered.append([word for word in tokens if word not in stop_words])
    return filtered


def make_bigrams(data_words):
    """Train a bigram phrase model on ``data_words`` and apply it to each document.

    A gensim ``Phrases`` model is fitted with ``min_count=5`` and
    ``threshold=100`` (a higher threshold yields fewer phrases), then wrapped
    in a ``Phraser`` which is applied to every document.

    Returns a tuple ``(bigram_docs, phrases_model, phraser)``.
    """
    from gensim.models import Phrases
    from gensim.models.phrases import Phraser

    phrases_model = Phrases(data_words, min_count=5, threshold=100)
    phraser = Phraser(phrases_model)
    bigram_docs = [phraser[tokens] for tokens in data_words]
    return bigram_docs, phrases_model, phraser


def make_trigrams(data_words, bigram, bigram_mod):
    """Train a trigram phrase model on top of an existing bigram model.

    ``bigram`` is the trained ``Phrases`` model and ``bigram_mod`` its
    ``Phraser`` wrapper (both as returned by ``make_bigrams``). The trigram
    model is fitted on the bigram-transformed corpus with ``threshold=100``.

    Returns a tuple ``(trigram_docs, trigram_model, trigram_phraser)``.
    """
    from gensim.models import Phrases
    from gensim.models.phrases import Phraser

    trigram_model = Phrases(bigram[data_words], threshold=100)
    trigram_phraser = Phraser(trigram_model)

    trigram_docs = []
    for tokens in data_words:
        # Apply the bigram merge first, then the trigram merge on its result.
        trigram_docs.append(trigram_phraser[bigram_mod[tokens]])
    return trigram_docs, trigram_model, trigram_phraser


def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    '''
    Lemmatization for LDA topic modeling.

    Each document (a sequence of tokens) is joined with spaces, run through
    spaCy, and only the lemmas of tokens whose coarse POS tag is in
    ``allowed_postags`` are kept.

    texts: iterable of token sequences.
    allowed_postags: collection of spaCy ``pos_`` tags to keep.

    Returns a list with one list of lemma strings per document.
    '''
    # Materialize once so generators are supported and emptiness can be checked.
    texts = list(texts)
    if not texts:
        # Nothing to process: skip the spaCy import and model load.
        return []

    import spacy

    # POS tag reference: https://spacy.io/api/annotation
    # Load the English model without parser and NER (not needed here).
    # python3 -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        # lemmatize and keep only the tokens whose POS tag is allowed
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def lemmatization2(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    '''
    Lemmatization for BERT.
    Although BERT has its own tokenizer, we need to match the words used for
    BERT and LDA.

    Each document (a sequence of tokens) is joined with spaces and run through
    spaCy. Tokens whose coarse POS tag is in ``allowed_postags`` are replaced
    by their lemma; all other tokens are kept in their original surface form.

    texts: iterable of token sequences.
    allowed_postags: collection of spaCy ``pos_`` tags to lemmatize.

    Returns a list with one list of plain strings per document.
    '''
    # Materialize once so generators are supported and emptiness can be checked.
    texts = list(texts)
    if not texts:
        # Nothing to process: skip the spaCy import and model load.
        return []

    import spacy

    # POS tag reference: https://spacy.io/api/annotation
    # Load the English model without parser and NER (not needed here).
    # python3 -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        # Use token.text (a str) rather than the Token object for the tokens
        # that are not lemmatized, so the result is a homogeneous list of
        # strings that does not hold references to the spaCy Doc.
        texts_out.append([token.lemma_ if token.pos_ in allowed_postags else token.text
                          for token in doc])
    return texts_out


def lemmatization3(texts):
    '''
    Lemmatization for leave-out estimator.

    Each document (a sequence of tokens) is joined with spaces and run through
    spaCy; every token is replaced by its lemma and no token is dropped.

    texts: iterable of token sequences.

    Returns a list with one list of lemma strings per document.
    '''
    # Materialize once so generators are supported and emptiness can be checked.
    texts = list(texts)
    if not texts:
        # Nothing to process: skip the spaCy import and model load.
        return []

    import spacy

    # Annotation reference: https://spacy.io/api/annotation
    # Load the English model without parser and NER (not needed here).
    # python3 -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        # lemmatize every token and keep all of them
        texts_out.append([str(token.lemma_) for token in doc])
    return texts_out

In [4]:
def create_dict_corpus(data_words):
    """Build the gensim dictionary and bag-of-words corpus for tokenized documents.

    ``data_words`` is iterated twice (once to build the dictionary, once to
    build the corpus).

    Returns a tuple ``(corpus, id2word)`` where ``id2word`` is the
    ``gensim.corpora.Dictionary`` and ``corpus`` holds the ``doc2bow`` result
    of every document.
    """
    from gensim.corpora import Dictionary

    # Token <-> integer id mapping
    id2word = Dictionary(data_words)

    # Bag-of-words representation of each document
    corpus = list(map(id2word.doc2bow, data_words))

    return corpus, id2word

In [4]:
def preprocessing_lda(data):
    """Prepare raw text documents for LDA topic modeling.

    Steps: strip e-mail addresses, collapse whitespace, drop single quotes,
    tokenize, remove stop words (English list plus news-source names), merge
    bigrams, and lemmatize keeping only nouns, adjectives, verbs, adverbs and
    proper nouns.

    data: iterable of strings.

    Returns a tuple ``(data_lematized, corpus, id2word)``: the lemmatized token
    lists, their bag-of-words corpus and the gensim dictionary.
    """
    import re

    # Raw strings: '\S' and '\s' are invalid escapes in a plain string literal
    # and raise a SyntaxWarning on recent Python versions.
    email_re = re.compile(r'\S*@\S*\s?')
    whitespace_re = re.compile(r'\s+')

    cleaned = []
    for sent in data:
        sent = email_re.sub('', sent)        # remove e-mails
        sent = whitespace_re.sub(' ', sent)  # remove new lines / repeated whitespace
        sent = sent.replace("'", "")         # remove distracting single quotes
        cleaned.append(sent)

    # tokenize words and clean up text
    data_words = list(sent_to_words(cleaned))

    # remove stop words
    # need to remove the news source names
    data_words_nostops = remove_stopwords(data_words,
                                          extensions=['from', 'subject', 're', 'edu',
                                                       'use', 'rt', 'cnn', 'fox', 'huffington', 'breitbart'])

    # form bigrams
    data_words_bigrams, _, _ = make_bigrams(data_words_nostops)

    # lemmatize, keeping only noun, adj, verb, adv, propn;
    # other tokens are not useful for topic modeling
    data_lematized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])

    corpus, id2word = create_dict_corpus(data_lematized)

    return data_lematized, corpus, id2word

In [5]:
# Run the LDA preprocessing on the raw summaries: yields the lemmatized token lists,
# the bag-of-words corpus and the gensim dictionary.
texts_processed_lda, corpus_lda, id2word_lda = preprocessing_lda(data)

In [6]:
# Persist the LDA artefacts. The context managers guarantee each file is
# flushed and closed even if pickling fails.
with open('data/texts_processed_lda.pkl', 'wb') as fh:
    pickle.dump(texts_processed_lda, fh)
with open('data/corpus_lda.pkl', 'wb') as fh:
    pickle.dump((corpus_lda, id2word_lda), fh)

In [7]:
def preprocessing_bert(data):
    """Prepare raw text documents for BERT.

    Steps: strip e-mail addresses, collapse whitespace, tokenize (keeping
    tokens of length 1 to 30), remove the news-source names, merge bigrams,
    and lemmatize nouns, adjectives, verbs, adverbs and proper nouns while
    keeping every other token as context.

    data: iterable of strings.

    Returns the list of processed token lists.
    """
    import re

    # Raw strings: '\S' and '\s' are invalid escapes in a plain string literal
    # and raise a SyntaxWarning on recent Python versions.
    email_re = re.compile(r'\S*@\S*\s?')
    whitespace_re = re.compile(r'\s+')

    # remove e-mails, then new lines / repeated whitespace
    cleaned = [whitespace_re.sub(' ', email_re.sub('', sent)) for sent in data]

    # tokenize words and clean up text
    data_words = list(sent_to_words(cleaned, min_len=1, max_len=30))

    # Remove the news source names only (ordinary stop words stay as context).
    # The tokens are filtered directly so the length bounds chosen above are
    # preserved exactly.
    source_names = {'cnn', 'fox', 'huffington', 'breitbart'}
    data_words_nostops = [[word for word in doc if word not in source_names]
                          for doc in data_words]

    # form bigrams on the filtered tokens (previously the unfiltered tokens
    # were used, so the source names were never actually removed)
    data_words_bigrams, _, _ = make_bigrams(data_words_nostops)

    # lemmatize only noun, adj, verb, adv, propn, following the lemmatization for LDA;
    # keep the other tokens, which will be used as context
    data_lematized = lemmatization2(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])

    return data_lematized

In [8]:
# Run the BERT preprocessing on the raw summaries; the result is one token list per document.
text_processed_bert = preprocessing_bert(data)

In [9]:
# Make sure every token is a plain string before pickling, then persist the
# result. The context manager guarantees the file is flushed and closed.
text_processed_bert = [[str(token) for token in doc] for doc in text_processed_bert]
with open('data/texts_processed_bert.pkl', 'wb') as fh:
    pickle.dump(text_processed_bert, fh)

In [10]:
# Sanity check: display the first token of the first processed document.
text_processed_bert[0][0]

'reese_witherspoon'

In [11]:
def preprocessing_lo(data):
    """Prepare raw text documents for the leave-out estimator.

    Steps: strip e-mail addresses, collapse whitespace, drop single quotes,
    tokenize, remove English stop words, merge bigrams, and lemmatize every
    token (no token is dropped at the lemmatization step).

    data: iterable of strings.

    Returns a tuple ``(data_lematized, corpus, id2word)``: the lemmatized token
    lists, their bag-of-words corpus and the gensim dictionary.
    """
    import re

    # Raw strings: '\S' and '\s' are invalid escapes in a plain string literal
    # and raise a SyntaxWarning on recent Python versions.
    email_re = re.compile(r'\S*@\S*\s?')
    whitespace_re = re.compile(r'\s+')

    cleaned = []
    for sent in data:
        sent = email_re.sub('', sent)        # remove e-mails
        sent = whitespace_re.sub(' ', sent)  # remove new lines / repeated whitespace
        sent = sent.replace("'", "")         # remove distracting single quotes
        cleaned.append(sent)

    # tokenize words and clean up text
    data_words = list(sent_to_words(cleaned))

    # remove stop words
    data_words_nostops = remove_stopwords(data_words)

    # form bigrams
    data_words_bigrams, _, _ = make_bigrams(data_words_nostops)

    # lemmatize all tokens regardless of POS tag, and keep all of them
    data_lematized = lemmatization3(data_words_bigrams)
    corpus, id2word = create_dict_corpus(data_lematized)

    return data_lematized, corpus, id2word

In [12]:
# Run the leave-out preprocessing on the raw summaries: yields the lemmatized token lists,
# the bag-of-words corpus and the gensim dictionary.
text_processed_lo, corpus_lo, id2word_lo = preprocessing_lo(data)

In [13]:
# Persist the leave-out artefacts. The context managers guarantee each file is
# flushed and closed even if pickling fails.
with open('data/texts_processed_lo.pkl', 'wb') as fh:
    pickle.dump(text_processed_lo, fh)
with open('data/corpus_lo.pkl', 'wb') as fh:
    pickle.dump((corpus_lo, id2word_lo), fh)