In [1]:
from typing import Any, Dict
from kedro.context import load_context

from time import time
import pandas as pd
import numpy as np
from scipy import sparse
import os
import requests

from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import matutils
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus

## Get data

In [2]:
def preprocess_20NG(df):
    """Dataset-specific hook for 20 Newsgroups; currently a pass-through.

    Args:
        df: the frame produced by ``get_data_20NG``.

    Returns:
        The very same object that was passed in (no copy, no changes).
    """
    return df

def get_data_20NG():
    """Load 20 Newsgroups (train subset followed by test subset) as a DataFrame.

    Returns:
        The result of ``preprocess_20NG`` applied to a DataFrame with a
        ``text`` column holding every post and a ``timestamp`` column that
        is 0 for every row.
    """
    texts = []
    for subset in ('train', 'test'):
        texts.extend(fetch_20newsgroups(subset=subset).data)
    frame = pd.DataFrame({'text': texts, 'timestamp': [0] * len(texts)})
    return preprocess_20NG(frame)


def preprocess_UNGD(df):
    """Reduce the UN General Debates frame to the ``timestamp``/``text`` schema.

    The ``year`` column is exposed as ``timestamp``. The rename is done on a
    derived frame instead of in place, so the caller's DataFrame keeps its
    original columns and the calling cell can be re-run safely.

    Args:
        df: DataFrame containing at least ``year`` and ``text`` columns.

    Returns:
        A new DataFrame with exactly the columns ``timestamp`` and ``text``.

    Raises:
        KeyError: if ``timestamp`` or ``text`` is missing after the rename.
    """
    renamed = df.rename(columns={'year': 'timestamp'})
    return renamed[['timestamp', 'text']].copy()

def get_data_UNGD(id='1Gx1oBjcsJOgklxLGZ8iEoNJ5XHnCEcsm',
                  destination='data/01_raw/un-general-debates.csv'):
    """Return the UN General Debates dataset, downloading the CSV on first use.

    When ``destination`` already exists no request is made. Otherwise the
    file is fetched, written to a temporary ``.part`` file and only moved to
    ``destination`` once the download completed, so an interrupted download
    can never be mistaken for a valid cached file on the next run.

    Args:
        id: file identifier sent as the ``id`` query parameter. The name
            shadows the builtin but is kept for backward compatibility.
        destination: path of the cached CSV file.

    Returns:
        The DataFrame produced by ``preprocess_UNGD`` on the CSV contents.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """

    def get_confirm_token(response):
        # NOTE(review): presumably the host's confirmation step for large
        # files; the cookie value is echoed back as the 'confirm' parameter.
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768
        parent = os.path.dirname(destination)
        if parent:
            os.makedirs(parent, exist_ok=True)
        partial = destination + '.part'
        try:
            with open(partial, "wb") as f:
                for chunk in response.iter_content(CHUNK_SIZE):
                    if chunk: # filter out keep-alive new chunks
                        f.write(chunk)
            # Publish the file only after every chunk has been written.
            os.replace(partial, destination)
        finally:
            # Still present only if the download failed part-way.
            if os.path.exists(partial):
                os.remove(partial)

    if not os.path.isfile(destination):

        URL = "https://docs.google.com/uc?export=download"

        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            response = session.get(URL, params={'id': id}, stream=True)
            token = get_confirm_token(response)

            if token:
                params = {'id': id, 'confirm': token}
                response = session.get(URL, params=params, stream=True)

            # Fail loudly instead of saving an HTML error page as the CSV.
            response.raise_for_status()

            print('Downloading dataset...')
            save_response_content(response, destination)
            print('Done downloading')

    df = pd.read_csv(destination)
    df = preprocess_UNGD(df) # apply pre-processing specific to the UN dataset
    return df


def preprocess_SOGE(df):
    """Reduce the SOGE verbatims frame to the ``timestamp``/``text`` schema.

    ``Date`` becomes ``timestamp`` and ``raisons_recommandation`` becomes
    ``text``. The rename is done on a derived frame instead of in place, so
    the caller's DataFrame keeps its original columns.

    Args:
        df: DataFrame containing at least ``Date`` and
            ``raisons_recommandation`` columns.

    Returns:
        A new DataFrame with exactly the columns ``timestamp`` and ``text``.

    Raises:
        KeyError: if ``timestamp`` or ``text`` is missing after the rename.
    """
    renamed = df.rename(columns={'Date': 'timestamp',
                                 'raisons_recommandation': 'text'})
    return renamed[['timestamp', 'text']].copy()

def get_data_SOGE(data_path='data/01_raw/verbatims_soge.csv'):
    """Read the SOGE verbatims CSV and normalise it with ``preprocess_SOGE``.

    Args:
        data_path: location of the semicolon-separated, latin-1 encoded file.

    Returns:
        The DataFrame returned by ``preprocess_SOGE``.
    """
    raw_frame = pd.read_csv(data_path, encoding='latin-1', sep=';')
    return preprocess_SOGE(raw_frame)

In [3]:
def get_data(dataset='20NG'):
    """Load one of the supported datasets by key.

    Args:
        dataset: ``'20NG'``, ``'UNGD'`` or ``'SOGE'``.

    Returns:
        The DataFrame produced by the matching ``get_data_*`` loader.

    Raises:
        ValueError: if ``dataset`` is not one of the supported keys. An
            unknown key used to surface as an ``UnboundLocalError``.
    """
    if dataset == '20NG':
        return get_data_20NG()
    if dataset == 'UNGD':
        return get_data_UNGD()
    if dataset == 'SOGE':
        return get_data_SOGE()
    raise ValueError(
        "Unknown dataset {!r}; expected one of '20NG', 'UNGD', 'SOGE'.".format(dataset))

In [4]:
df = get_data('20NG')  # dataset key, one of: '20NG', 'UNGD', 'SOGE'

## Pre-process data

In [5]:
def split_by_paragraph(docs, timestamps):
    """Split every document on a period followed by a newline.

    Each resulting piece becomes its own entry and inherits the timestamp
    found at the position of the document it came from.

    Args:
        docs: sequence of strings.
        timestamps: sequence indexable by document position.

    Returns:
        Tuple ``(pieces, piece_timestamps)`` of two lists of equal length.
    """
    pieces, piece_timestamps = [], []
    for position, document in enumerate(docs):
        fragments = document.split('.\n')
        pieces.extend(fragments)
        piece_timestamps.extend([timestamps[position]] * len(fragments))
    return pieces, piece_timestamps

def lowerize(docs):
    """Return the lower-cased string form of every document.

    A new list is built instead of overwriting ``docs`` element by element,
    so a caller that passes an array backed by a DataFrame column does not
    get that column modified. This matches the other cleaning helpers that
    return fresh lists.

    Args:
        docs: iterable of documents; each one is converted with ``str``.

    Returns:
        List of lower-cased strings, in input order.
    """
    return [str(doc).lower() for doc in docs]

def tokenize(docs):
    """Split every document into word tokens.

    Tokens are the matches of the regular expression ``\\w+``. A new list is
    returned instead of overwriting ``docs`` in place, so the caller's
    container still holds the raw strings afterwards.

    Args:
        docs: iterable of strings.

    Returns:
        List with one token list per document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(doc) for doc in docs]

def remove_stop_words(docs, language='english'):
    """Drop stop words from every tokenised document.

    A new list of lists is returned; ``docs`` itself is not modified.

    Args:
        docs: iterable of token lists.
        language: name of the NLTK stop-word list to use. Defaults to
            ``'english'``, the value that was previously hard-coded.

    Returns:
        List with one filtered token list per document, in input order.
    """
    stop_words = set(stopwords.words(language))
    return [[w for w in doc if w not in stop_words] for doc in docs]

def remove_numbers(docs):
    """Drop purely numeric tokens; tokens that merely contain digits stay.

    Args:
        docs: iterable of token lists.

    Returns:
        New list with one filtered token list per document.
    """
    cleaned = []
    for doc in docs:
        kept = []
        for token in doc:
            if token.isnumeric():
                continue
            kept.append(token)
        cleaned.append(kept)
    return cleaned

def remove_word_with_length(docs, length=1):
    """Keep only the tokens that are strictly longer than ``length``.

    With the default ``length=1`` this removes single-character tokens.

    Args:
        docs: iterable of token lists.
        length: tokens with this many characters or fewer are discarded.

    Returns:
        New list with one filtered token list per document.
    """
    cleaned = []
    for doc in docs:
        kept = []
        for token in doc:
            if len(token) > length:
                kept.append(token)
        cleaned.append(kept)
    return cleaned

def lemmatize(docs):
    """Replace every token by its WordNet lemma.

    Args:
        docs: iterable of token lists.

    Returns:
        New list with one lemmatised token list per document.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    lemmatized = []
    for doc in docs:
        lemmatized.append([to_lemma(token) for token in doc])
    return lemmatized

def add_bigram(docs, min_bigram_count=20):
    """Append detected phrase tokens to each document, in place.

    A ``Phrases`` model is fitted on ``docs`` with ``min_count`` set to
    ``min_bigram_count``. Every token of the transformed document that
    contains ``'_'`` is appended to the end of that document.

    NOTE(review): the test is only "contains an underscore", so a token that
    already had one before phrase detection presumably gets appended again.
    Confirm whether that is intended.

    Args:
        docs: indexable collection of token lists; each list is extended.
        min_bigram_count: forwarded to ``Phrases`` as ``min_count``.

    Returns:
        The same ``docs`` object that was passed in.
    """
    phrase_model = Phrases(docs, min_count=min_bigram_count)
    for position in range(len(docs)):
        document = docs[position]
        # Collect the matches first, then append them to the document.
        joined = [token for token in phrase_model[document] if '_' in token]
        document.extend(joined)
    return docs

In [6]:
def _report_vocab_change(docs, previous_size, starting_size, removed_label):
    """Print how much one cleaning step shrank the vocabulary; return the new size."""
    current_size = len(Dictionary(docs))
    removed = previous_size - current_size
    # Guard against an empty starting vocabulary, and round after scaling so
    # the percentage prints without float artefacts.
    share = round(100.0 * removed / starting_size, 1) if starting_size else 0.0
    print('\tRemoved {} {} from dictionary. It represents {}% of total words in starting vocabulary'.format(removed, removed_label, share))
    print('\tCurrent length of the vocabulary:', current_size)
    return current_size


def preprocess_dataset(dataset:pd.DataFrame,
                       flag_split_by_paragraph:bool,
                       flag_lemmatize:bool,
                       flag_bigram:bool,
                       min_bigram_count:int,
                       flag_word_analysis:bool) -> list:
    """Clean and tokenise a text dataset and re-index its timestamps.

    Steps, in order: sort by ``timestamp``, optional paragraph split,
    lower-casing, tokenisation, optional bigram augmentation, stop-word
    removal, numeric-token removal, one-character-token removal and optional
    lemmatisation. Finally each timestamp is replaced by the rank of its
    value among the sorted unique timestamps (0, 1, 2, ...).

    ``dataset`` is not modified: sorting produces a new frame and the texts
    are copied into a list, so the calling cell can be re-run safely.

    Args:
        dataset: frame with a ``text`` and a ``timestamp`` column.
        flag_split_by_paragraph: split documents with ``split_by_paragraph``.
        flag_lemmatize: apply ``lemmatize`` as the last cleaning step.
        flag_bigram: apply ``add_bigram`` right after tokenisation.
        min_bigram_count: forwarded to ``add_bigram``.
        flag_word_analysis: print the vocabulary size after each cleaning
            step (builds a gensim ``Dictionary`` each time, which is slower).

    Returns:
        A two-element list ``[docs, timestamps]``: ``docs`` is a list of
        token lists, ``timestamps`` a numpy array of integer ranks.

    Raises:
        ValueError: if ``dataset`` lacks a ``text`` or ``timestamp`` column.
    """
    t0 = time()

    print('\nCurrent set of parameters :\n')
    print('\tflag_split_by_paragraph : {}'.format(flag_split_by_paragraph))
    print('\tflag_lemmatize : {}'.format(flag_lemmatize))
    print('\tflag_bigram : {}'.format(flag_bigram))
    print('\tmin_bigram_count : {}'.format(min_bigram_count))
    print('\tflag_word_analysis : {}\n'.format(flag_word_analysis))

    print('\nStart preprocessing on dataset')

    if "text" not in dataset.columns:
        raise ValueError('Dataset does not have a column named "text". You must rename the your text column to "text".')
    if "timestamp" not in dataset.columns:
        raise ValueError('Dataset does not have a column named "timestamp". You must rename your time column to "timestamp".')

    # Work on a sorted copy so the caller's frame keeps its order and index.
    ordered = dataset.sort_values('timestamp').reset_index(drop=True)

    # A list (not ``.values``) so later steps can never write into the frame.
    docs = ordered['text'].tolist()
    timestamps = ordered['timestamp'].values

    if flag_split_by_paragraph:
        print('\nSplitting by paragraph...')
        docs, timestamps = split_by_paragraph(docs, timestamps)

    print('\nLowerizing...')
    docs = lowerize(docs)

    print('\nTokenizing...')
    docs = tokenize(docs)

    if flag_bigram:
        print('\nAdding bigrams...')
        docs = add_bigram(docs, min_bigram_count)

    if flag_word_analysis:
        print('\nBasic word analysis enabled. It will take more time to compute......')
        len_starting_vocab = len(Dictionary(docs))
        print('\nBeginning dictionary contains : {} words'.format(len_starting_vocab))
    else:
        print('\nBasic word analysis disabled.')
        len_starting_vocab = None
    len_vocab = len_starting_vocab

    # (banner, cleaning function, label used in the "Removed ..." report)
    steps = [
        ('\nRemoving stop words...',
         remove_stop_words, 'stopwords'),
        ('\nRemoving unique numbers (not words that contain numbers)...',
         remove_numbers, 'numeric words'),
        ('\nRemoving words that contain only one character...',
         lambda tokens: remove_word_with_length(tokens, length=1), 'one length characters'),
    ]
    if flag_lemmatize:
        steps.append(('\nLemmatizing...', lemmatize, 'words'))

    for banner, step, removed_label in steps:
        print(banner)
        docs = step(docs)
        if flag_word_analysis:
            len_vocab = _report_vocab_change(docs, len_vocab, len_starting_vocab, removed_label)

    print('\nTimestamps processing...')
    # Map every distinct timestamp to its rank among the sorted unique values.
    unique_time = np.unique(timestamps)
    mapper_time = {value: rank for rank, value in enumerate(unique_time)}
    timestamps = np.array([mapper_time[stamp] for stamp in timestamps])

    print('\nDone in {} minutes'.format(int((time()-t0)/60)))

    return [docs, timestamps]

In [7]:
# Run the cleaning pipeline on the loaded frame: lemmatisation and word
# analysis enabled, paragraph splitting and bigram augmentation disabled.
docs, timestamps = preprocess_dataset(df, flag_split_by_paragraph=False, flag_lemmatize=True, flag_bigram=False,
                                      min_bigram_count=0, flag_word_analysis=True)


Current set of parameters :

	flag_split_by_paragraph : False
	flag_lemmatize : True
	flag_bigram : False
	min_bigram_count : 0
	flag_word_analysis : True


Start preprocessing on dataset

Lowerizing...

Tokenizing...

Basic word analysis enabled. It will take more time to compute......
2020-04-03 10:28:53,638 - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
2020-04-03 10:28:56,008 - gensim.corpora.dictionary - INFO - adding document #10000 to Dictionary(118331 unique tokens: ['15', '2', '60s', '70s', 'a']...)
2020-04-03 10:28:58,158 - gensim.corpora.dictionary - INFO - built Dictionary(173807 unique tokens: ['15', '2', '60s', '70s', 'a']...) from 18846 documents (total 5986471 corpus positions)

Beginning dictionary contains : 173807 words

Removing stop words...
2020-04-03 10:28:58,474 - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
2020-04-03 10:29:00,381 - gensim.corpora.dictionary - INFO - adding

## Split data

In [14]:
def remove_vocab(docs, vocab):
    """Keep only the tokens that belong to ``vocab``.

    The result is a plain list of lists. Wrapping it in ``np.array`` fails
    on recent NumPy when documents have different lengths, and silently
    yields a 2-D string array when they all have the same length.

    Args:
        docs: iterable of token lists.
        vocab: container supporting ``in`` (e.g. a token-to-id mapping).

    Returns:
        List with one filtered token list per document, in input order.
        Documents may become empty.
    """
    return [[w for w in doc if w in vocab] for doc in docs]

def remove_empty(docs, timestamps):
    """Drop documents that contain no tokens, together with their timestamps.

    Emptiness is decided with ``len`` rather than by comparing against
    ``[]``, so the check also works when a document is a tuple or an array
    row instead of a list.

    Args:
        docs: iterable of sized token sequences.
        timestamps: sequence indexable by document position.

    Returns:
        Tuple ``(kept_docs, kept_timestamps)`` of two lists of equal length.
    """
    kept_docs = []
    kept_timestamps = []
    for i, doc in enumerate(docs):
        if len(doc) > 0:
            kept_docs.append(doc)
            kept_timestamps.append(timestamps[i])
    return kept_docs, kept_timestamps

def remove_by_threshold(docs, timestamps, threshold):
    """Keep the documents whose length is strictly greater than ``threshold``.

    Args:
        docs: iterable of sized token sequences.
        timestamps: sequence indexable by document position.
        threshold: documents with this many tokens or fewer are discarded.

    Returns:
        Tuple ``(kept_docs, kept_timestamps)`` of two lists of equal length.
    """
    survivors = [
        (doc, timestamps[position])
        for position, doc in enumerate(docs)
        if len(doc) > threshold
    ]
    kept_docs = [doc for doc, _ in survivors]
    kept_timestamps = [stamp for _, stamp in survivors]
    return kept_docs, kept_timestamps

def convert_to_bow(docs, dictionary):
    """Convert every document with ``dictionary.doc2bow``.

    Args:
        docs: iterable of token lists.
        dictionary: object exposing a ``doc2bow(doc)`` method.

    Returns:
        List with one ``doc2bow`` result per document, in input order.
    """
    corpus = []
    for doc in docs:
        corpus.append(dictionary.doc2bow(doc))
    return corpus

def create_list_words(in_docs, vocab):
    """Flatten the documents into one list of token ids.

    Args:
        in_docs: iterable of token lists.
        vocab: object whose ``token2id`` mapping holds every token.

    Returns:
        Flat list of ids, document after document, in token order.

    Raises:
        KeyError: if a token is missing from ``vocab.token2id``.
    """
    token_ids = []
    for doc in in_docs:
        for token in doc:
            token_ids.append(vocab.token2id[token])
    return token_ids

def create_doc_indices(in_docs):
    """Build, for every token position, the index of the document it is in.

    A document of length ``k`` at position ``j`` contributes ``k`` copies of
    ``j``; empty documents contribute nothing.

    Args:
        in_docs: iterable of sized token sequences.

    Returns:
        Flat list of integer document indices.
    """
    indices = []
    for doc_position, doc in enumerate(in_docs):
        indices.extend([int(doc_position)] * len(doc))
    return indices

def create_sparse(doc_indices, words, n_docs, vocab_size):
    """Build a CSR document-term count matrix from parallel index lists.

    Every ``(doc_indices[k], words[k])`` pair contributes a 1. The matrix is
    assembled in COO format and then converted to CSR.

    Args:
        doc_indices: row index of each token occurrence.
        words: column (token id) of each token occurrence.
        n_docs: number of rows of the matrix.
        vocab_size: number of columns of the matrix.

    Returns:
        ``scipy.sparse`` CSR matrix of shape ``(n_docs, vocab_size)``.
    """
    ones = [1] * len(doc_indices)
    coordinates = (doc_indices, words)
    counts = sparse.coo_matrix((ones, coordinates), shape=(n_docs, vocab_size))
    return counts.tocsr()

def create_sparse_matrix(docs, vocab):
    """Turn tokenised documents into a sparse document-term matrix.

    Args:
        docs: sized collection of token lists.
        vocab: sized vocabulary object accepted by ``create_list_words``.

    Returns:
        The matrix built by ``create_sparse`` with ``len(docs)`` rows and
        ``len(vocab)`` columns.
    """
    vocab_size = len(vocab)

    # Column coordinate (token id) of every token occurrence.
    token_ids = create_list_words(docs, vocab)

    # Row coordinate (document position) of every token occurrence.
    row_ids = create_doc_indices(docs)

    return create_sparse(row_ids, token_ids, len(docs), vocab_size)

In [15]:
def split_data(docs:np.array,
               timestamps:np.array,
               extreme_no_below,
               extreme_no_above,
               test_size:float,
               val_size:float,
               random_state=None) -> Dict[str,Any]:
    """Shuffle the corpus, split it into train/val/test and vectorise each part.

    The vocabulary is built from the training documents only and pruned with
    ``Dictionary.filter_extremes``. Tokens outside that vocabulary are
    removed from all three sets, then empty documents are dropped. Test
    documents with a single token are dropped as well, and every remaining
    test document is cut into a first and a second half.

    Args:
        docs: sequence of token lists.
        timestamps: sequence of the same length as ``docs``.
        extreme_no_below: forwarded to ``filter_extremes`` as ``no_below``.
        extreme_no_above: forwarded to ``filter_extremes`` as ``no_above``.
        test_size: fraction of the documents assigned to the test set.
        val_size: fraction of the documents assigned to the validation set.
        random_state: optional seed for a reproducible shuffle. ``None``
            (default) keeps using numpy's global random state.

    Returns:
        Dict with, for each of ``train``, ``val``, ``test``, ``test_h1`` and
        ``test_h2``, the keys ``<name>_docs``, ``<name>_corpus`` and
        ``<name>_sparse`` (named ``test_docs_h1`` etc. for the halves), plus
        ``train_timestamps``, ``val_timestamps``, ``test_timestamps`` and
        ``dictionary``.

    Raises:
        ValueError: if ``docs`` and ``timestamps`` differ in length, or if
            the sizes are negative or add up to more than 1.
    """
    if len(docs) != len(timestamps):
        raise ValueError('docs and timestamps must have the same length, got {} and {}.'.format(len(docs), len(timestamps)))
    # Out-of-range sizes give a negative train length; the index arithmetic
    # below would then wrap around and make the val/test sets overlap.
    if test_size < 0 or val_size < 0 or test_size + val_size > 1:
        raise ValueError('test_size and val_size must be non-negative and sum to at most 1, got {} and {}.'.format(test_size, val_size))

    # Split indexes into train/val/test sets
    print('\nSplitting indexes into train/val/test')
    num_docs = len(docs)

    val_len = int(val_size * num_docs)
    test_len = int(test_size * num_docs)
    train_len = int(num_docs - val_len - test_len)

    if random_state is None:
        idx_permute = np.random.permutation(num_docs).astype(int)
    else:
        idx_permute = np.random.RandomState(random_state).permutation(num_docs).astype(int)

    train_idx = idx_permute[:train_len]
    val_idx = idx_permute[train_len:train_len + val_len]
    test_idx = idx_permute[train_len + val_len:train_len + val_len + test_len]

    # Split docs and timestamps into train/val/test sets
    print('\nSpliiting docs and timestamps into train/val/test')
    train_docs = [docs[i] for i in train_idx]
    val_docs = [docs[i] for i in val_idx]
    test_docs = [docs[i] for i in test_idx]

    train_timestamps = [timestamps[i] for i in train_idx]
    val_timestamps = [timestamps[i] for i in val_idx]
    test_timestamps = [timestamps[i] for i in test_idx]

    print('\tNumber of documents in train set : {} [this should be equal to {} and {}]'.format(len(train_docs), train_len, len(train_timestamps)))
    print('\tNumber of documents in test set : {} [this should be equal to {} and {}]'.format(len(test_docs), test_len, len(test_timestamps)))
    print('\tNumber of documents in validation set: {} [this should be equal to {} and {}]'.format(len(val_docs), val_len, len(val_timestamps)))

    # Create a dictionary representation of the training documents.
    print('\nCreating dictionary...')
    train_dictionary = Dictionary(train_docs)

    print('\tFiltering extremes...')
    # Prune the vocabulary with the caller-supplied thresholds.
    train_dictionary.filter_extremes(no_below=extreme_no_below, no_above=extreme_no_above)
    extreme_no_below_str = str(extreme_no_below) if extreme_no_below > 1 else str(extreme_no_below*100)+'%'
    extreme_no_above_str = str(extreme_no_above) if extreme_no_above > 1 else str(extreme_no_above*100)+'%'
    print('\tKeeping words in no less than {} documents & in no more than {} documents'.format(extreme_no_below_str, extreme_no_above_str))
    print('\tNumber of unique tokens: %d' % len(train_dictionary))

    # Remove words not in train_data
    print('\nRemoving words not in train data .....')
    train_vocab = train_dictionary.token2id
    train_docs = remove_vocab(train_docs, train_vocab)
    val_docs = remove_vocab(val_docs, train_vocab)
    test_docs = remove_vocab(test_docs, train_vocab)

    # Remove empty documents
    print('\nRemoving empty documents')
    train_docs, train_timestamps = remove_empty(train_docs, train_timestamps)
    test_docs, test_timestamps = remove_empty(test_docs, test_timestamps)
    val_docs, val_timestamps = remove_empty(val_docs, val_timestamps)

    # Remove test documents with length=1 (they cannot be cut in two halves)
    print('\nRemoving test documents with length 1')
    test_docs, test_timestamps = remove_by_threshold(test_docs, test_timestamps, 1)

    # Split test set in 2 halves; for odd lengths the extra token goes to h2
    print('\nSplitting test set in 2 halves')
    test_docs_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in test_docs]
    test_docs_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in test_docs]

    # Convert to Bag-of-Words representation
    print('\nCreating bow representation...')
    train_corpus = convert_to_bow(train_docs, train_dictionary)
    val_corpus = convert_to_bow(val_docs, train_dictionary)
    test_corpus = convert_to_bow(test_docs, train_dictionary)

    test_corpus_h1 = convert_to_bow(test_docs_h1, train_dictionary)
    test_corpus_h2 = convert_to_bow(test_docs_h2, train_dictionary)

    print('\tTrain bag of words shape : {}'.format(len(train_corpus)))
    print('\tVal bag of words shape : {}'.format(len(val_corpus)))
    print('\tTest bag of words shape : {}'.format(len(test_corpus)))
    print('\tTest set 1 bag of words shape : {}'.format(len(test_corpus_h1)))
    print('\tTest set 2 bag of words shape : {}'.format(len(test_corpus_h2)))

    # Convert to sparse document-term matrices
    print('\nCreating sparse matrices')
    train_sparse = create_sparse_matrix(train_docs, train_dictionary)
    test_sparse = create_sparse_matrix(test_docs, train_dictionary)
    test_sparse_h1 = create_sparse_matrix(test_docs_h1, train_dictionary)
    test_sparse_h2 = create_sparse_matrix(test_docs_h2, train_dictionary)
    val_sparse = create_sparse_matrix(val_docs, train_dictionary)

    print('\nDone splitting data.')

    return dict(
        train_docs=train_docs, train_corpus=train_corpus, train_sparse=train_sparse,
        val_docs=val_docs, val_corpus=val_corpus, val_sparse=val_sparse,
        test_docs=test_docs, test_corpus=test_corpus, test_sparse=test_sparse,
        test_docs_h1=test_docs_h1, test_corpus_h1=test_corpus_h1, test_sparse_h1=test_sparse_h1,
        test_docs_h2=test_docs_h2, test_corpus_h2=test_corpus_h2, test_sparse_h2=test_sparse_h2,
        train_timestamps=train_timestamps, val_timestamps=val_timestamps, test_timestamps=test_timestamps,
        dictionary=train_dictionary
    )

In [16]:
# Split the cleaned corpus: 5% test, 15% validation, the remainder for training.
res = split_data(docs, timestamps, extreme_no_below=100, extreme_no_above=0.70, test_size=0.05, val_size=0.15)


Splitting indexes into train/val/test

Spliiting docs and timestamps into train/val/test
	Number of documents in train set : 15078 [this should be equal to 15078 and 15078]
	Number of documents in test set : 942 [this should be equal to 942 and 942]
	Number of documents in validation set: 2826 [this should be equal to 2826 and 2826]

Creating dictionary...
2020-04-03 10:35:10,790 - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
2020-04-03 10:35:12,400 - gensim.corpora.dictionary - INFO - adding document #10000 to Dictionary(100923 unique tokens: ['age', 'also', 'andrew', 'audiophile', 'call']...)
2020-04-03 10:35:13,305 - gensim.corpora.dictionary - INFO - built Dictionary(127065 unique tokens: ['age', 'also', 'andrew', 'audiophile', 'call']...) from 15078 documents (total 2622559 corpus positions)
	Filtering extremes...
2020-04-03 10:35:13,444 - gensim.corpora.dictionary - INFO - discarding 124350 tokens: [('audiophile', 7), ('chan', 29), ('f