In [1]:
from keras.preprocessing.text import Tokenizer

import json
from collections import OrderedDict


def create_and_save_tokenizer(data, num_words, outfilename):
    """Fit a Keras ``Tokenizer`` on ``data`` and write its state to a JSON file.

    Args:
        data: texts handed to ``Tokenizer.fit_on_texts``.
        num_words: vocabulary cap handed to the ``Tokenizer`` constructor.
        outfilename: path of the JSON file to (over)write.

    The JSON object holds ``num_words``, ``word_counts`` (as a list of
    ``[word, count]`` pairs so that order is kept), ``word_docs``,
    ``word_index``, ``document_count`` and ``index_docs``.
    """
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)

    tokenizer_dict = {
        # The cap is constructor configuration rather than fitted state, so it
        # has to be written explicitly; otherwise a tokenizer rebuilt from this
        # file has no cap at all.
        "num_words": num_words,
        "word_counts": list(tokenizer.word_counts.items()),
        "word_docs": tokenizer.word_docs,
        "word_index": tokenizer.word_index,
        "document_count": tokenizer.document_count,
        # JSON turns these integer keys into strings; readers must convert back.
        "index_docs": tokenizer.index_docs
    }

    with open(outfilename, 'w') as outfile:
        json.dump(tokenizer_dict, outfile)


def load_tokenizer_from_file(filename):
    """Rebuild a Keras ``Tokenizer`` from a JSON file of saved tokenizer state.

    Args:
        filename: path of a JSON file with the keys ``word_counts``,
            ``word_docs``, ``word_index``, ``document_count`` and
            ``index_docs``, plus optionally ``num_words``.

    Returns:
        A ``Tokenizer`` carrying the saved state. When the file has no
        ``num_words`` entry the tokenizer is built without a vocabulary cap.
    """
    with open(filename, 'r') as infile:
        tokenizer_data = json.load(infile)

    # .get keeps files written without "num_words" loadable (None = no cap).
    tokenizer = Tokenizer(num_words=tokenizer_data.get('num_words'))

    # word_counts is stored as a list of pairs so insertion order survives.
    tokenizer.word_counts = OrderedDict(tokenizer_data['word_counts'])
    tokenizer.word_docs = tokenizer_data['word_docs']
    tokenizer.word_index = tokenizer_data['word_index']
    tokenizer.document_count = tokenizer_data['document_count']
    # JSON object keys are always strings, but index_docs is keyed by the
    # integer word index; without this conversion integer lookups never match.
    tokenizer.index_docs = {
        int(word_id): doc_count
        for word_id, doc_count in tokenizer_data['index_docs'].items()
    }

    return tokenizer

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.utils import shuffle, resample
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.utils import to_categorical
from sklearn.exceptions import DataConversionWarning
import warnings

# Silence scikit-learn's DataConversionWarning for the rest of the notebook.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# os.getenv returns None when the variable is unset, and os.path.join(None, ...)
# then fails with an opaque TypeError, so stop here with a readable message.
DATADIR = os.getenv('DATADIR')
if DATADIR is None:
    raise EnvironmentError(
        "The DATADIR environment variable is not set; "
        "point it at the directory holding labelled_level2.csv.gz"
    )

# Every column is read as a Python object (dtype=object); nothing is parsed
# into numbers or dates at this stage.
labelled_level2 = pd.read_csv(
    os.path.join(DATADIR, 'labelled_level2.csv.gz'),
    dtype=object,
    compression='gzip'
)

# Create World taxon in case any items not identified
# through doc type in clean_content are still present
labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

# **** TAXONS TO CATEGORICAL -> DICT **********
# *********************************************

# Store level2taxon as a pandas categorical so every distinct taxon gets an
# integer code.
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Category codes start at 0; shift them by one so the numeric targets start at 1.
# The column is already categorical at this point, so its codes are read directly.
labelled_level2['level2taxon_code'] = labelled_level2['level2taxon'].cat.codes + 1

# Lookup from taxon code to taxon label, for use in model evaluation.
labels_index = dict(
    zip(labelled_level2['level2taxon_code'], labelled_level2['level2taxon'])
)

# ***** NEW COLUMNS FREQUENCY COUNTS **********
# *********************************************

# How many rows (content items) carry each taxon.
rows_by_taxon = labelled_level2.groupby(["level2taxon"])
labelled_level2['num_content_per_taxon'] = rows_by_taxon['level2taxon'].transform("count")

unique_taxon_count = labelled_level2['level2taxon'].nunique()
print('Number of unique level2taxons: {}'.format(unique_taxon_count))

# How many rows (taxons) each content item appears in.
rows_by_content = labelled_level2.groupby(["content_id"])
labelled_level2['num_taxon_per_content'] = rows_by_content['content_id'].transform("count")

# **** RESHAPE data long -> wide by taxon *******
# ***********************************************

# One row per content item, one column per taxon code. The text fields are part
# of the row index so they stay attached to the labels when X is split from Y.
multilabel = labelled_level2.pivot_table(
    index=['content_id', 'combined_text', 'title', 'description'],
    columns='level2taxon_code',
    values='num_taxon_per_content',
)

print('labelled_level2 shape: {}'.format(labelled_level2.shape))
print('multilabel (pivot table - no duplicates): {} '.format(multilabel.shape))

# The column labels deliberately stay as integer taxon codes: the resampling
# loop further down selects columns by integer. (An earlier
# `multilabel.columns.astype('str')` call threw its result away and therefore
# never changed them.)

# THIS IS WHY INDEXING IS NOT ZERO-BASED: a non-null cell means there was an
# entry for this taxon and this content_id; encode that as 1, and 0 otherwise.
binary_multilabel = multilabel.notnull().astype('int')

# shuffle to ensure no order is captured in train/dev/test splits
binary_multilabel = shuffle(binary_multilabel, random_state=0)

# Clear the name the pivot left on the columns axis ('level2taxon_code').
# Assigning None works whether Index.name is a plain attribute or a property;
# `del` raises AttributeError on pandas versions where it is a property.
binary_multilabel.columns.name = None

# ***** RESAMPLING OF MINORITY TAXONS **************
# ****************************************************
# - Training data = 80%
# - Development data = 10%
# - Test data = 10%

size_before_resample = binary_multilabel.shape[0]

size_train = int(0.8 * size_before_resample)  # train split
print('Size of train set:', size_train)

size_dev = int(0.1 * size_before_resample)  # dev split
print('Size of dev/test sets:', size_dev)

# A taxon with fewer tagged items than this (counted over the whole data set)
# is upsampled, and it is upsampled to exactly this many rows.
MIN_SUPPORT = 500

# Only training rows may be upsampled. Slice them by position: the frame was
# shuffled above, so the first size_train rows are the training portion.
# Selecting by content_id label against a filtered frame would pass labels that
# are absent from it (which newer pandas can reject with KeyError) and could
# pick up dev/test rows if a content_id ever occurred on more than one row.
training_rows = binary_multilabel.iloc[:size_train]

# content ids of the training rows, kept under the original name
training_indices = list(training_rows.index.get_level_values('content_id'))

# Collect the per-taxon samples and concatenate once at the end; concatenating
# inside the loop re-copies everything gathered so far on every pass.
upsampled_parts = []
upsampled_row_count = 0
last_taxon = len(binary_multilabel.columns) + 1

# assumes the column labels are exactly the codes 1..N — TODO confirm
for taxon in range(1, last_taxon):
    num_samples = int((binary_multilabel[taxon] == 1).sum())
    if num_samples >= MIN_SUPPORT:
        continue

    print("Taxon code:", taxon, "Taxon name:", labels_index[taxon])
    print("SMALL SUPPORT:", num_samples)

    df_minority = training_rows[training_rows[taxon] == 1]
    if df_minority.empty:
        # every item of this taxon fell into dev/test; nothing to sample from
        continue

    # Upsample minority class
    print(df_minority.shape)
    df_minority_upsampled = resample(df_minority,
                                     replace=True,  # sample with replacement
                                     n_samples=MIN_SUPPORT,
                                     random_state=123)  # reproducible results

    print("FIRST 5 IDs:",
          list(df_minority_upsampled.index.get_level_values('content_id')[:5]))

    upsampled_parts.append(df_minority_upsampled)
    upsampled_row_count += df_minority_upsampled.shape[0]

    # Display the running shape of everything upsampled so far
    print("UPSAMPLING:", (upsampled_row_count, df_minority_upsampled.shape[1]))

if upsampled_parts:
    upsampled_training = shuffle(pd.concat(upsampled_parts), random_state=0)
else:
    # nothing needed upsampling: an empty frame with the same columns
    upsampled_training = binary_multilabel.iloc[0:0]

# Original rows first, upsampled rows appended after them; the later
# train/dev/test split relies on this row order.
balanced_df = pd.concat([binary_multilabel, upsampled_training])

# ********** CREATE Y ARRAY **************
# ****************************************

# Convert the frame to an array: each row represents a content item, each
# column an individual taxon. The integer cast is applied to the values
# themselves; the earlier `balanced_df.astype(int)` and
# `balanced_df.columns.astype(int)` calls discarded their results and so did
# nothing.
# NOTE: from here on `binary_multilabel` names this numpy array, no longer the
# DataFrame built earlier in the cell.
binary_multilabel = balanced_df.values.astype(int)
print('Example row of multilabel array {}'.format(binary_multilabel[2]))

# ******* Metadata ***************
# ********************************

# Start the metadata frame from the content_id level of balanced_df's index,
# giving one metadata row per row of balanced_df.
content_id_level = balanced_df.index.get_level_values('content_id')
meta_df = pd.DataFrame(content_id_level)

meta_varlist = [
    'document_type',
    'first_published_at',
    'publishing_app',
    'primary_publishing_organisation',
]

# Attach each metadata field by looking it up per content_id in labelled_level2.
for meta_var in meta_varlist:
    value_by_content_id = dict(
        zip(labelled_level2['content_id'], labelled_level2[meta_var])
    )
    meta_df[meta_var] = meta_df['content_id'].map(value_by_content_id)

# Missing values become empty strings so the label encoder sees only strings.
meta_df = meta_df.replace(np.nan, '', regex=True)


def to_cat_to_hot(var):
    """One-hot encode one metadata column of the module-level ``meta_df``.

    The column is label-encoded to integer codes, the codes are stored in
    ``meta_df`` under ``<var>_cat`` (a side effect callers can see), and the
    codes are then expanded with ``to_categorical``.

    Args:
        var: name of the ``meta_df`` column to encode.

    Returns:
        Whatever ``to_categorical`` returns for the integer codes.
    """
    encoder = LabelEncoder()
    metavar_cat = var + "_cat"  # column that receives the integer codes
    meta_df[metavar_cat] = encoder.fit_transform(meta_df[var])
    # A `tf.cast(..., tf.float32)` call used to follow; its result was thrown
    # away, so it never changed the codes and has been dropped.
    return to_categorical(meta_df[metavar_cat])


# One-hot encode every metadata field except first_published_at, which is a
# date and is turned into features separately below.
dict_of_onehot_encodings = {}
for metavar in meta_varlist:
    if metavar == "first_published_at":
        continue
    print(metavar)
    dict_of_onehot_encodings[metavar] = to_cat_to_hot(metavar)

# First_published_at:
# Convert to timestamp, then scale between 0 and 1 so same weight as binary vars
meta_df['first_published_at'] = pd.to_datetime(meta_df['first_published_at'])
# column vector (n_rows, 1), the 2-D layout the scaler and np.concatenate need
first_published = np.array(meta_df['first_published_at']).reshape(-1, 1)

# NOTE(review): the scaler is handed datetime values and relies on an implicit
# numeric conversion; blank dates were turned into '' earlier and presumably
# become NaT here — confirm how those rows end up scaled.
scaler = MinMaxScaler()
first_published_scaled = scaler.fit_transform(first_published)

# Age of each item in whole years, computed once. The cast to 'timedelta64[Y]'
# truncates, so an item that is 5.9 years old has an age of 5.
# NOTE: 'today' makes these flags depend on the day the notebook is run.
age_in_years = (np.datetime64('today', 'D') - first_published).astype('timedelta64[Y]')

last_year = np.where(age_in_years < np.timedelta64(1, 'Y'), 1, 0)

last_2years = np.where(age_in_years < np.timedelta64(2, 'Y'), 1, 0)

last_5years = np.where(age_in_years < np.timedelta64(5, 'Y'), 1, 0)

# `>=`, not `>`: with whole-year ages an item aged between 5 and 6 years has
# age == 5, which is neither < 5 nor > 5, so it used to get no age flag at all.
olderthan5 = np.where(age_in_years >= np.timedelta64(5, 'Y'), 1, 0)

# Stack all metadata features column-wise into one matrix, one row per item.
meta = np.concatenate((dict_of_onehot_encodings['document_type'],
                       dict_of_onehot_encodings['primary_publishing_organisation'],
                       dict_of_onehot_encodings['publishing_app'],
                       first_published_scaled,
                       last_year,
                       last_2years,
                       last_5years,
                       olderthan5),
                      axis=1)

Number of unique level2taxons: 210
labelled_level2 shape: (172916, 24)
multilabel (pivot table - no duplicates): (113481, 210) 
Size of train set: 90784
Size of dev/test sets: 11348
Taxon code: 1 Taxon name: Administrative justice reform
SMALL SUPPORT: 12
(12, 210)
FIRST 5 IDs: ['5f641586-7631-11e4-a3cb-005056011aef', '5f641586-7631-11e4-a3cb-005056011aef', '04bfd44a-51f4-44bd-aef9-743bd7caac5d', '5dc6d4a6-7631-11e4-a3cb-005056011aef', 'fd0b66df-bab6-4e8a-bd7b-bb12a8ca63ca']
UPSAMPLING: (500, 210)
Taxon code: 2 Taxon name: Adoption, fostering and surrogacy
SMALL SUPPORT: 69
(52, 210)
FIRST 5 IDs: ['5e134ab5-7631-11e4-a3cb-005056011aef', '5dc818ea-7631-11e4-a3cb-005056011aef', '24d6d3fc-ab41-456d-adc7-a265674b45cc', '5814e171-f0e9-49d1-aff0-13549db57cb6', '2d125f0b-0fee-4e86-b151-1f25b24b00b0']
UPSAMPLING: (1000, 210)
Taxon code: 4 Taxon name: Armed Forces Covenant
SMALL SUPPORT: 186
(157, 210)
FIRST 5 IDs: ['fa738fcc-fa18-4a8e-8e40-0f6573db7032', '5f4fb92d-7631-11e4-a3cb-005056011aef',

(3, 210)
FIRST 5 IDs: ['c4445136-1bae-46e6-b8a1-55640dc14f80', 'ce9ceea5-e8b5-497d-ae73-e8290dcb5a5d', 'c4445136-1bae-46e6-b8a1-55640dc14f80', 'c4445136-1bae-46e6-b8a1-55640dc14f80', 'c2734a07-31c8-4306-bb1c-d8038dba326e']
UPSAMPLING: (13000, 210)
Taxon code: 34 Taxon name: Civil service reform
SMALL SUPPORT: 414
(325, 210)
FIRST 5 IDs: ['602282dc-7631-11e4-a3cb-005056011aef', '602115e7-7631-11e4-a3cb-005056011aef', '5efa9e54-7631-11e4-a3cb-005056011aef', '5f4ff85b-7631-11e4-a3cb-005056011aef', '5d30d6fc-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (13500, 210)
Taxon code: 36 Taxon name: Commercial fishing and fisheries
SMALL SUPPORT: 291
(240, 210)
FIRST 5 IDs: ['5f1e789f-7631-11e4-a3cb-005056011aef', '5fa54140-7631-11e4-a3cb-005056011aef', '5f5ea5da-7631-11e4-a3cb-005056011aef', '5f2025cf-7631-11e4-a3cb-005056011aef', '5751555d-eada-4095-a14f-1d0f54c1074a']
UPSAMPLING: (14000, 210)
Taxon code: 38 Taxon name: Conflict in fragile states
SMALL SUPPORT: 239
(203, 210)
FIRST 5 IDs: ['5d63fd0

(10, 210)
FIRST 5 IDs: ['5c84440d-7631-11e4-a3cb-005056011aef', '5c84440d-7631-11e4-a3cb-005056011aef', '5f10966c-7631-11e4-a3cb-005056011aef', '5c838905-7631-11e4-a3cb-005056011aef', '5c7166fe-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (26000, 210)
Taxon code: 68 Taxon name: European funds
SMALL SUPPORT: 88
(68, 210)
FIRST 5 IDs: ['60257c39-7631-11e4-a3cb-005056011aef', 'f98efb68-aeec-4bc9-beb1-8d51994b29af', 'eb61fa86-3d2d-444c-b9fb-d3f2d2b301b2', '5d8e53ec-7631-11e4-a3cb-005056011aef', '5f9d5f42-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (26500, 210)
Taxon code: 69 Taxon name: European single market
SMALL SUPPORT: 194
(159, 210)
FIRST 5 IDs: ['5e2ecff4-7631-11e4-a3cb-005056011aef', '5e3024b8-7631-11e4-a3cb-005056011aef', '5e0dbeaa-7631-11e4-a3cb-005056011aef', '5e0dbcec-7631-11e4-a3cb-005056011aef', '5ee5e6a5-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (27000, 210)
Taxon code: 70 Taxon name: Expenses and employee benefits
SMALL SUPPORT: 8
(8, 210)
FIRST 5 IDs: ['6d40aebf-c225-4d3c-a751-

UPSAMPLING: (38500, 210)
Taxon code: 106 Taxon name: Law and practice
SMALL SUPPORT: 55
(41, 210)
FIRST 5 IDs: ['5fe146f5-7631-11e4-a3cb-005056011aef', '90a51dda-e839-4ee8-b4b0-3669d9b07476', 'badb98d7-1c71-4125-83c2-ec0c6c19fcf6', '5fe16c90-7631-11e4-a3cb-005056011aef', 'b59056ac-f7e9-4415-96b5-79cc5cfb0a76']
UPSAMPLING: (39000, 210)
Taxon code: 107 Taxon name: Legal aid
SMALL SUPPORT: 143
(121, 210)
FIRST 5 IDs: ['1877322e-d067-44b1-9574-388e20b5ca11', 'ba95412e-7c05-4ab7-b970-9b15e806d261', 'b6d66f08-4a52-4223-aa63-1bda948c1b10', '7251620c-979b-4e7c-85a8-a8e372ce97c6', '5f50ec8e-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (39500, 210)
Taxon code: 108 Taxon name: Legal aid reform
SMALL SUPPORT: 56
(46, 210)
FIRST 5 IDs: ['6021dc5c-7631-11e4-a3cb-005056011aef', '5fed9e4a-7631-11e4-a3cb-005056011aef', '5f4937f3-7631-11e4-a3cb-005056011aef', '5f5f4d03-7631-11e4-a3cb-005056011aef', 'ba26cd19-472e-4306-8553-ac769ae8d94a']
UPSAMPLING: (40000, 210)
Taxon code: 109 Taxon name: Legislative proc

UPSAMPLING: (51500, 210)
Taxon code: 143 Taxon name: Population screening programmes
SMALL SUPPORT: 80
(67, 210)
FIRST 5 IDs: ['ad21b523-5539-4b4b-9094-749de9a71dfb', '462324b2-bf2c-4df2-a28c-920a308d9a07', 'be6411fb-9718-4458-b62d-4868a408f7e8', '2613099a-2ca9-414a-8cad-a6908c31da61', '7dd92cc3-69f9-4605-a5f4-6aa41f693607']
UPSAMPLING: (52000, 210)
Taxon code: 144 Taxon name: Postal service reform
SMALL SUPPORT: 47
(36, 210)
FIRST 5 IDs: ['5e2a235c-7631-11e4-a3cb-005056011aef', '5f534488-7631-11e4-a3cb-005056011aef', '60262bf3-7631-11e4-a3cb-005056011aef', 'cf19545b-b1c6-4c67-ae94-a4fbfac14ba9', '5e9cefb6-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (52500, 210)
Taxon code: 145 Taxon name: Pregnancy and birth
SMALL SUPPORT: 16
(12, 210)
FIRST 5 IDs: ['5dc93a44-7631-11e4-a3cb-005056011aef', '5dc93a44-7631-11e4-a3cb-005056011aef', '24d6d3fc-ab41-456d-adc7-a265674b45cc', 'b555a47a-d180-4bc7-979a-e687cf6a16bf', 'd282d35a-2bd2-4e14-a7a6-a04e6b10520f']
UPSAMPLING: (53000, 210)
Taxon code: 146 

(52, 210)
FIRST 5 IDs: ['5eb60c0a-7631-11e4-a3cb-005056011aef', '5c7b932d-7631-11e4-a3cb-005056011aef', '5c77e532-7631-11e4-a3cb-005056011aef', '5ec3e2ab-7631-11e4-a3cb-005056011aef', '5c781d9f-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (64500, 210)
Taxon code: 184 Taxon name: Transport modelling and appraisal
SMALL SUPPORT: 64
(45, 210)
FIRST 5 IDs: ['5e90613f-7631-11e4-a3cb-005056011aef', '5e5ca17b-7631-11e4-a3cb-005056011aef', 'b44edebd-dd56-4033-8c7c-db765ddf9574', '0b119ab4-4264-4e1c-91ae-f042f614ab40', '5e973830-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (65000, 210)
Taxon code: 185 Taxon name: Transport planning
SMALL SUPPORT: 47
(33, 210)
FIRST 5 IDs: ['5dbbd68a-7631-11e4-a3cb-005056011aef', '5f181249-7631-11e4-a3cb-005056011aef', '5c7ac7eb-7631-11e4-a3cb-005056011aef', '5c80cf6a-7631-11e4-a3cb-005056011aef', '5ebdd3d9-7631-11e4-a3cb-005056011aef']
UPSAMPLING: (65500, 210)
Taxon code: 186 Taxon name: Transport research and evaluation
SMALL SUPPORT: 26
(19, 210)
FIRST 5 IDs: ['a94



In [6]:
# **** TOKENIZE TEXT ********************
# ************************************

# Load the tokenizers from the JSON files created in clean_content.py, where
# they were fitted on both labelled and unlabelled data.
print('loading tokenizers')
combined_text_tokenizer_path = os.path.join(DATADIR, "combined_text_tokenizer.json")
tokenizer_combined_text = load_tokenizer_from_file(combined_text_tokenizer_path)

title_tokenizer_path = os.path.join(DATADIR,"title_tokenizer.json")
tokenizer_title = load_tokenizer_from_file(title_tokenizer_path)

description_tokenizer_path = os.path.join(DATADIR, "description_tokenizer.json")
tokenizer_description = load_tokenizer_from_file(description_tokenizer_path)

# Turn the combined text into integer sequences for the embedding layer.
print('converting combined text to sequences')
combined_texts = balanced_df.index.get_level_values('combined_text')
combined_text_sequences = tokenizer_combined_text.texts_to_sequences(combined_texts)

# Pad or cut every sequence at its end so all rows have the same length.
print('padding combined text sequences')
combined_text_sequences_padded = pad_sequences(
    combined_text_sequences,
    maxlen=1000,  # MAX_SEQUENCE_LENGTH
    padding='post',
    truncating='post',
)

loading tokenizers
converting combined text to sequences
padding combined text sequences


In [7]:
# Prepare the title and description matrices. They are fed in after the
# flatten layer (through fully connected layers).
# The stated intent is a one-hot encoding over the 10,000 most common words.
# NOTE(review): the recorded run reports a title matrix with 36216 columns,
# not 10,000 — confirm whether a num_words limit is actually in effect.

# Disabled here; the same conversion is run in a later cell:
# print('converting title text to sequences')
# title_sequences = tokenizer_title.texts_to_sequences(
#     balanced_df.index.get_level_values('title')
# )

# Reference notes for texts_to_matrix(texts):
# Return: numpy array of shape (len(texts), num_words).
# Arguments:
# texts: list of texts to vectorize.
# mode: one of "binary", "count", "tfidf", "freq" (default: "binary").

# Load the title tokenizer from its JSON file (this repeats the load done in
# the tokenizing cell above).
tokenizer_title = load_tokenizer_from_file(os.path.join(DATADIR,"title_tokenizer.json"))

In [8]:
# title_tfidf = tokenizer_title.texts_to_matrix(balanced_df.index.get_level_values('title'), 'tfidf')
# print(title_tfidf.shape)    

In [9]:
print('converting title text to sequences')
# The titles are stored as a level of balanced_df's row index.
title_texts = balanced_df.index.get_level_values('title')
title_sequences = tokenizer_title.texts_to_sequences(title_texts)

# Turn the sequences into one matrix row per title (default matrix mode).
# print('one-hot encoding title sequences')
title_onehot = tokenizer_title.sequences_to_matrix(title_sequences)
print(title_onehot.shape)

converting title text to sequences
(188481, 36216)


In [10]:
print('converting description text to sequences')
# The descriptions are stored as a level of balanced_df's row index.
description_texts = balanced_df.index.get_level_values('description')
description_sequences = tokenizer_description.texts_to_sequences(description_texts)

# Turn the sequences into one matrix row per description (default matrix mode).
print('one-hot encoding description sequences')
description_onehot = tokenizer_description.sequences_to_matrix(description_sequences)

converting description text to sequences
one-hot encoding description sequences


In [11]:
# ******* TRAIN/DEV/TEST SPLIT DATA ****************
# **************************************************

# - Training data = 80%
# - Development data = 10%
# - Test data = 10%

print('train/dev/test splitting')
size_after_resample = balanced_df.shape[0]
print('size_after_resmaple ={}'.format(size_after_resample))

end_dev = size_train + size_dev
print('end_dev ={}'.format(end_dev))

# Row ranges (start, end) that cut the original, pre-resampling rows into
# train / dev / test. The test range runs to the end of the original rows.
train_range = (0, size_train)
dev_range = (size_train, end_dev)
test_range = (end_dev, size_before_resample)
splits = [train_range, dev_range, test_range]
print('splits ={}'.format(splits))

# Row range holding the resampled training rows, which sit after all the
# original rows.
resampled_range = (size_before_resample, size_after_resample)
resampled_split = [resampled_range]
print('resampled_split ={}'.format(resampled_split))

def split(data_to_split, split_indices):
    """Slice ``data_to_split`` along its first axis, once per index pair.

    Args:
        data_to_split: any object supporting ``data[start:end]`` slicing.
        split_indices: iterable of ``(start, end)`` pairs.

    Returns:
        A tuple with one slice per pair, in the order the pairs were given.
    """
    return tuple(data_to_split[lo:hi] for lo, hi in split_indices)

train/dev/test splitting
size_after_resmaple =188481
end_dev =102132
splits =[(0, 90784), (90784, 102132), (102132, 113481)]
resampled_split =[(113481, 188481)]


In [12]:
def _split_with_resampled(data):
    """Return (train, dev, test, resampled) row subsets of ``data``.

    The train subset already has the resampled rows appended to it.
    """
    train_part, dev_part, test_part = split(data, splits)
    resampled_part = split(data, resampled_split)[0]
    train_part = np.concatenate([train_part, resampled_part], axis=0)
    return train_part, dev_part, test_part, resampled_part


print('extract combined text arrays')
x_train, x_dev, x_test, x_resampled = _split_with_resampled(combined_text_sequences_padded)

print('extract metadata arrays')
meta_train, meta_dev, meta_test, meta_resampled = _split_with_resampled(meta)

print('extract title arrays')
title_train, title_dev, title_test, title_resampled = _split_with_resampled(title_onehot)

print('extract description arrays')
desc_train, desc_dev, desc_test, desc_resampled = _split_with_resampled(description_onehot)

print('extract Y arrays')
y_train, y_dev, y_test, y_resampled = _split_with_resampled(binary_multilabel)

extract combined text arrays
extract metadata arrays
extract title arrays
extract description arrays
extract Y arrays


In [14]:
from scipy import sparse

print('x_train.shape = {}'.format(x_train.shape))

# Convert each training array to a CSR sparse matrix, in this order.
(x_train_sparse,
 meta_train_sparse,
 title_train_sparse,
 description_train_sparse,
 y_train_sparse) = (
    sparse.csr_matrix(dense_array)
    for dense_array in (x_train, meta_train, title_train, desc_train, y_train)
)

print('x_train_sparse.shape = {}'.format(x_train_sparse.shape))

x_train.shape = (165784, 1000)
x_train_sparse.shape = (165784, 1000)


In [15]:
# Convert the dev arrays to CSR sparse matrices. x/meta/title/y are replaced
# under their own names; the description array gets a new name
# (description_dev), leaving desc_dev as the dense original.
x_dev, meta_dev, title_dev, description_dev, y_dev = (
    sparse.csr_matrix(dev_array)
    for dev_array in (x_dev, meta_dev, title_dev, desc_dev, y_dev)
)

# Same conversion for the test arrays (description_test vs. dense desc_test).
x_test, meta_test, title_test, description_test, y_test = (
    sparse.csr_matrix(test_array)
    for test_array in (x_test, meta_test, title_test, desc_test, y_test)
)

In [16]:
# sparse.save_npz(os.path.join(DATADIR,'x_train_sparse.npz'), x_train_sparse)

In [17]:
# import codecs, json 

# x_train_list = x_train.tolist() # nested lists with same data, indices

In [18]:
#json.dump(x_train_list, codecs.open(os.path.join(DATADIR,'train_arrays.json'), 'w', encoding='utf-8'), separators=(',', ':'), sort_keys=True, indent=4) 

In [19]:
#np.save(os.path.join(DATADIR,'train_arrays.npz'), x_train)

In [20]:
print('saving arrays')
# NOTE(review): these values are scipy sparse matrices rather than ndarrays;
# np.savez presumably stores them as pickled object arrays — confirm that the
# code reading this file loads it accordingly.
train_arrays = {
    'x': x_train_sparse,
    'meta': meta_train_sparse,
    'title': title_train_sparse,
    'desc': description_train_sparse,
    'y': y_train_sparse,
}
np.savez(os.path.join(DATADIR,'train_arrays.npz'), **train_arrays)

saving arrays


In [21]:
# Save the dev arrays. `desc` now uses description_dev, the CSR version built
# alongside the other sparse dev arrays; previously the dense desc_dev was
# written here while every other entry (and the training file's `desc`) was
# sparse.
# NOTE(review): confirm the code reading dev_arrays.npz expects `desc` in the
# same form as the other entries.
np.savez(os.path.join(DATADIR,'dev_arrays.npz'),
                    x=x_dev,
                    meta=meta_dev,
                    title=title_dev,
                    desc=description_dev,
                    y=y_dev)

In [22]:
# Save the test arrays (compressed). `desc` now uses description_test, the CSR
# version built alongside the other sparse test arrays; previously the dense
# desc_test was written here while every other entry was sparse.
# NOTE(review): confirm the code reading test_arrays.npz expects `desc` in the
# same form as the other entries.
np.savez_compressed(os.path.join(DATADIR,'test_arrays.npz'),
                    x=x_test,
                    meta=meta_test,
                    title=title_test,
                    desc=description_test,
                    y=y_test)

In [24]:
# Content ids for each split, row-aligned with the feature arrays.
id_train, id_dev, id_test = split(meta_df['content_id'], splits)
# Every training array has the resampled rows appended after the original
# training rows; do the same for the ids so id_train lines up with x_train
# row for row instead of stopping short at the original training rows.
id_resampled = split(meta_df['content_id'], resampled_split)[0]
id_train = pd.concat([id_train, id_resampled])

In [25]:
# Write the per-split content ids to one archive, keyed train / dev / test.
content_id_path = os.path.join(DATADIR,'content_id_arrays.npz')
np.savez(content_id_path, train=id_train, dev=id_dev, test=id_test)