# Create training data

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Each line of the training file is "<label> <token> <token> ...".
tags = []
docs = []

vocab_size = 10000

#train_ds = 'dataset/ng20/20ng-train-no-stop.txt'
train_ds = 'dataset/raw/20ng-train-stemmed.txt'
with open(train_ds) as raw_text:
    for line in raw_text:
        tokens = line.split()
        if not tokens:
            # A blank line carries neither a label nor text; skip it instead
            # of failing on tokens[0].
            continue
        tags.append(tokens[0])
        docs.append(' '.join(tokens[1:]))

# Raw term-count features.
tf = CountVectorizer(min_df=3,
                     max_df=0.80,
                     max_features=vocab_size)

# Sublinear, L2-normalised tf-idf features.
tfidf = TfidfVectorizer(min_df=3,
                        max_df=0.80,
                        max_features=vocab_size,
                        use_idf=True, sublinear_tf=True,
                        norm='l2')

# Pick the vectorizer through `scheme` so the two names cannot drift apart.
vectorizers = {'tf': tf, 'tfidf': tfidf}
scheme = 'tfidf'
transformer = vectorizers[scheme]

x_train = transformer.fit_transform(docs)

# Sort the label names so that the label -> integer id mapping is the same on
# every run; iteration order of a set of strings is not stable across
# interpreter sessions.
categories = sorted(set(tags))
label_to_index = {cate: idx for idx, cate in enumerate(categories)}

# Integer class id of every training document, in file order.
indices = [label_to_index[t] for t in tags]

y_train = np.array(indices)

# Create testing data

In [2]:
from sklearn.preprocessing import OneHotEncoder

# Each line of the test file is "<label> <token> <token> ...".
tags = []
docs = []

#test_ds = 'dataset/ng20/20ng-test-no-stop.txt'
test_ds = 'dataset/raw/20ng-test-stemmed.txt'

with open(test_ds) as raw_text:
    for line in raw_text:
        tokens = line.split()
        if not tokens:
            # A blank line carries neither a label nor text; skip it instead
            # of failing on tokens[0].
            continue
        tags.append(tokens[0])
        docs.append(' '.join(tokens[1:]))

# Reuse the vocabulary fitted on the training documents.
x_test = transformer.transform(docs)

# Map label names to the integer ids defined by `categories`.
label_to_index = {cate: idx for idx, cate in enumerate(categories)}
unknown_labels = sorted(set(tags) - set(label_to_index))
if unknown_labels:
    # Fail loudly (and not only when assertions are enabled) if the test file
    # contains a label that the training file did not.
    raise ValueError(
        "test labels missing from training categories: {}".format(unknown_labels))

indices = [label_to_index[t] for t in tags]
y_test = np.array(indices)

# One-hot encode the class ids; the encoder is fitted on the training labels
# and then applied to the test labels.
enc = OneHotEncoder()
gnd_train = enc.fit_transform(y_train.reshape((-1, 1)))
gnd_test = enc.transform(y_test.reshape((-1, 1)))

print("train shape={}".format(x_train.shape))
print("gnd train shape={}".format(gnd_train.shape))
print("test shape={}".format(x_test.shape))
print("gnd test shape={}".format(gnd_test.shape))

train shape=(11293, 10000)
gnd train shape=(11293, 20)
test shape=(7528, 10000)
gnd test shape=(7528, 20)


In [3]:
import numpy as np

# Sort by doc length
# Document length = number of strictly positive entries in the row. The count
# is taken on the sparse matrix itself so the full documents-by-vocabulary
# matrix is never materialised as a dense array.
X_len = np.asarray((x_train > 0).sum(axis=1)).ravel()

# Row order from shortest to longest document; applied to features and labels
# alike so they stay aligned.
indices = np.argsort(X_len)
x_train_nz = x_train[indices, :]

y_train_nz = gnd_train[indices, :]

In [4]:
# Remove too short document
# Count positive entries per row on the sparse matrix (no dense copy needed).
X_len = np.asarray((x_train_nz > 0).sum(axis=1)).ravel()

# Keep documents with at least `min_doc_len` distinct vocabulary terms. The
# mask is built once and applied to the features, the labels and the
# original-row indices so all three stay aligned.
min_doc_len = 10
long_enough = X_len >= min_doc_len
x_train_nz = x_train_nz[long_enough, :]
y_train_nz = y_train_nz[long_enough, :]
filtered_indices = indices[long_enough]

In [5]:
# Remove too long document
# Count positive entries per row on the sparse matrix (no dense copy needed).
X_len = np.asarray((x_train_nz > 0).sum(axis=1)).ravel()

# Keep documents with at most `max_doc_len` distinct vocabulary terms. The
# mask is built once and applied to the features, the labels and the
# original-row indices so all three stay aligned.
max_doc_len = 500
short_enough = X_len <= max_doc_len
x_train_nz = x_train_nz[short_enough, :]
y_train_nz = y_train_nz[short_enough, :]
filtered_indices2 = filtered_indices[short_enough]

In [6]:
# Inverse lookup: value stored in filtered_indices2 -> its position in
# filtered_indices2.
reverse_idx = {val: idx for idx, val in enumerate(filtered_indices2)}

In [7]:
# NOTE(review): gnd_test is produced by OneHotEncoder.transform in the
# "Create testing data" cell and is later sliced and converted with
# .toarray(), so it is presumably a SciPy sparse matrix; np.asarray on such
# an object likely yields a 0-d object array rather than a dense copy —
# confirm. y_test_nz is reassigned from gnd_test[indices, :] in the test
# filtering cell further down, so this value appears to be unused.
y_test_nz = np.asarray(gnd_test)

In [None]:
# Bare expression: shows x_test_nz as the cell output.
# NOTE(review): in this notebook x_test_nz is first assigned in the cell that
# follows this one, so on a fresh top-to-bottom run this line would raise
# NameError — confirm whether this cell is still needed.
x_test_nz

In [8]:
import numpy as np


def _doc_lengths(mat):
    """Return, for each row of `mat`, the number of strictly positive entries.

    The comparison and the sum both run on the sparse matrix, so no dense
    copy of the documents-by-vocabulary matrix is created.
    """
    return np.asarray((mat > 0).sum(axis=1)).ravel()


# The training split is sorted and filtered in the preceding cells; this cell
# applies the same three steps to the test split.

# Sort by doc length
X_len = _doc_lengths(x_test)
indices = np.argsort(X_len)
x_test_nz = x_test[indices, :]

y_test_nz = gnd_test[indices, :]

# Remove too short document
X_len = _doc_lengths(x_test_nz)
long_enough = X_len >= 10
x_test_nz = x_test_nz[long_enough, :]
y_test_nz = y_test_nz[long_enough, :]

# Remove too long document
X_len = _doc_lengths(x_test_nz)
short_enough = X_len <= 500
x_test_nz = x_test_nz[short_enough, :]
y_test_nz = y_test_nz[short_enough, :]

# Report how many documents / label rows / features survive in each split.
print('after remove empty documents...')
print('num train:{} num tag: {} num features:{}'.format(x_train_nz.shape[0],
                                                        y_train_nz.shape[0],
                                                        x_train_nz.shape[1]))
print('num test:{} num tag: {} num features:{}'.format(x_test_nz.shape[0],
                                                       y_test_nz.shape[0],
                                                       x_test_nz.shape[1]))

after remove empty documents...
num train:11016 num tag: 11016 num features:10000
num test:7335 num tag: 7335 num features:10000


In [9]:
# Shapes of the filtered training features and of their one-hot labels.
for filtered in (x_train_nz, y_train_nz):
    print(filtered.shape)

(11016, 10000)
(11016, 20)


In [10]:
import scipy.io

dataset = 'ng20'
scheme = 'tfidf'

# Write the filtered train/test features (kept sparse) and their one-hot
# labels (converted to dense arrays) into a single .mat file.
mat_path = 'dataset/{}/{}.{}.mat'.format(dataset, dataset, scheme)
payload = {
    'train': x_train_nz,
    'test': x_test_nz,
    'gnd_train': y_train_nz.toarray(),
    'gnd_test': y_test_nz.toarray(),
}
scipy.io.savemat(mat_path, mdict=payload)

In [11]:
# save vocabs
import pickle

# Persist the fitted vectorizer's vocabulary_ mapping next to the dataset.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes.
vocab_path = "dataset/{}/{}.{}.vocabs.p".format(dataset, dataset, scheme)
with open(vocab_path, "wb") as vocab_file:
    pickle.dump(transformer.vocabulary_, vocab_file)

# Create CV and Test set

In [12]:
import scipy.io
import numpy as np

# Load the tf-idf dataset back from its .mat file.
# The raw-count (tf) counterpart is left disabled:
#tf_ds = scipy.io.loadmat('dataset/{}/{}.tf.mat'.format(dataset, dataset))
tfidf_path = 'dataset/{}/{}.tfidf.mat'.format(dataset, dataset)
tfidf_ds = scipy.io.loadmat(tfidf_path)

In [13]:
# One-hot label matrices stored in the loaded dataset.
gnd_train = tfidf_ds['gnd_train']
gnd_test = tfidf_ds['gnd_test']

# Column sums give the per-class counts of the test labels; dividing by their
# total turns the counts into a distribution.
label_counts = gnd_test.sum(axis=0)
label_distrib = label_counts / label_counts.sum()

In [14]:
def _kl_divergence(p, q):
    """Return KL(p || q) for two discrete distributions given as 1-D arrays.

    Entries where p is zero contribute nothing to the sum, so they are left
    out instead of producing 0 * log(0) = nan.
    """
    support = p > 0
    return np.sum(p[support] * (np.log(p[support]) - np.log(q[support])))


# Fixed seed so the CV/test split is the same on every run.
split_seed = 0
n_samples = gnd_test.shape[0]
indices = np.random.RandomState(split_seed).permutation(n_samples)

# First half of the shuffled rows becomes the CV set, the rest the test set.
n_cv = n_samples // 2
n_test = n_samples - n_cv

y_cv = gnd_test[indices[:n_cv], :]
y_cv_counts = np.sum(y_cv, axis=0)
y_cv_distrib = y_cv_counts / np.sum(y_cv_counts)

y_test = gnd_test[indices[n_cv:], :]
y_test_counts = np.sum(y_test, axis=0)
y_test_distrib = y_test_counts / np.sum(y_test_counts)

# KL divergence between label_distrib (the label distribution of the whole,
# unsplit gnd_test) and the label distribution of each half; values near zero
# mean the split kept the class balance.
cv_kl = _kl_divergence(label_distrib, y_cv_distrib)
test_kl = _kl_divergence(label_distrib, y_test_distrib)

print(cv_kl)
print(test_kl)

0.0013775955152313761
0.0013671041113509923


In [15]:
# Row selections for the two halves of the shuffled test rows.
cv_rows = indices[:n_cv]
test_rows = indices[n_cv:]

# Disabled: the same split applied to the raw-count (tf) dataset.
# x_tf_cv = tf_ds['test'][indices[:n_cv],:]
# x_tf_test = tf_ds['test'][indices[n_cv:],:]
# y_tf_cv = tf_ds['gnd_test'][indices[:n_cv],:]
# y_tf_test = tf_ds['gnd_test'][indices[n_cv:],:]

# Features and labels are indexed with the same rows so they stay aligned.
tfidf_features = tfidf_ds['test']
tfidf_labels = tfidf_ds['gnd_test']

x_tfidf_cv = tfidf_features[cv_rows, :]
x_tfidf_test = tfidf_features[test_rows, :]
y_tfidf_cv = tfidf_labels[cv_rows, :]
y_tfidf_test = tfidf_labels[test_rows, :]

In [None]:
# Print the shapes of the CV/test splits for both feature schemes.
# NOTE(review): the x_tf_* / y_tf_* names are only assigned by the
# commented-out lines in the split cell above, so with that code disabled the
# first print raises NameError — confirm whether the tf lines should be
# re-enabled or these four prints removed.
print(x_tf_cv.shape)
print(x_tf_test.shape)
print(y_tf_cv.shape)
print(y_tf_test.shape)
# Shapes of the tf-idf splits.
print(x_tfidf_cv.shape)
print(x_tfidf_test.shape)
print(y_tfidf_cv.shape)
print(y_tfidf_test.shape)

# Save the dataset

In [None]:
# Training features/labels are taken unchanged from the loaded tf dataset.
x_tf_train = tf_ds['train']
y_tf_train = tf_ds['gnd_train']

# Store train, test and cv splits (features and labels) in one .mat file.
tf_mat_path = 'dataset/{}/{}.tf.mat'.format(dataset, dataset)
tf_splits = {
    'train': x_tf_train,
    'test': x_tf_test,
    'cv': x_tf_cv,
    'gnd_train': y_tf_train,
    'gnd_test': y_tf_test,
    'gnd_cv': y_tf_cv,
}
scipy.io.savemat(tf_mat_path, mdict=tf_splits)

In [16]:
# Training features/labels are taken unchanged from the loaded tf-idf dataset.
x_tfidf_train = tfidf_ds['train']
y_tfidf_train = tfidf_ds['gnd_train']

# Store train, test and cv splits (features and labels) in one .mat file.
# Note: this is the same path tfidf_ds was loaded from, so the stored 'test'
# entry is replaced by the test half of the split.
tfidf_mat_path = 'dataset/{}/{}.tfidf.mat'.format(dataset, dataset)
tfidf_splits = {
    'train': x_tfidf_train,
    'test': x_tfidf_test,
    'cv': x_tfidf_cv,
    'gnd_train': y_tfidf_train,
    'gnd_test': y_tfidf_test,
    'gnd_cv': y_tfidf_cv,
}
scipy.io.savemat(tfidf_mat_path, mdict=tfidf_splits)

In [17]:
# Save indices
# Keeps the row permutation used for the CV/test split alongside the dataset.
indices_path = 'dataset/{}/{}.indices.mat'.format(dataset, dataset)
scipy.io.savemat(indices_path, mdict={'indices': indices})

# Create a binary dataset

In [None]:
import scipy.io
import numpy as np

# Load the raw-count (tf) dataset from its .mat file.
tf_path = 'dataset/{}/{}.tf.mat'.format(dataset, dataset)
tf_ds = scipy.io.loadmat(tf_path)

In [None]:
# Feature matrices of the train, test and cv splits of the tf dataset.
x_train, x_test, x_cv = [tf_ds[split] for split in ('train', 'test', 'cv')]

In [None]:
# Binarise each split: an entry becomes 1.0 where the original value is
# strictly positive and 0.0 elsewhere.
x_b_train = (x_train > 0).astype(float)
x_b_test = (x_test > 0).astype(float)
x_b_cv = (x_cv > 0).astype(float)

In [None]:
# Label matrices of the train, test and cv splits of the tf dataset.
y_train = tf_ds['gnd_train']
y_test = tf_ds['gnd_test']
# The cv labels live under 'gnd_cv'; reading 'gnd_train' here would pair the
# cv features with the training labels.
y_cv = tf_ds['gnd_cv']

In [None]:
# Store the binarised features together with the labels of the dataset they
# were derived from. The labels are read straight from tf_ds (the source of
# x_b_train / x_b_test / x_b_cv) rather than from the y_tfidf_* variables of
# the tf-idf section, so rows and labels are guaranteed to come from the same
# split and the cell does not depend on that section having been run.
binary_mat_path = 'dataset/{}/{}.b.mat'.format(dataset, dataset)
scipy.io.savemat(binary_mat_path, mdict={'train': x_b_train,
                                         'test': x_b_test,
                                         'cv': x_b_cv,
                                         'gnd_train': tf_ds['gnd_train'],
                                         'gnd_test': tf_ds['gnd_test'],
                                         'gnd_cv': tf_ds['gnd_cv']})