In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

##################################################################################################

# Current user's home directory as a plain string; later cells pass it to
# os.path.join when building dataset paths.
home = str(Path.home())

In [2]:
from sklearn.datasets import fetch_rcv1
# Load the RCV1 corpus through scikit-learn's fetcher. Later cells read its
# .data, .target and .sample_id attributes.
rcv1 = fetch_rcv1()

In [3]:
# How many label columns / vocabulary columns to keep when subsetting RCV1.
# NOTE(review): the same two assignments are repeated at the top of the next cell.
num_labels = 40
num_vocabs = 15000

In [4]:
num_labels = 40
num_vocabs = 15000


def _top_columns(matrix, k):
    """Return the indices of the ``k`` columns of ``matrix`` with the largest sums.

    The column totals are flattened to a 1-D float array before being negated,
    so that (a) the negation cannot wrap around if the totals come back as an
    unsigned integer type, and (b) the result stays 1-D even when ``k == 1``
    (a 0-d index would silently drop a dimension when used for column
    selection).
    """
    totals = np.asarray(matrix.sum(axis=0), dtype=np.float64).ravel()
    return np.argsort(-totals)[:k]


# keep the label columns with the largest column sums
feature_indices = _top_columns(rcv1.target, num_labels)
targets = rcv1.target[:, feature_indices]

# keep the vocabulary columns with the largest column sums
word_indices = _top_columns(rcv1.data, num_vocabs)
documents = rcv1.data[:, word_indices]

# turn each matrix into a list of its rows so every row can sit in one DataFrame cell
targets = [row for row in targets]
documents = [row for row in documents]

df = pd.DataFrame({'doc_id': rcv1.sample_id.tolist(), 'bow': documents, 'label': targets})
df.set_index('doc_id', inplace=True)
print('total docs: {}'.format(len(df)))

In [33]:
# remove any empty labels
def count_num_tags(target):
    """Return the sum of all entries of ``target``.

    For a 0/1 label vector this is the number of labels that are set.
    """
    total = target.sum()
    return total

# keep only the rows whose label vector has a positive sum
has_label = df['label'].apply(count_num_tags) > 0
df = df[has_label]
print('after filter: total docs: {}'.format(len(df)))

def get_num_word(bow):
    """Return the number of non-zero entries of ``bow``.

    ``bow`` must provide a ``count_nonzero()`` method (e.g. a SciPy sparse
    matrix).
    """
    nonzero_total = bow.count_nonzero()
    return nonzero_total

# keep only the rows whose bag-of-words has at least one non-zero entry
has_words = df['bow'].apply(get_num_word) > 0
df = df[has_words]
print('after filter: total docs: {}'.format(len(df)))

after filter: total docs: 804414


after filter: total docs: 804414


In [40]:
# Drop very short / very long documents, then shuffle the remaining rows.
#
# These two switches are otherwise only assigned in a much later cell of this
# notebook, so a fresh top-to-bottom run would raise NameError here. Define
# them where they are first used (same values as the later assignment).
remove_short_document = True
remove_long_document = True

if remove_short_document:
    print('remove any short document that has less than 5 words.')
    # NOTE(review): the comparison keeps documents with MORE than 5 non-zero
    # terms, so documents with exactly 5 are dropped too; the message above
    # says "less than 5". Confirm which one is intended.
    df = df[df.bow.apply(get_num_word) > 5]
    print('num docs: {}'.format(len(df)))

if remove_long_document:
    print('remove any long document that has more than 500 words.')
    df = df[df.bow.apply(get_num_word) <= 500]
    print('num docs: {}'.format(len(df)))

# Shuffle the rows. No seed is set, so the order differs between runs.
df = df.reindex(np.random.permutation(df.index))

remove any short document that has less than 5 words.
num docs: 804393
remove any long document that has more than 500 words.
num docs: 804279


In [55]:
num_train = 100000
num_test = 20000

# Draw the train and held-out rows in a single sample so the splits cannot overlap.
sampled_df = df.sample(num_train + num_test)
train_df = sampled_df.iloc[:num_train]

# Split the held-out rows in two: first half -> cv, second half -> test.
heldout_df = sampled_df.iloc[num_train:]
cv_df = heldout_df.iloc[:num_test // 2]
test_df = heldout_df.iloc[num_test // 2:]

# The original print referenced df_train / df_test / df_cv, which are never
# defined in this notebook; the frames are named train_df / test_df / cv_df.
print('num train: {} num test: {} num cv: {}'.format(len(train_df), len(test_df), len(cv_df)))

In [59]:
# save the dataframes
# The path has no format placeholder, and `args` is not defined in this
# notebook, so the former `.format(args.dataset)` call could only raise
# NameError. Use the literal path directly.
save_dir = '../dataset/rcv1'
os.makedirs(save_dir, exist_ok=True)  # to_pickle does not create missing directories
print('save tfidf dataset to {} ...'.format(save_dir))

train_df.to_pickle(os.path.join(save_dir, 'train.tf.df.pkl'))
test_df.to_pickle(os.path.join(save_dir, 'test.tf.df.pkl'))
cv_df.to_pickle(os.path.join(save_dir, 'cv.tf.df.pkl'))

num train: 100000 num test: 10000 num cv: 10000


In [None]:
from scipy.sparse import csr_matrix, vstack

data_dir = os.path.join(home, 'datasets/tmc')

# Each CSV line is a comma-separated list of integers; (v + 1) // 2 maps
# -1 -> 0 and 1 -> 1 before the rows are packed into a sparse matrix.
train_label_path = os.path.join(data_dir, 'TMC_TrainCategoryMatrix.csv')
with open(train_label_path) as handle:
    train_rows = []
    for line in handle:
        train_rows.append([(int(v) + 1) // 2 for v in line.strip().split(',')])
y_train = csr_matrix(np.array(train_rows))

test_label_path = os.path.join(data_dir, 'TMC_TestTruth.csv')
with open(test_label_path) as handle:
    test_rows = []
    for line in handle:
        test_rows.append([(int(v) + 1) // 2 for v in line.strip().split(',')])
y_test = csr_matrix(np.array(test_rows))


In [None]:
# Quick size check of the training documents.
# NOTE(review): train_docs is first assigned in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
len(train_docs)

In [None]:
# Display the sparse training label matrix built from TMC_TrainCategoryMatrix.csv.
y_train

In [None]:
def _read_label_matrix(path):
    """Parse a comma-separated file of integer labels into a 2-D NumPy array.

    Every value ``v`` is mapped to ``(v + 1) / 2`` with true division, so the
    returned array holds floats (e.g. -1 -> 0.0 and 1 -> 1.0).
    """
    with open(path) as label_data:
        rows = [[(int(v) + 1) / 2 for v in line.strip().split(',')]
                for line in label_data]
    return np.array(rows)


gnd_train = _read_label_matrix('dataset/tmc/TMC_TrainCategoryMatrix.csv')
print(gnd_train.shape)

gnd_test = _read_label_matrix('dataset/tmc/TMC_TestTruth.csv')
print(gnd_test.shape)

In [None]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

import os
from pathlib import Path

################################################################################################

# Current user's home directory as a string; used to build dataset paths.
home = str(Path.home())

################################################################################################
# Notebook-wide switches read by later cells:
#   use_sklearn_data      - load 20 Newsgroups via fetch_20newsgroups instead of the raw stemmed text files
#   use_stemmer           - run the Porter stemmer over the documents before vectorising
#   remove_short_document - apply the "> 5" document-length filter
#   remove_long_document  - apply the "<= 500" document-length filter
use_sklearn_data = True
use_stemmer = False
remove_short_document = True
remove_long_document = True

In [None]:
def _read_tagged_docs(path):
    """Read a whitespace-tokenised corpus file with one document per line.

    The first token of each line is taken as the category tag; the remaining
    tokens are re-joined with single spaces to form the document text. Blank
    lines are skipped (they would otherwise raise IndexError on ``tokens[0]``).

    Returns:
        (docs, tags): two parallel lists of strings.
    """
    docs, tags = [], []
    with open(path) as raw_text:
        for line in raw_text:
            tokens = line.strip().split()
            if not tokens:
                continue
            tags.append(tokens[0])
            docs.append(' '.join(tokens[1:]))
    return docs, tags


if use_sklearn_data:
    train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

    train_docs = train.data
    train_tags = train.target
    test_docs = test.data
    test_tags = test.target
else:
    raw_train_fn = '../dataset/raw/20ng-train-stemmed.txt'
    train_docs, train_tags = _read_tagged_docs(raw_train_fn)

    raw_test_fn = '../dataset/raw/20ng-test-stemmed.txt'
    test_docs, test_tags = _read_tagged_docs(raw_test_fn)

    # Sort the tag names so the tag -> integer id mapping is the same on every
    # run. Iterating a bare set of strings depends on hash randomisation, which
    # made the saved label ids differ between kernel sessions.
    category = {cate: i for i, cate in enumerate(sorted(set(train_tags)))}
    train_tags = [category[cate] for cate in train_tags]
    test_tags = [category[cate] for cate in test_tags]

In [None]:
# Directory (under the user's home) that the next cell reads train.csv / test.csv from.
root_dir = os.path.join(home, 'datasets/dbpedia')

In [None]:
# Both CSVs are read without a header row and given these three column names.
dbpedia_columns = ['label', 'title', 'body']

# Training split: documents come from the body column, labels are shifted down by one.
train_fn = os.path.join(root_dir, 'train.csv')
df = pd.read_csv(train_fn, header=None)
df.columns = dbpedia_columns
train_tags = list(df['label'] - 1)
train_docs = list(df['body'])

# Test split: same layout. `df` is rebound, so it ends up holding the test frame.
test_fn = os.path.join(root_dir, 'test.csv')
df = pd.read_csv(test_fn, header=None)
df.columns = dbpedia_columns
test_tags = list(df['label'] - 1)
test_docs = list(df['body'])

In [None]:
def stem_docs(stemmer, docs):
    """Return a new list with every whitespace-separated token of each document stemmed.

    Args:
        stemmer: object exposing ``stem(word)``.
        docs: iterable of document strings.

    Returns:
        A list of strings, one per input document, whose tokens are the stemmed
        tokens joined by single spaces.
    """
    # Use the arguments that were passed in. The previous body looped over the
    # global train_docs and called the global `ps`, so stemming test_docs
    # returned stemmed *training* documents instead.
    return [' '.join(stemmer.stem(w) for w in doc.split()) for doc in docs]


if use_stemmer:
    ps = PorterStemmer()
    train_docs = stem_docs(ps, train_docs)
    test_docs = stem_docs(ps, test_docs)

In [None]:
################################################################################################
# Vectoriser settings are kept in one place so they are easy to read and tweak.
vectorizer_options = dict(stop_words='english', max_features=10000, max_df=0.8, min_df=3)
count_vect = CountVectorizer(**vectorizer_options)

# The vocabulary is fitted on the training documents only and then reused for the test documents.
train_tf = count_vect.fit_transform(train_docs)
test_tf = count_vect.transform(test_docs)

def create_dataframe(doc_tf, doc_targets):
    """Build a DataFrame indexed by ``doc_id`` with one row per item of ``doc_tf``.

    Args:
        doc_tf: iterable of per-document bag-of-words items (e.g. the rows
            yielded when iterating a sparse matrix).
        doc_targets: sequence indexable by position; ``doc_targets[i]`` is the
            label of the i-th item of ``doc_tf``.

    Returns:
        DataFrame with columns ``bow`` and ``label``; the index is named
        ``doc_id`` and holds the positions 0..n-1.
    """
    records = [
        {'doc_id': position, 'bow': row, 'label': doc_targets[position]}
        for position, row in enumerate(doc_tf)
    ]
    frame = pd.DataFrame.from_dict(records)
    return frame.set_index('doc_id')

# Wrap each term-frequency matrix and its labels into a doc_id-indexed frame (train first, then test).
train_df, test_df = (
    create_dataframe(tf_matrix, tags)
    for tf_matrix, tags in ((train_tf, train_tags), (test_tf, test_tags))
)

def get_doc_length(doc_bow):
    """Return the sum of all entries of ``doc_bow``.

    For a term-count row this is the total number of counted tokens.
    """
    token_total = doc_bow.sum()
    return token_total

# def get_num_word(doc_bow):
#     return doc_bow.nonzero()[1].shape[0]

# Drop documents whose bag-of-words sums to zero.
train_has_tokens = train_df['bow'].apply(get_doc_length) > 0
test_has_tokens = test_df['bow'].apply(get_doc_length) > 0
train_df = train_df[train_has_tokens]
test_df = test_df[test_has_tokens]

print('num train: {} num test: {}'.format(len(train_df), len(test_df)))

if remove_short_document:
    print('remove any short document that has less than 5 words.')
    # NOTE(review): the mask keeps lengths strictly greater than 5, so a
    # document of exactly 5 is removed as well, unlike what the message says.
    train_long_enough = train_df['bow'].apply(get_doc_length) > 5
    test_long_enough = test_df['bow'].apply(get_doc_length) > 5
    train_df = train_df[train_long_enough]
    test_df = test_df[test_long_enough]
    print('num train: {} num test: {}'.format(len(train_df), len(test_df)))

if remove_long_document:
    print('remove any long document that has more than 500 words.')
    train_short_enough = train_df['bow'].apply(get_doc_length) <= 500
    test_short_enough = test_df['bow'].apply(get_doc_length) <= 500
    train_df = train_df[train_short_enough]
    test_df = test_df[test_short_enough]
    print('num train: {} num test: {}'.format(len(train_df), len(test_df)))

In [None]:
# Eyeball a random sample of 1000 test rows.
# NOTE(review): without replace=True, sample() raises if test_df has fewer than
# 1000 rows, and displaying 1000 rows is heavy; consider .sample(5) or .head().
test_df.sample(n=1000)

In [None]:
# Split the held-out frame into a cv part and a test part.
num_train = len(train_df)
total_heldout = len(test_df)
num_test = total_heldout // 2
num_cv = total_heldout - num_test  # cv receives the extra row when the total is odd

print('train: {} test: {} cv: {}'.format(num_train, num_test, num_cv))

# Shuffle once, then cut: the first num_cv rows become cv, the remainder stays as test.
test_df = shuffle(test_df)
cv_df, test_df = test_df.iloc[:num_cv], test_df.iloc[num_cv:]

In [None]:
# save the dataframes
ng20_dir = '../dataset/ng20'
# Neither to_pickle nor open(..., 'wb') creates missing directories, so make
# sure the output directory exists before writing.
os.makedirs(ng20_dir, exist_ok=True)

print('save tf dataset ...')
train_df.to_pickle(os.path.join(ng20_dir, 'train.tf.df.pkl'))
test_df.to_pickle(os.path.join(ng20_dir, 'test.tf.df.pkl'))
cv_df.to_pickle(os.path.join(ng20_dir, 'cv.tf.df.pkl'))

# save vocab (the fitted term -> column index mapping of the vectoriser)
with open(os.path.join(ng20_dir, 'vocab.pkl'), 'wb') as handle:
    pickle.dump(count_vect.vocabulary_, handle, protocol=pickle.HIGHEST_PROTOCOL)

## TFIDF format

In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix, vstack

In [None]:
dataset = 'ng20'
data_dir = '../dataset/{}'.format(dataset)

# Load the three term-frequency frames saved earlier, in train / test / cv order.
train_df, test_df, cv_df = (
    pd.read_pickle(os.path.join(data_dir, filename))
    for filename in ('train.tf.df.pkl', 'test.tf.df.pkl', 'cv.tf.df.pkl')
)


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix, vstack

def convert_to_dataframe(source_df, doc_id_list, bow_list, label_list):
    """Assemble a DataFrame indexed by ``doc_id`` from three parallel lists.

    Args:
        source_df: accepted for interface compatibility; not read by this function.
        doc_id_list: values for the ``doc_id`` index.
        bow_list: values for the ``bow`` column.
        label_list: values for the ``label`` column.

    Returns:
        DataFrame with columns ``bow`` and ``label`` and an index named ``doc_id``.
    """
    columns = {'doc_id': doc_id_list, 'bow': bow_list, 'label': label_list}
    return pd.DataFrame(columns).set_index('doc_id')

# Stack the per-document rows back into one sparse matrix per split.
train_tf = vstack(list(train_df.bow))
test_tf = vstack(list(test_df.bow))
cv_tf = vstack(list(cv_df.bow))

# Fit the transformer on the training split only and reuse it for test / cv.
transformer = TfidfTransformer(sublinear_tf=True)
train_tfidf = transformer.fit_transform(train_tf)
test_tfidf = transformer.transform(test_tf)
cv_tfidf = transformer.transform(cv_tf)

# Re-wrap the transformed rows, keeping each split's original doc_id index and labels.
train_tfidf_df = convert_to_dataframe(train_df, list(train_df.index), [bow for bow in train_tfidf], list(train_df.label))
test_tfidf_df = convert_to_dataframe(test_df, list(test_df.index), [bow for bow in test_tfidf], list(test_df.label))
cv_tfidf_df = convert_to_dataframe(cv_df, list(cv_df.index), [bow for bow in cv_tfidf], list(cv_df.label))

# Write next to the TF pickles these frames were loaded from. The output path
# used to be hard-coded to '../dataset/ng20', so changing `dataset` in the
# loading cell would have overwritten the ng20 files with another corpus.
train_tfidf_df.to_pickle(os.path.join(data_dir, 'train.tfidf.df.pkl'))
test_tfidf_df.to_pickle(os.path.join(data_dir, 'test.tfidf.df.pkl'))
cv_tfidf_df.to_pickle(os.path.join(data_dir, 'cv.tfidf.df.pkl'))

In [None]:
# Binary format
from scipy.sparse import csr_matrix, vstack

def create_dataframe(doc_tf, doc_targets):
    """Build a DataFrame indexed by ``doc_id`` with one row per item of ``doc_tf``.

    Args:
        doc_tf: iterable of per-document bag-of-words items (e.g. the rows
            yielded when iterating a sparse matrix).
        doc_targets: sequence indexable by position; ``doc_targets[i]`` is the
            label of the i-th item of ``doc_tf``.

    Returns:
        DataFrame with columns ``bow`` and ``label``; the index is named
        ``doc_id`` and holds the positions 0..n-1.
    """
    records = []
    for position, row in enumerate(doc_tf):
        records.append({'doc_id': position, 'bow': row, 'label': doc_targets[position]})
    frame = pd.DataFrame.from_dict(records)
    return frame.set_index('doc_id')

def create_bin_matrix(doc_tf_df):
    """Binarise the ``bow`` column of ``doc_tf_df`` into one stacked sparse matrix.

    Each row's ``bow`` is densified, entries strictly greater than zero become
    1.0 and everything else 0.0, and the resulting rows are stacked vertically
    in the frame's row order.

    Args:
        doc_tf_df: DataFrame with a ``bow`` column of sparse single-row matrices.

    Returns:
        Sparse float64 matrix with one row per row of ``doc_tf_df``.
    """
    doc_bin = []
    for _, row in doc_tf_df.iterrows():
        # np.float was removed in NumPy 1.24; np.float64 is the dtype it aliased.
        presence = (row.bow.toarray().squeeze() > 0).astype(np.float64)
        doc_bin.append(csr_matrix(presence))
    return vstack(doc_bin)

# Binarise each split, then wrap the result together with that split's labels.
# NOTE(review): create_dataframe numbers doc_id from 0, so these frames do not
# keep the index of train_df / test_df / cv_df (the TF-IDF frames do) - confirm
# this is intended.
train_bin = create_bin_matrix(train_df)
test_bin = create_bin_matrix(test_df)
cv_bin = create_bin_matrix(cv_df)

train_bin_df = create_dataframe(train_bin, list(train_df.label))
test_bin_df = create_dataframe(test_bin, list(test_df.label))
cv_bin_df = create_dataframe(cv_bin, list(cv_df.label))

# save the dataframes
# NOTE(review): the destination is hard-coded to the ng20 directory regardless
# of which dataset the frames were loaded from.
train_bin_df.to_pickle('../dataset/ng20/train.bin.df.pkl')
test_bin_df.to_pickle('../dataset/ng20/test.bin.df.pkl')
cv_bin_df.to_pickle('../dataset/ng20/cv.bin.df.pkl')