In [3]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

import os
from pathlib import Path

################################################################################################

# Current user's home directory as a plain string; used below to build dataset paths.
home = str(Path.home())

################################################################################################
# Preprocessing switches read by the cells below.
# True: fetch 20 Newsgroups through scikit-learn; False: read the pre-stemmed text files.
use_sklearn_data = True
# True: stem every whitespace-separated token with PorterStemmer before vectorizing.
use_stemmer = False
# True: apply the short-document length filter after vectorizing.
remove_short_document = True
# True: apply the long-document length filter (more than 500 tokens) after vectorizing.
remove_long_document = True

In [None]:
def _read_tagged_docs(path):
    """Parse a file of '<tag> <token> <token> ...' lines.

    Args:
        path: Path of the text file to read; one document per line, with the
            first whitespace-separated token being the document's tag.

    Returns:
        (docs, tags): two parallel lists. ``docs[i]`` is the remaining tokens of
        line i joined by single spaces, ``tags[i]`` is its raw tag string.
    """
    docs, tags = [], []
    with open(path) as raw_text:
        for line in raw_text:
            tokens = line.strip().split()
            if not tokens:
                # Blank line: there is no tag to read, so skip it instead of
                # failing with IndexError on tokens[0].
                continue
            tags.append(tokens[0])
            docs.append(' '.join(tokens[1:]))
    return docs, tags


if use_sklearn_data:
    # Headers, footers and quoted replies are stripped by scikit-learn on load.
    train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

    train_docs = train.data
    train_tags = train.target
    test_docs = test.data
    test_tags = test.target
else:
    raw_train_fn = '../dataset/raw/20ng-train-stemmed.txt'
    train_docs, train_tags = _read_tagged_docs(raw_train_fn)

    raw_test_fn = '../dataset/raw/20ng-test-stemmed.txt'
    test_docs, test_tags = _read_tagged_docs(raw_test_fn)

    # Map tag strings to integer ids. Sorting makes the mapping identical on
    # every run; iterating a bare set() of strings can change order between
    # interpreter sessions, which would silently relabel the saved data.
    # A test tag that never occurs in the training file still raises KeyError.
    category = {cate: i for i, cate in enumerate(sorted(set(train_tags)))}
    train_tags = [category[cate] for cate in train_tags]
    test_tags = [category[cate] for cate in test_tags]

In [4]:
# Directory under the user's home from which the DBpedia train.csv / test.csv are read.
root_dir = os.path.join(home, 'datasets/dbpedia')

In [5]:
# Load the DBpedia splits. The CSVs are read without a header row and their
# three columns are named label / title / body. Only the body text is kept as
# the document; labels are decremented by one (presumably 1-based in the files).
# NOTE(review): this rebinds train_docs / train_tags / test_docs / test_tags,
# replacing whatever the 20 Newsgroups cell produced -- confirm which dataset
# is meant to flow into the cells below.
train_fn = os.path.join(root_dir, 'train.csv')
test_fn = os.path.join(root_dir, 'test.csv')

df = pd.read_csv(train_fn, header=None)
df.columns = ['label', 'title', 'body']
train_docs, train_tags = list(df['body']), list(df['label'] - 1)

df = pd.read_csv(test_fn, header=None)
df.columns = ['label', 'title', 'body']
test_docs, test_tags = list(df['body']), list(df['label'] - 1)

In [None]:
if use_stemmer:
    ps = PorterStemmer()

    def stem_docs(stemmer, docs):
        """Stem every whitespace-separated token of every document.

        Args:
            stemmer: Object exposing ``stem(word) -> str``.
            docs: Iterable of document strings.

        Returns:
            List with one string per input document: its stemmed tokens joined
            by single spaces, in the original order.
        """
        # Use the arguments, not the notebook globals: the previous body looped
        # over train_docs with the global stemmer, so calling it on test_docs
        # returned the stemmed *training* documents.
        return [' '.join(stemmer.stem(w) for w in doc.split()) for doc in docs]

    train_docs = stem_docs(ps, train_docs)
    test_docs = stem_docs(ps, test_docs)

In [6]:
################################################################################################
# Bag-of-words term counts. The vocabulary is fitted on the training documents
# only; the test documents are then mapped onto that same vocabulary.
count_vect = CountVectorizer(stop_words='english', max_features=10000, max_df=0.8, min_df=3)
train_tf = count_vect.fit_transform(train_docs)
test_tf = count_vect.transform(test_docs)

def create_dataframe(doc_tf, doc_targets):
    """Build a per-document DataFrame indexed by ``doc_id``.

    Args:
        doc_tf: Iterable of per-document bag-of-words rows. Each item is stored
            unchanged in the ``bow`` column.
        doc_targets: Indexable collection of labels; ``doc_targets[i]`` is the
            label of the i-th item of ``doc_tf``.

    Returns:
        DataFrame with columns ``bow`` and ``label`` whose index, named
        ``doc_id``, is the position of each item in ``doc_tf`` (0, 1, 2, ...).

    Raises:
        IndexError: If ``doc_targets`` is a list/array shorter than ``doc_tf``.
    """
    records = [
        {'doc_id': i, 'bow': bow, 'label': doc_targets[i]}
        for i, bow in enumerate(doc_tf)
    ]
    # Naming the columns up front keeps the schema when there are no documents;
    # without it an empty input has no 'doc_id' column and set_index raises KeyError.
    df = pd.DataFrame(records, columns=['doc_id', 'bow', 'label'])
    return df.set_index('doc_id')

# Wrap each split's count matrix in a DataFrame: one row per document, indexed
# by the document's position within its split.
train_df = create_dataframe(train_tf, train_tags)
test_df = create_dataframe(test_tf, test_tags)

def get_doc_length(doc_bow):
    """Return the sum of all entries of ``doc_bow``, as given by its ``.sum()`` method."""
    total_count = doc_bow.sum()
    return total_count

# def get_num_word(doc_bow):
#     return doc_bow.nonzero()[1].shape[0]

# Length thresholds, measured as the sum of a document's bag-of-words row.
MIN_DOC_LENGTH = 5    # a document is kept only if its length is MORE than this
MAX_DOC_LENGTH = 500  # a document is kept only if its length is AT MOST this

# Document lengths are computed once per split; every filter below reuses them
# instead of re-running .apply over all rows at each stage.
train_len = train_df.bow.apply(get_doc_length)
test_len = test_df.bow.apply(get_doc_length)

# remove an empty document
train_keep = train_len > 0
test_keep = test_len > 0

print('num train: {} num test: {}'.format(train_keep.sum(), test_keep.sum()))

if remove_short_document:
    # The condition drops documents with MIN_DOC_LENGTH words or fewer. The
    # message is built from the same constant so the two cannot drift apart
    # (it used to say "less than 5" while the filter also removed length 5).
    print('remove any short document that has {} words or fewer.'.format(MIN_DOC_LENGTH))
    train_keep &= train_len > MIN_DOC_LENGTH
    test_keep &= test_len > MIN_DOC_LENGTH
    print('num train: {} num test: {}'.format(train_keep.sum(), test_keep.sum()))

if remove_long_document:
    print('remove any long document that has more than {} words.'.format(MAX_DOC_LENGTH))
    train_keep &= train_len <= MAX_DOC_LENGTH
    test_keep &= test_len <= MAX_DOC_LENGTH
    print('num train: {} num test: {}'.format(train_keep.sum(), test_keep.sum()))

# Apply the accumulated masks once. Each mask was derived from its own frame,
# so it shares that frame's index and no realignment takes place.
train_df = train_df[train_keep]
test_df = test_df[test_keep]

num train: 559967 num test: 69995
remove any short document that has less than 5 words.
num train: 526319 num test: 65766
remove any long document that has more than 500 words.
num train: 526317 num test: 65766


In [9]:
# Spot-check: display a random sample of the filtered test frame.
# No random_state is passed, so the sampled rows differ from run to run.
test_df.sample(n=1000)

Unnamed: 0_level_0,bow,label
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7955,"(0, 1569)\t1\n (0, 1912)\t1\n (0, 2123)\t1...",1
4389,"(0, 561)\t1\n (0, 574)\t1\n (0, 886)\t1\n ...",0
41776,"(0, 377)\t1\n (0, 428)\t1\n (0, 1944)\t1\n...",8
54682,"(0, 1537)\t1\n (0, 1723)\t1\n (0, 2077)\t1...",10
25325,"(0, 88)\t1\n (0, 286)\t1\n (0, 1705)\t2\n ...",5
60785,"(0, 357)\t1\n (0, 590)\t3\n (0, 888)\t3\n ...",12
45945,"(0, 217)\t1\n (0, 359)\t2\n (0, 2458)\t1\n...",9
65438,"(0, 351)\t1\n (0, 949)\t1\n (0, 1135)\t1\n...",13
66751,"(0, 276)\t1\n (0, 339)\t1\n (0, 2041)\t1\n...",13
48434,"(0, 1749)\t1\n (0, 2401)\t1\n (0, 2434)\t1...",9


In [None]:
# split test and cv
# The held-out frame is shuffled and cut in two: the first num_cv rows become
# the cross-validation split and the remaining num_test rows stay as the test split.
# NOTE: this cell rebinds test_df, so running it a second time without first
# re-running the cells that build test_df splits the already-halved frame again.
SPLIT_SEED = 42  # fixed seed so the test/cv partition is identical on every run

num_train = len(train_df)
num_test = len(test_df) // 2
num_cv = len(test_df) - num_test

print('train: {} test: {} cv: {}'.format(num_train, num_test, num_cv))

test_df = shuffle(test_df, random_state=SPLIT_SEED)
cv_df = test_df.iloc[:num_cv]
test_df = test_df.iloc[num_cv:]

In [None]:
# save the dataframes
# NOTE(review): the output directory is hard-coded to ../dataset/ng20 even though
# another cell of this notebook loads DBpedia into the same variables -- confirm
# the target directory matches the dataset that was actually processed.
print('save tf dataset ...')
train_df.to_pickle('../dataset/ng20/train.tf.df.pkl')
test_df.to_pickle('../dataset/ng20/test.tf.df.pkl')
cv_df.to_pickle('../dataset/ng20/cv.tf.df.pkl')

# save vocab
# count_vect.vocabulary_ is pickled with the highest pickle protocol available.
with open('../dataset/ng20/vocab.pkl', 'wb') as handle:
    pickle.dump(count_vect.vocabulary_, handle, protocol=pickle.HIGHEST_PROTOCOL)

## TFIDF format

In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix, vstack

In [None]:
# Reload the term-frequency splits from data_dir; the file names match the
# pickles written by the "save the dataframes" cell of this notebook.
# pd.read_pickle can execute arbitrary code -- only load files you produced yourself.
dataset = 'ng20'
data_dir = '../dataset/{}'.format(dataset)

train_df = pd.read_pickle(os.path.join(data_dir, 'train.tf.df.pkl'))
test_df = pd.read_pickle(os.path.join(data_dir, 'test.tf.df.pkl'))
cv_df = pd.read_pickle(os.path.join(data_dir, 'cv.tf.df.pkl'))


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix, vstack

def convert_to_dataframe(source_df, doc_id_list, bow_list, label_list):
    """Assemble a DataFrame indexed by ``doc_id`` from three parallel lists.

    Args:
        source_df: Accepted for interface compatibility; not read by this function.
        doc_id_list: Values that become the ``doc_id`` index.
        bow_list: Values for the ``bow`` column.
        label_list: Values for the ``label`` column.

    Returns:
        DataFrame with columns ``bow`` and ``label`` and an index named ``doc_id``.
    """
    column_data = {
        'doc_id': doc_id_list,
        'bow': bow_list,
        'label': label_list,
    }
    return pd.DataFrame(column_data).set_index('doc_id')

# Stack the per-document bow entries of each split back into one sparse matrix.
train_tf = vstack(list(train_df.bow))
test_tf = vstack(list(test_df.bow))
cv_tf = vstack(list(cv_df.bow))

# The transformer is fitted on the training split only and then applied,
# unchanged, to the test and cv splits.
transformer = TfidfTransformer(sublinear_tf=True)
train_tfidf = transformer.fit_transform(train_tf)
test_tfidf = transformer.transform(test_tf)
cv_tfidf = transformer.transform(cv_tf)

# Re-wrap every TF-IDF row together with the doc_id and label of its source frame.
train_tfidf_df = convert_to_dataframe(train_df, list(train_df.index), [bow for bow in train_tfidf], list(train_df.label))
test_tfidf_df = convert_to_dataframe(test_df, list(test_df.index), [bow for bow in test_tfidf], list(test_df.label))
cv_tfidf_df = convert_to_dataframe(cv_df, list(cv_df.index), [bow for bow in cv_tfidf], list(cv_df.label))

# Write next to the inputs (data_dir) instead of a hard-coded ng20 path, so that
# changing `dataset` cannot overwrite another dataset's files.
train_tfidf_df.to_pickle(os.path.join(data_dir, 'train.tfidf.df.pkl'))
test_tfidf_df.to_pickle(os.path.join(data_dir, 'test.tfidf.df.pkl'))
cv_tfidf_df.to_pickle(os.path.join(data_dir, 'cv.tfidf.df.pkl'))

In [None]:
# Binary format
from scipy.sparse import csr_matrix, vstack

def create_dataframe(doc_tf, doc_targets):
    """Build a per-document DataFrame indexed by ``doc_id``.

    Args:
        doc_tf: Iterable of per-document bag-of-words rows. Each item is stored
            unchanged in the ``bow`` column.
        doc_targets: Indexable collection of labels; ``doc_targets[i]`` is the
            label of the i-th item of ``doc_tf``.

    Returns:
        DataFrame with columns ``bow`` and ``label`` whose index, named
        ``doc_id``, is the position of each item in ``doc_tf`` (0, 1, 2, ...).

    Raises:
        IndexError: If ``doc_targets`` is a list/array shorter than ``doc_tf``.
    """
    records = [
        {'doc_id': i, 'bow': bow, 'label': doc_targets[i]}
        for i, bow in enumerate(doc_tf)
    ]
    # Naming the columns up front keeps the schema when there are no documents;
    # without it an empty input has no 'doc_id' column and set_index raises KeyError.
    df = pd.DataFrame(records, columns=['doc_id', 'bow', 'label'])
    return df.set_index('doc_id')

def create_bin_matrix(doc_tf_df):
    """Binarize every document's bag-of-words row and stack the rows into one matrix.

    Args:
        doc_tf_df: DataFrame with a ``bow`` column holding one scipy sparse row
            per document.

    Returns:
        Sparse float64 matrix with one row per document, in the row order of
        ``doc_tf_df``; an entry is 1.0 where the source value is > 0 and is not
        stored (0) elsewhere.

    Note:
        An empty frame is not handled here: scipy's ``vstack`` is then called
        with an empty list.
    """
    # np.float (an alias of the builtin float) was removed in NumPy 1.24 and
    # raises AttributeError there; np.float64 is the dtype it resolved to.
    # The comparison runs on the sparse row itself, so no dense copy is built.
    doc_bin = [(bow > 0).astype(np.float64) for bow in doc_tf_df.bow]
    return vstack(doc_bin)

# Build the binary (term present / absent) version of every split.
# NOTE(review): create_dataframe numbers documents 0..n-1, so these frames do not
# carry the doc_id index of train_df / test_df / cv_df, whereas the TF-IDF frames
# keep it -- confirm that renumbering is intended.
train_bin_df = create_dataframe(create_bin_matrix(train_df), list(train_df.label))
test_bin_df = create_dataframe(create_bin_matrix(test_df), list(test_df.label))
cv_bin_df = create_dataframe(create_bin_matrix(cv_df), list(cv_df.label))

# save the dataframes
# Written under data_dir (where the inputs were read from) instead of a
# hard-coded ng20 path, so changing `dataset` cannot overwrite another dataset's files.
train_bin_df.to_pickle(os.path.join(data_dir, 'train.bin.df.pkl'))
test_bin_df.to_pickle(os.path.join(data_dir, 'test.bin.df.pkl'))
cv_bin_df.to_pickle(os.path.join(data_dir, 'cv.bin.df.pkl'))