In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

################################################################################################
use_sklearn_data = True


def _load_stemmed_corpus(path):
    """Read a corpus file with one document per line: first token is the label, the rest is the text.

    Returns:
        (docs, tags): two parallel lists of document strings and raw label tokens.
    """
    docs, tags = [], []
    with open(path) as raw_text:
        for line in raw_text:
            tokens = line.strip().split()
            if not tokens:
                # A blank line carries no label token; indexing tokens[0] would raise IndexError.
                continue
            tags.append(tokens[0])
            docs.append(' '.join(tokens[1:]))
    return docs, tags


if use_sklearn_data:
    # Headers, footers and quotes are kept (the `remove=` option is deliberately left off).
    train = fetch_20newsgroups(subset='train') #, remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test') #, remove=('headers', 'footers', 'quotes'))

    train_docs = train.data
    train_tags = train.target
    test_docs = test.data
    test_tags = test.target
else:
    raw_train_fn = '../dataset/raw/20ng-train-stemmed.txt'
    train_docs, train_tags = _load_stemmed_corpus(raw_train_fn)

    raw_test_fn = '../dataset/raw/20ng-test-stemmed.txt'
    test_docs, test_tags = _load_stemmed_corpus(raw_test_fn)

    # sorted() pins the label -> id mapping; iterating a bare set of strings can
    # give a different order (and therefore different ids) on every kernel start.
    category = {cate: i for i, cate in enumerate(sorted(set(train_tags)))}
    train_tags = [category[cate] for cate in train_tags]
    test_tags = [category[cate] for cate in test_tags]

In [2]:
################################################################################################
# Raw-count alternative (disabled):
# count_vect = CountVectorizer(stop_words='english', max_features=10000, max_df=0.8, min_df=3)
# train_tf = count_vect.fit_transform(train_docs)
# test_tf = count_vect.transform(test_docs)

from sklearn.feature_extraction.text import TfidfVectorizer

# Sub-linear TF with IDF and L2-normalised rows, vocabulary capped at 10000 terms.
tfidf = TfidfVectorizer(
    norm='l2',
    use_idf=True,
    sublinear_tf=True,
    max_features=10000,
    max_df=0.80,
    min_df=3,
)

# The vocabulary and IDF weights are fitted on the training documents only;
# the test documents are merely projected onto them.
train_tf = tfidf.fit_transform(train_docs)
test_tf = tfidf.transform(test_docs)

################################################################################################
def create_dataframe(doc_tf, doc_targets):
    """Pack per-document bag-of-words rows and their labels into a DataFrame.

    Args:
        doc_tf: iterable yielding one bag-of-words row per document (for a
            scipy sparse matrix, iteration yields its rows one at a time).
        doc_targets: indexable collection of labels; ``doc_targets[i]`` is the
            label stored for the i-th row of ``doc_tf``.

    Returns:
        pandas.DataFrame with the columns ``doc_id`` (running position of the
        row in ``doc_tf``), ``bow`` and ``label``, one row per document.
    """
    records = [
        {'doc_id': i, 'bow': bow, 'label': doc_targets[i]}
        for i, bow in enumerate(doc_tf)
    ]
    # Naming the columns explicitly keeps the schema intact when doc_tf is
    # empty; without it an empty input gives a frame with no columns at all.
    return pd.DataFrame(records, columns=['doc_id', 'bow', 'label'])

# One frame per split: each row holds a document's vectorised text and its label.
train_df = create_dataframe(doc_tf=train_tf, doc_targets=train_tags)
test_df = create_dataframe(doc_tf=test_tf, doc_targets=test_tags)

def get_doc_length(doc_bow):
    """Return the sum of all entries in one document's bag-of-words row."""
    total_weight = doc_bow.sum()
    return total_weight

# Drop every document whose bag-of-words row sums to zero.
train_df = train_df.loc[lambda df: df.bow.apply(get_doc_length) > 0]
test_df = test_df.loc[lambda df: df.bow.apply(get_doc_length) > 0]

print('num train: {} num test: {}'.format(len(train_df), len(test_df)))

num train: 11314 num test: 7532


In [3]:
def get_num_word(doc_bow):
    """Count the non-zero entries in one document's bag-of-words row."""
    _, col_idx = doc_bow.nonzero()
    return col_idx.shape[0]

# Keep only documents with more than 5 and fewer than 500 non-zero terms.
train_df = (
    train_df
    .loc[lambda df: df.bow.apply(get_num_word) > 5]
    .loc[lambda df: df.bow.apply(get_num_word) < 500]
)
test_df = (
    test_df
    .loc[lambda df: df.bow.apply(get_num_word) > 5]
    .loc[lambda df: df.bow.apply(get_num_word) < 500]
)

In [4]:
# split test and cv
# A fixed seed makes the test/cv partition identical on every Restart & Run All;
# an unseeded shuffle puts different documents in each half on every run.
SPLIT_SEED = 42

num_train = len(train_df)
num_test = len(test_df) // 2
num_cv = len(test_df) - num_test

print('train: {} test: {} cv: {}'.format(num_train, num_test, num_cv))

test_df = shuffle(test_df, random_state=SPLIT_SEED)

# set doc_id as an index
# set_index returns a new frame, so the iloc slices are never modified in place
# (in-place edits on slices are what trigger pandas' SettingWithCopyWarning).
cv_df = test_df.iloc[:num_cv].set_index('doc_id')
test_df = test_df.iloc[num_cv:].set_index('doc_id')
train_df = train_df.set_index('doc_id')

train: 11162 test: 3724 cv: 3724


In [5]:
# save the dataframes
for frame, pkl_path in (
    (train_df, '../dataset/ng20/train.tfidf.df.pkl'),
    (test_df, '../dataset/ng20/test.tfidf.df.pkl'),
    (cv_df, '../dataset/ng20/cv.tfidf.df.pkl'),
):
    frame.to_pickle(pkl_path)

# save vocab (the fitted vectorizer's term -> column-index mapping)
with open('../dataset/ng20/vocab.pkl', 'wb') as vocab_file:
    pickle.dump(tfidf.vocabulary_, vocab_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# TFIDF format
from scipy.sparse import csr_matrix, vstack

# compute DocFrequency: per term, the number of training documents with a positive entry
doc_freq = np.zeros(train_df.iloc[0].bow.shape[1])
for index, row in train_df.iterrows():
    # np.float64 replaces the np.float alias, which NumPy 1.24 removed
    # (using it now raises AttributeError).
    bow = (row.bow.toarray().squeeze() > 0).astype(np.float64)
    doc_freq += bow

# compute inverse document frequency; the +1 in the denominator avoids division by zero
N = len(train_df)
idf = np.log(1. + N / (doc_freq + 1.))

def create_tfidf_matrix(doc_tf_df, docidf):
    """Re-weight every document's bag-of-words row as ``log1p(value) * docidf``.

    Args:
        doc_tf_df: DataFrame with a ``bow`` column holding one sparse row per document.
        docidf: per-term weights, multiplied element-wise with each densified row.

    Returns:
        Sparse matrix made by vertically stacking the re-weighted rows in the
        frame's row order.
    """
    weighted_rows = []
    for _, row in doc_tf_df.iterrows():
        # np.float64 replaces the np.float alias, which NumPy 1.24 removed
        # (using it now raises AttributeError).
        dense_bow = row.bow.toarray().squeeze().astype(np.float64)
        # Local name chosen so it does not shadow the notebook-level `tfidf` vectorizer.
        weighted = csr_matrix(np.log1p(dense_bow) * docidf)
        weighted_rows.append(weighted)
    return vstack(weighted_rows)

# Re-weight each split with the IDF computed from the training frame, then
# make doc_id the index of the resulting frame.
train_tfidf_df = create_dataframe(
    create_tfidf_matrix(train_df, idf), list(train_df.label)
).set_index('doc_id')
test_tfidf_df = create_dataframe(
    create_tfidf_matrix(test_df, idf), list(test_df.label)
).set_index('doc_id')
cv_tfidf_df = create_dataframe(
    create_tfidf_matrix(cv_df, idf), list(cv_df.label)
).set_index('doc_id')

# save the dataframes
# NOTE(review): these are the same three paths that cell In [5] writes, and the
# input `bow` rows there already come from TfidfVectorizer - running this cell
# replaces those files with re-weighted values. Confirm that is intended.
for frame, pkl_path in (
    (train_tfidf_df, '../dataset/ng20/train.tfidf.df.pkl'),
    (test_tfidf_df, '../dataset/ng20/test.tfidf.df.pkl'),
    (cv_tfidf_df, '../dataset/ng20/cv.tfidf.df.pkl'),
):
    frame.to_pickle(pkl_path)

In [None]:
# Binary format
from scipy.sparse import csr_matrix, vstack

def create_bin_matrix(doc_tf_df):
    """Binarise every document's bag-of-words row: 1.0 where the entry is > 0, else 0.0.

    Args:
        doc_tf_df: DataFrame with a ``bow`` column holding one sparse row per document.

    Returns:
        Sparse matrix made by vertically stacking the binarised rows in the
        frame's row order.
    """
    binary_rows = []
    for _, row in doc_tf_df.iterrows():
        # np.float64 replaces the np.float alias, which NumPy 1.24 removed
        # (using it now raises AttributeError).
        present = (row.bow.toarray().squeeze() > 0).astype(np.float64)
        binary_rows.append(csr_matrix(present))
    return vstack(binary_rows)

# Binarise each split, carrying over the labels already stored in the frames.
# NOTE(review): unlike the TF-IDF frames built in the cell above, doc_id is left
# as a regular column here (no set_index) - confirm downstream readers expect that.
train_bin_df = create_dataframe(
    create_bin_matrix(train_df), list(train_df.label)
)
test_bin_df = create_dataframe(
    create_bin_matrix(test_df), list(test_df.label)
)
cv_bin_df = create_dataframe(
    create_bin_matrix(cv_df), list(cv_df.label)
)

# save the dataframes
for frame, pkl_path in (
    (train_bin_df, '../dataset/ng20/train.bin.df.pkl'),
    (test_bin_df, '../dataset/ng20/test.bin.df.pkl'),
    (cv_bin_df, '../dataset/ng20/cv.bin.df.pkl'),
):
    frame.to_pickle(pkl_path)

In [None]:
# import os
# from os.path import join
# import numpy as np
# import pandas as pd
# import pickle
# import torch
# from torch.utils.data import Dataset

# class Newsgroups20Dataset(Dataset):
#     """Newsgroups20 dataset."""

#     def __init__(self, data_dir, download=False, subset='train', bow_format='tf'):
#         """
#         Args:
#             data_dir (string): Directory for loading and saving train, test, and cv dataframes.
#             download (boolean): Download newsgroups20 dataset from sklearn if necessary.
#             subset (string): Specify subset of the datasets. The choices are: train, test, cv.
#             bow_format (string): A weight scheme of a bag-of-words document. The choices are:
#                 tf (term frequency), tfidf (term freq with inverse document frequency), bm25.
#         """
#         self.data_dir = data_dir
#         self.subset = subset
#         self.bow_format = bow_format
#         self.df = self.load_df('{}.{}.df.pkl'.format(subset, bow_format))
        
#     def load_df(self, df_file):
#         df_file = os.path.join(self.data_dir, df_file)
#         return pd.read_pickle(df_file)
        
#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         doc_bow = self.df.iloc[idx].bow
#         doc_bow = torch.from_numpy(doc_bow.toarray().squeeze().astype(np.float32))
#         label = self.df.iloc[idx].label
#         return (doc_bow, label)

In [None]:
# NOTE(review): none of the visible cells import torch or define
# Newsgroups20Dataset (the class in the cell above is commented out), and none
# writes a '*.tf.df.pkl' file - presumably these come from elsewhere; confirm
# before running on a fresh kernel.
train_set = Newsgroups20Dataset(
    '../dataset/ng20',
    download=True,
    subset='train',
    bow_format='tf',
)
# Shuffled mini-batches of 64 items drawn from train_set.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    shuffle=True,
    batch_size=64,
)

In [None]:
# Pull a single batch from the loader and stop; xb and yb stay bound for the cells below.
for first_batch in train_loader:
    xb, yb = first_batch
    break

In [None]:
# Show the size of xb, the first item of the batch unpacked from train_loader.
xb.size()

In [None]:
# Show yb, the second item of the batch unpacked from train_loader.
yb