In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import os
from os.path import join
from pathlib import Path
from sklearn.utils import shuffle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

################################################################################################

# Corpus identifier; it names the raw-data folder below and is also
# substituted into the output paths when the frames are pickled.
dataset = 'yahooanswer'

# Raw CSV files are read from <home>/datasets/<dataset>.
home = os.fspath(Path.home())
root_dir = os.path.join(home, 'datasets', dataset)

################################################################################################

def create_dataframe(doc_tf, doc_targets):
    """Build a per-document DataFrame from a collection of bag-of-words rows.

    Parameters
    ----------
    doc_tf : iterable
        Iterating it yields one bag-of-words entry per document.
    doc_targets : sequence
        ``doc_targets[i]`` is the label of the i-th entry of ``doc_tf``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``doc_id`` (0-based position in
        ``doc_tf``), ``bow`` and ``label``.
    """
    records = [
        {'doc_id': position, 'bow': entry, 'label': doc_targets[position]}
        for position, entry in enumerate(doc_tf)
    ]
    return pd.DataFrame.from_dict(records)

In [2]:
# Exploratory load of the raw training split. The file is read with
# header=None (no row is used as a header), so the four column names
# are assigned by hand afterwards.
train_fn = os.path.join(root_dir, 'train.csv')
df = pd.read_csv(filepath_or_buffer=train_fn, header=None)
df.columns = [
    'label',
    'title',
    'body',
    'answer',
]

In [6]:
# Scratch cell: materialise the title column of the currently loaded frame
# as a plain Python list (not referenced by the cells visible below).
title = df['title'].tolist()

In [None]:
def _load_split(csv_path):
    """Read one raw CSV split and return ``(frame, titles, labels)``.

    The file is read without a header row and its four columns are named
    ``label``, ``title``, ``body`` and ``answer``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    frame : pandas.DataFrame
        The raw split with named columns (left unmodified).
    titles : list
        The ``title`` column. Missing titles (NaN after parsing) are replaced
        by ``''`` because CountVectorizer rejects NaN documents; such rows
        end up with an empty bag-of-words.
    labels : list
        The ``label`` column shifted down by one (``label - 1``).
    """
    frame = pd.read_csv(csv_path, header=None)
    frame.columns = ['label', 'title', 'body', 'answer']
    titles = list(frame.title.fillna(''))
    labels = list(frame.label - 1)
    return frame, titles, labels


# Both splits go through the same loader; ``df`` is rebound to the most
# recently loaded frame, exactly as the copy-pasted version did.
train_fn = os.path.join(root_dir, 'train.csv')
df, train_content, train_label = _load_split(train_fn)

test_fn = os.path.join(root_dir, 'test.csv')
df, test_content, test_label = _load_split(test_fn)

################################################################################################
# The vocabulary is fitted on the training titles only; the test titles are
# merely projected onto that vocabulary.
count_vect = CountVectorizer(stop_words='english', max_features=12000, max_df=0.8, min_df=3)
train_tf = count_vect.fit_transform(train_content)
test_tf = count_vect.transform(test_content)

In [None]:
# Wrap each term-count matrix and its labels into a per-document DataFrame
# (training split first, then the test split).
train_df, test_df = (
    create_dataframe(term_counts, labels)
    for term_counts, labels in ((train_tf, train_label), (test_tf, test_label))
)

def get_doc_length(doc_bow):
    """Return the sum of all entries of ``doc_bow`` (its ``.sum()``).

    For a term-count bag-of-words this is the number of counted tokens.
    """
    total = doc_bow.sum()
    return total

# Drop empty documents: keep only rows whose bag-of-words sums to > 0.
train_df = train_df[train_df.bow.apply(get_doc_length) > 0]
test_df = test_df[test_df.bow.apply(get_doc_length) > 0]

# Split the held-out data into a cv part and a test part. The test part
# gets half of the rows (rounded down); cv gets the remainder.
num_train = len(train_df)
num_test = len(test_df) // 2
num_cv = len(test_df) - num_test

# Fixed seed so the cv/test split is identical on every Restart & Run All.
SPLIT_SEED = 42
test_df = shuffle(test_df, random_state=SPLIT_SEED)
cv_df = test_df.iloc[:num_cv]
test_df = test_df.iloc[num_cv:]

# Use doc_id as the index. Rebinding (instead of inplace=True) avoids
# mutating frames that were obtained by filtering/slicing another frame.
train_df = train_df.set_index('doc_id')
test_df = test_df.set_index('doc_id')
cv_df = cv_df.set_index('doc_id')

In [None]:
# Sub-sample each split without replacement. A fixed seed makes the saved
# datasets reproducible across runs.
SAMPLE_SEED = 42
train_df = train_df.sample(n=100000, replace=False, random_state=SAMPLE_SEED)
test_df = test_df.sample(n=10000, replace=False, random_state=SAMPLE_SEED)
cv_df = cv_df.sample(n=10000, replace=False, random_state=SAMPLE_SEED)

# Make sure the output folder exists; to_pickle / open do not create it.
os.makedirs('../dataset/{}'.format(dataset), exist_ok=True)

# save the term-frequency dataframes
train_df.to_pickle('../dataset/{}/train.tf.df.pkl'.format(dataset))
test_df.to_pickle('../dataset/{}/test.tf.df.pkl'.format(dataset))
cv_df.to_pickle('../dataset/{}/cv.tf.df.pkl'.format(dataset))

# save the vocabulary (term -> column index mapping of the fitted vectorizer)
with open('../dataset/{}/vocab.pkl'.format(dataset), 'wb') as handle:
    pickle.dump(count_vect.vocabulary_, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# TFIDF format
from scipy.sparse import csr_matrix, vstack

# Document frequency: for every vocabulary term, the number of training
# documents in which it occurs at least once. The per-document rows are
# stacked into one sparse matrix and the columns summed in a single pass,
# so no row is densified (and the removed ``np.float`` alias is not needed).
train_counts = vstack(train_df.bow.tolist()).tocsr()
doc_freq = np.asarray((train_counts > 0).sum(axis=0), dtype=float).ravel()

# Inverse document frequency. The vocabulary may contain terms that do not
# occur in train_df at all (doc_freq == 0); dividing by zero would give
# idf = inf and later 0 * inf = NaN weights. Such terms are treated as if
# they had been seen once, i.e. they get the largest finite idf, log(1 + N).
N = len(train_df)
idf = np.log(1. + N / np.maximum(doc_freq, 1.))

def create_tfidf_matrix(doc_tf_df, docidf):
    """Convert per-document term counts into a stacked sparse TF-IDF matrix.

    Each weight is ``log1p(count) * idf``. Only non-zero counts are weighted,
    so a zero count always yields a zero weight, even if the matching idf
    entry is not finite (``0 * inf`` would otherwise produce NaN).

    Parameters
    ----------
    doc_tf_df : pandas.DataFrame
        Frame with a ``bow`` column whose entries expose ``.toarray()``
        (one row of term counts per document).
    docidf : array-like
        Inverse document frequencies; broadcast against one row of counts.

    Returns
    -------
    scipy sparse matrix
        The per-document TF-IDF rows stacked vertically, in the iteration
        order of ``doc_tf_df``.
    """
    doc_tfidf = []
    for _, row in doc_tf_df.iterrows():
        # ``float`` replaces the ``np.float`` alias removed in NumPy 1.24.
        counts = row.bow.toarray().ravel().astype(float)
        idf_row = np.broadcast_to(np.asarray(docidf, dtype=float), counts.shape)
        weights = np.zeros_like(counts)
        present = counts != 0
        weights[present] = np.log1p(counts[present]) * idf_row[present]
        doc_tfidf.append(csr_matrix(weights))
    return vstack(doc_tfidf)

# Build one TF-IDF frame per split, all weighted with the same idf vector,
# and index each by doc_id. Note that create_dataframe numbers doc_id from 0
# in row order, so these ids are positions within each split.
train_tfidf_df = create_dataframe(
    doc_tf=create_tfidf_matrix(train_df, idf),
    doc_targets=list(train_df.label),
).set_index('doc_id')
test_tfidf_df = create_dataframe(
    doc_tf=create_tfidf_matrix(test_df, idf),
    doc_targets=list(test_df.label),
).set_index('doc_id')
cv_tfidf_df = create_dataframe(
    doc_tf=create_tfidf_matrix(cv_df, idf),
    doc_targets=list(cv_df.label),
).set_index('doc_id')

# Persist the three TF-IDF frames next to the term-frequency ones.
train_tfidf_df.to_pickle('../dataset/{}/train.tfidf.df.pkl'.format(dataset))
test_tfidf_df.to_pickle('../dataset/{}/test.tfidf.df.pkl'.format(dataset))
cv_tfidf_df.to_pickle('../dataset/{}/cv.tfidf.df.pkl'.format(dataset))

In [None]:
# Binary format
from scipy.sparse import csr_matrix, vstack

def create_bin_matrix(doc_tf_df):
    """Convert per-document term counts into a stacked sparse binary matrix.

    An entry is 1.0 where the term count is greater than zero, else 0.0.

    Parameters
    ----------
    doc_tf_df : pandas.DataFrame
        Frame with a ``bow`` column whose entries expose ``.toarray()``
        (one row of term counts per document).

    Returns
    -------
    scipy sparse matrix
        The binarised rows stacked vertically, in the iteration order of
        ``doc_tf_df``.
    """
    # create the binary (term presence) rows
    doc_bin = []
    for _, row in doc_tf_df.iterrows():
        # ``float`` replaces the ``np.float`` alias removed in NumPy 1.24.
        presence = (row.bow.toarray().ravel() > 0).astype(float)
        doc_bin.append(csr_matrix(presence))
    return vstack(doc_bin)

# Build one binary (term presence) frame per split.
train_bin_df = create_dataframe(
    doc_tf=create_bin_matrix(train_df),
    doc_targets=list(train_df.label),
)
test_bin_df = create_dataframe(
    doc_tf=create_bin_matrix(test_df),
    doc_targets=list(test_df.label),
)
cv_bin_df = create_dataframe(
    doc_tf=create_bin_matrix(cv_df),
    doc_targets=list(cv_df.label),
)

# Persist the three binary frames.
# NOTE(review): doc_id stays a regular column here, whereas the TF and
# TF-IDF frames are indexed by doc_id before saving - confirm which layout
# the downstream loaders expect.
train_bin_df.to_pickle('../dataset/{}/train.bin.df.pkl'.format(dataset))
test_bin_df.to_pickle('../dataset/{}/test.bin.df.pkl'.format(dataset))
cv_bin_df.to_pickle('../dataset/{}/cv.bin.df.pkl'.format(dataset))