In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

################################################################################################

def create_dataframe(doc_tf, doc_targets):
    """Build a one-row-per-document DataFrame from bag-of-words rows and labels.

    Parameters
    ----------
    doc_tf : iterable
        Yields one bag-of-words object per document (in this notebook, the
        rows of the matrix produced by ``CountVectorizer``).
    doc_targets : sequence
        Labels addressed by position: ``doc_targets[i]`` is the label of the
        i-th item yielded by ``doc_tf``.

    Returns
    -------
    pandas.DataFrame
        Columns ``doc_id`` (position within ``doc_tf``), ``bow`` and
        ``label``. The three columns are present even when ``doc_tf`` is
        empty.

    Raises
    ------
    IndexError
        If ``doc_targets`` is a list-like with fewer entries than ``doc_tf``.
    """
    columns = ['doc_id', 'bow', 'label']
    records = [
        {'doc_id': i, 'bow': bow, 'label': doc_targets[i]}
        for i, bow in enumerate(doc_tf)
    ]
    # Passing ``columns`` explicitly keeps the schema stable for empty input:
    # without it an empty record list yields a frame with no columns at all,
    # and later attribute access such as ``df.bow`` fails.
    return pd.DataFrame(records, columns=columns)

In [2]:
import os
from os.path import join
import numpy as np 
from pathlib import Path
# The train/test CSVs are read from <user home>/datasets/dbpedia.
home = os.fspath(Path.home())
root_dir = join(home, 'datasets/dbpedia')

In [3]:
# --- training split ---
# The CSV is read without a header row, so the three columns are named here.
train_fn = join(root_dir, 'train.csv')
df = pd.read_csv(train_fn, header=None)
df.columns = ['label', 'title', 'body']
train_content = df['body'].tolist()
# Labels are shifted down by one (presumably 1-based in the raw file, making
# them 0-based here -- confirm against the dataset).
train_label = (df['label'] - 1).tolist()

# --- test split (same layout; note that `df` is rebound to the test frame) ---
test_fn = join(root_dir, 'test.csv')
df = pd.read_csv(test_fn, header=None)
df.columns = ['label', 'title', 'body']
test_content = df['body'].tolist()
test_label = (df['label'] - 1).tolist()

# --- bag-of-words features ---
# The vocabulary is fitted on the training text only and then reused,
# unchanged, to transform the test text.
count_vect = CountVectorizer(
    stop_words='english',
    max_features=10000,
    max_df=0.8,
    min_df=3,
)
train_tf = count_vect.fit_transform(train_content)
test_tf = count_vect.transform(test_content)

In [8]:
# One frame per split: each row pairs a document's bag-of-words entry with its label.
train_df = create_dataframe(doc_tf=train_tf, doc_targets=train_label)
test_df = create_dataframe(doc_tf=test_tf, doc_targets=test_label)

def get_doc_length(doc_bow):
    """Return the sum of all entries of ``doc_bow``.

    ``doc_bow`` may be any object exposing a no-argument ``sum()`` method.
    For a bag-of-words count vector the result is the total term count of
    the document, which is 0 when no vocabulary term occurs in it.
    """
    total = doc_bow.sum()
    return total

# remove empty documents (rows whose bag-of-words entries sum to zero)
train_df = train_df[train_df.bow.apply(get_doc_length) > 0]
test_df = test_df[test_df.bow.apply(get_doc_length) > 0]

# split the original test set into a cv part and a test part
num_train = len(train_df)  # kept for reference; not used further in this cell
num_test = len(test_df) // 2
num_cv = len(test_df) - num_test  # cv receives the extra row when the count is odd

# Seed the shuffle so the cv/test partition (and therefore the pickles saved
# from it) is identical on every Restart & Run All.
SPLIT_SEED = 42
test_df = shuffle(test_df, random_state=SPLIT_SEED)
cv_df = test_df.iloc[:num_cv]
test_df = test_df.iloc[num_cv:]

# set doc_id as an index; rebinding instead of inplace=True avoids mutating
# frames that were derived by filtering/slicing another frame
train_df = train_df.set_index('doc_id')
test_df = test_df.set_index('doc_id')
cv_df = cv_df.set_index('doc_id')

In [14]:
# The labels are not uniformly distributed, so we can either rebalance them
# to be roughly equal or keep the given label distribution.
#
# The plan is to create two datasets: one with equally distributed labels and
# one without. No rebalancing is applied in this cell; the frames are saved
# as they are.

out_dir = '../dataset/dbpedia'
# to_pickle() and open() do not create missing folders, so make sure the
# output directory exists before writing anything.
os.makedirs(out_dir, exist_ok=True)

# save the dataframes
train_df.to_pickle(f'{out_dir}/train.df.pkl')
test_df.to_pickle(f'{out_dir}/test.df.pkl')
cv_df.to_pickle(f'{out_dir}/cv.df.pkl')

# save vocab (the term -> column-index mapping learned by the vectorizer)
with open(f'{out_dir}/vocab.pkl', 'wb') as handle:
    pickle.dump(count_vect.vocabulary_, handle, protocol=pickle.HIGHEST_PROTOCOL)