In [1]:
# Dataset source (question classification data):
# https://cogcomp.seas.upenn.edu/Data/QA/QC/

In [18]:
import functools
import pickle
import string

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence, one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [19]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# The loop simply does nothing on a CPU-only machine (the list is empty), and it
# applies the same setting to every visible GPU rather than only the first one.
physical_devices = tf.config.list_physical_devices("GPU")
for gpu_device in physical_devices :
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [20]:
def remove_html(text) :
    '''
    parameters : text - string
    removes HTML content from the text (eg. tags - <title></title>)
    returns : text_without_html - string
    '''
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed and emits a warning, so results can vary by machine.
    soup = BeautifulSoup(text, "html.parser")
    text_without_html = soup.get_text()
    return text_without_html


def remove_punctuation(text) :
    '''
    parameters : text - string
    removes punctuation from the text (eg. '.', '!', '?')
    every character listed in string.punctuation is deleted; all other characters,
    including whitespace, are kept in their original order
    returns : text_without_puntuation - string
    '''
    # A translation table with only a "delete" set removes the characters in one pass
    # and leaves the remaining characters adjacent (no separator is inserted).
    deletion_table = str.maketrans('', '', string.punctuation)
    text_without_puntuation = text.translate(deletion_table)
    return text_without_puntuation


@functools.lru_cache(maxsize=None)
def _english_stop_words() :
    '''
    loads the NLTK English stop word list and caches it, so repeated calls
    do not read the corpus again
    returns : frozenset of stop words
    '''
    return frozenset(stopwords.words('english'))


def remove_stop_words(token, stop_words=None) :
    '''
    parameters : token - list of words
                 stop_words - optional collection of words to remove; when None
                              the NLTK English stop word list is used
    removes stop words from the list (eg. 'a', 'the', 'are')
    returns : token_without_stop_words - list of words, in their original order
    '''
    if stop_words is None :
        stop_words = _english_stop_words()
    else :
        # convert to a set so each membership check is constant time
        stop_words = set(stop_words)
    token_without_stop_words = [word for word in token if word not in stop_words]
    return token_without_stop_words


def stemmed_words(tokens) :
    '''
    parameters : tokens - list of words
    runs every word through a Porter stemmer (eg. playing -> play)
    returns : list of stemmed words, in the same order as the input
    '''
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, tokens))

In [21]:
def clean_data(sentences) :
    '''
    parameters : sentences - list of sentences
    cleans every sentence by
        splitting it into tokens with text_to_word_sequence
        dropping the stop words with remove_stop_words
        joining the remaining tokens back together with single spaces
    returns : list of cleaned sentences, in the same order as the input
    '''
    def clean_one(sentence) :
        kept_tokens = remove_stop_words(text_to_word_sequence(sentence))
        return " ".join(kept_tokens)

    return [clean_one(sentence) for sentence in sentences]

In [22]:
def make_tokenizer(dataset) :
    '''
    parameters : dataset - list of sentences
    builds a Tokenizer and fits it on the given sentences, which creates the
    vocabulary of words found in them
    returns : the fitted Tokenizer object
    '''
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(dataset)
    return fitted_tokenizer

In [23]:
def encode_texts(dataset, tokenizer) :
    '''
    parameters : dataset - list of sentences
                 tokenizer - fitted Tokenizer object
    converts the sentences into integer sequences by delegating to the
    tokenizer's texts_to_sequences method
    returns : list of encoded sentences
    '''
    return tokenizer.texts_to_sequences(dataset)

def encode_labels(labels) :
    '''
    parameters : labels - list of labels/classes, one per input
    fits a LabelEncoder on the labels and replaces every label with its integer index
    returns : encoded labels, one per input
              number of distinct classes found in labels
    '''
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels)
    return encoded, len(encoder.classes_)

In [29]:
def _load_trec_file(path) :
    '''
    parameters : path - path to a TREC question classification label file
    reads the file line by line; every non-blank line is split at the first ':'
    into the coarse label and the remainder, and the remainder is split at the
    first space into the fine label and the question text
    returns : questions - list of question strings
              labels - list of coarse label strings
    raises : ValueError if a non-blank line contains no ':'
    '''
    questions = []
    labels = []
    # NOTE(review): the platform default encoding is used, as before - confirm it
    # matches the encoding of the downloaded files.
    with open(path) as trec_file :
        for line_number, line in enumerate(trec_file, start=1) :
            line = line.strip()
            if not line :
                continue
            if ':' not in line :
                raise ValueError(f"{path}, line {line_number}: expected 'LABEL:text', got {line!r}")
            # Split only at the first colon so a question that itself contains ':'
            # is kept whole.
            coarse_label, _, remainder = line.partition(':')
            # TREC label files use the layout "COARSE:fine question ..."; the token
            # right after the colon is the fine-grained class, and leaving it in the
            # text would leak label information into the features.
            # NOTE(review): verify this layout against the files in ./data/TREC.
            _fine_label, _, question = remainder.partition(' ')
            questions.append(question)
            labels.append(coarse_label)
    return questions, labels


x_train, train_labels = _load_trec_file('./data/TREC/train_5500.label')
x_test, test_labels = _load_trec_file('./data/TREC/TREC_10.label')

# Apply the same cleaning to both splits.
x_train = clean_data(x_train)
x_test = clean_data(x_test)

# Fit a single encoder on the training labels and reuse it for the test labels, so
# a class always maps to the same index in both splits (a test label that never
# occurs in training makes transform raise instead of being silently mis-indexed).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)
num_classes = len(label_encoder.classes_)

y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [30]:
# The vocabulary is built from the training questions only.
tokenizer = make_tokenizer(x_train)
# one more than the number of distinct words in the vocabulary
num_words = 1 + len(tokenizer.word_index)

# NOTE(review): x_train / x_test are rebound from text to padded integer arrays
# here, so re-running this cell without re-running the loading cell feeds arrays
# to the tokenizer instead of sentences.
train_sequences = encode_texts(x_train, tokenizer)
x_train = pad_sequences(train_sequences, padding='post')

# The test sequences are padded to the width of the padded training matrix.
max_length = x_train.shape[1]
test_sequences = encode_texts(x_test, tokenizer)
x_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [31]:
# Convert all four splits to NumPy arrays in one step.
x_train, x_test, y_train, y_test = (
    np.array(split) for split in (x_train, x_test, y_train, y_test)
)

In [32]:
# Archive keys: name1 = x_train, name2 = y_train, name3 = x_test, name4 = y_test.
encoded_arrays = {
    'name1': x_train,
    'name2': y_train,
    'name3': x_test,
    'name4': y_test,
}
np.savez('./data/encoded_dataset_trec.npz', **encoded_arrays)

In [33]:
# Persist the tokenizer's word -> index mapping. The context manager closes the
# file even if pickling fails part-way through.
with open("./data/vocab_trec.pkl", "wb") as vocab_file :
    pickle.dump(tokenizer.word_index, vocab_file)