# Feature engineering

Feature engineering converts documents (words) into numerical features that machine learning algorithms can process.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sagemaker.session import Session

In [None]:
def split_data(df, training_fraction, test_fraction, validation_fraction):
    """
    Split data into consecutive training, test and validation sets.

    The rows are not shuffled: the first block of rows becomes the training set,
    the next block the test set and all remaining rows the validation set.

    Args:
    - df (dataframe): Data to split; rows are taken in their existing order.
    - training_fraction (float): Share of the rows assigned to the training set.
    - test_fraction (float): Share of the rows assigned to the test set.
    - validation_fraction (float): Share of the rows assigned to the validation set.

    Returns:
    Tuple (training_df, test_df, validation_df).

    Raises:
    AssertionError: If the three fractions do not add up to 1.
    """

    # compare with a tolerance: exact equality rejects valid splits such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999 in floating point
    fraction_sum = training_fraction + test_fraction + validation_fraction
    assert np.isclose(fraction_sum, 1.0, rtol=0.0, atol=1e-9), 'split fraction do not add to 1'

    nrow = df.shape[0]
    count_training = int(training_fraction * nrow)
    count_test = int(test_fraction * nrow)
    test_end = count_training + count_test

    training_df = df[:count_training]
    test_df = df[count_training:test_end]
    # the validation set takes every remaining row, including rows left over
    # by the truncation of the two counts above
    validation_df = df[test_end:]

    return training_df, test_df, validation_df

In [None]:
def calculate_term_frequency(X):
    """
    Normalize raw term counts into term frequencies (TF).

    Every row of X is divided by its own sum. A row whose sum is not positive is
    copied unchanged, which avoids a division by zero.

    Args:
    - X (array): Count matrix; its rows are processed one at a time.

    Returns:
    Float array with the same shape as X.
    """
    frequencies = np.zeros(shape=X.shape)
    for row_index, counts in enumerate(X):
        total = counts.sum()
        frequencies[row_index] = counts / total if total > 0 else counts
    return frequencies

In [None]:
def test_calculate_term_frequency():
    """
    Check calculate_term_frequency against a hand-computed example.

    Each row of a 2 x 7 count matrix must come back divided by its row sum
    (27 and 20). A confirmation message is printed when the check passes.

    Raises:
    AssertionError: If the computed term frequencies differ from the expected ones.
    """
    X = np.array([[4, 2, 7, 4, 9, 1, 0],
                  [6, 3, 2, 0, 0, 8, 1]])
    Y = calculate_term_frequency(X)
    Y_result = np.array([[4/27, 2/27, 7/27, 4/27, 9/27, 1/27, 0/27],
                         [6/20, 3/20, 2/20, 0/20, 0/20, 8/20, 1/20]])
    # compare with a tolerance: exact equality on floats depends on the order
    # of the arithmetic and is fragile; the shape check guards against broadcasting
    equal_arrays = Y.shape == Y_result.shape and np.allclose(Y, Y_result)
    assert equal_arrays, 'test_calculate_term_frequency: check of test failed'
    print('all test_calculate_term_frequency tests passed')

In [None]:
# run the term frequency check; it raises an AssertionError on failure
test_calculate_term_frequency()

In [None]:
def calculate_inverse_document_frequency(term_frequency_matrix, nrow=None):
    """
    Calculate the inverse document frequency (IDF) from a term frequency matrix (TF).

    For every term (column) the IDF is 1 + log(N / df), where N is the number of
    documents (rows) of term_frequency_matrix and df is the number of documents in
    which the term has a positive frequency. The IDF row is repeated nrow times so
    that the result can be multiplied elementwise with a TF matrix of nrow rows.

    Args:
    - term_frequency_matrix (array): TF matrix, one row per document and one column per term.
    - nrow (int): Number of rows of the returned matrix; defaults to the number of rows
    of term_frequency_matrix.

    Returns:
    Array of shape (nrow, number of terms) in which every row holds the same IDF values.
    """

    # if nrow is given, size the result according to nrow, otherwise on shape of term_frequency_matrix
    if nrow is None:
        nrow = term_frequency_matrix.shape[0]

    document_count = term_frequency_matrix.shape[0]

    # count an n-gram only once per document
    document_frequency = (term_frequency_matrix > 0).sum(axis=0)

    # a term that occurs in no document would cause a division by zero (IDF = inf,
    # and NaN once multiplied with a zero TF); treat it as seen in one document,
    # which gives it the largest finite IDF and leaves all other terms untouched
    document_frequency = np.maximum(document_frequency, 1)

    # inverse document frequency (idf) is the number of documents in the corpus,
    # divided by the number of documents where a term appears
    idf_single_array = 1 + np.log(document_count / document_frequency)

    # repeat the IDF row so it lines up elementwise with a TF matrix of nrow rows
    return np.tile(idf_single_array, (nrow, 1))

In [None]:
def test_calculate_inverse_document_frequency():
    """
    Check calculate_inverse_document_frequency against a hand-computed example.

    The corpus holds two documents. A term present in both documents must get
    1 + log(2 / 2) and a term present in only one document 1 + log(2 / 1); the same
    IDF row is expected for every document. A confirmation message is printed when
    the check passes.

    Raises:
    AssertionError: If the computed IDF matrix differs from the expected one.
    """
    X = np.array([[4, 2, 7, 4, 9, 1, 0],
                  [6, 3, 2, 0, 0, 8, 1]])
    X_tf = calculate_term_frequency(X)
    Y = calculate_inverse_document_frequency(X_tf)
    # N / df per term: 1 when the term occurs in both documents, 2 when in only one
    expected_row = 1 + np.log(np.array([1, 1, 1, 2, 2, 1, 2]))
    Y_result = np.array([expected_row, expected_row])
    # compare with a tolerance: exact equality on floats is fragile;
    # the shape check guards against an accidental broadcast match
    equal_arrays = Y.shape == Y_result.shape and np.allclose(Y, Y_result)
    assert equal_arrays, 'test_calculate_inverse_document_frequency: check of test failed'
    print('all test_calculate_inverse_document_frequency tests passed')

In [None]:
# run the inverse document frequency check; it raises an AssertionError on failure
test_calculate_inverse_document_frequency()

In [None]:
def build_dictionary(df, feature_size, n_gram_size):
    """
    Fit a CountVectorizer on the corpus and return it together with its n-gram dictionary.

    Args:
    - df (dataframe): Document corpus with the processed texts stored in column 'processed_text'.
    - feature_size (int): Passed as max_features, the maximum number of n-grams kept in the dictionary.
    - n_gram_size (int): n-gram size; used as both lower and upper bound of ngram_range.

    Returns:
    Tuple (vectorizer, feature_dictionary): the fitted CountVectorizer and the list of
    n-grams reported by the vectorizer as its feature names.
    """
    corpus = df['processed_text'].values
    vectorizer = CountVectorizer(max_features=feature_size, ngram_range=(n_gram_size, n_gram_size))
    vectorizer.fit(corpus) # use training data to generate dictionary

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
    # fall back to the old name on releases that do not provide it yet
    if hasattr(vectorizer, 'get_feature_names_out'):
        # the replacement returns an array, convert to keep the list return type
        feature_dictionary = list(vectorizer.get_feature_names_out()) # feature order
    else:
        feature_dictionary = vectorizer.get_feature_names() # feature order
    return vectorizer, feature_dictionary

In [None]:
def build_feature_matrix(train_df, test_df, validation_df, feature_size, n_gram_size):
    """
    Build the feature matrices necessary for machine learning.

    The n-gram dictionary and the inverse document frequency (IDF) are derived from
    the training data only and are then applied to all three datasets.

    Args:
    - train_df (dataframe): Training document corpus with processed texts stored in column 'processed_text'.
    - test_df (dataframe): Test document corpus, same column as train_df.
    - validation_df (dataframe): Validation document corpus, same column as train_df.
    - feature_size: (int) Defines the number of top n-grams extracted from the corpus, this defines the number of columns
    of the feature matrix.
    - n_gram_size: (int) Defines the n-gram size (contiguous sequence of items with length n).

    Returns:
    Tuple (tf_train, tf_idf_train, tf_test, tf_idf_test, tf_validation, tf_idf_validation) with the
    term frequency (TF) and TF-IDF matrix of each dataset, one row per document.
    """

    # build dictionary just on training data
    transformer, _ = build_dictionary(train_df, feature_size, n_gram_size)

    # build count matrices for all partial datasets
    train_matrix = transformer.transform(train_df['processed_text'].values).toarray()
    test_matrix = transformer.transform(test_df['processed_text'].values).toarray()
    validation_matrix = transformer.transform(validation_df['processed_text'].values).toarray()

    # term frequency (TF) can be calculated for all partial datasets individually
    tf_train = calculate_term_frequency(train_matrix)
    tf_test = calculate_term_frequency(test_matrix)
    tf_validation = calculate_term_frequency(validation_matrix)

    # inverse document frequency (IDF) is taken from the training data only; it is one
    # value per term, so a single row is computed once and broadcast over every dataset
    # instead of recomputing and materializing a full matrix three times
    idf_row = calculate_inverse_document_frequency(tf_train, 1)

    # calculate term frequency inverse document frequency (TF-IDF)
    tf_idf_train = np.multiply(tf_train, idf_row)
    tf_idf_test = np.multiply(tf_test, idf_row)
    tf_idf_validation = np.multiply(tf_validation, idf_row)

    return tf_train, tf_idf_train, tf_test, tf_idf_test, tf_validation, tf_idf_validation

In [None]:
def upload_data(y, X, session, bucket, prefix, feature_type, data_type, corpus_size, n_gram_size):
    """
    Write labels and features to a local CSV file and upload it to S3.

    The file is written as tmp/<data_type>-<feature_type>-<corpus_size>-<feature_size>-<n_gram_size>/<data_type>.csv
    without header and index, with the values of y in the first column followed by the
    columns of X. The local tmp folder is uploaded and then removed again.

    Args:
    - y (array): Class labels, one per row of X.
    - X (array): Feature matrix; its number of columns is used as feature_size in the folder name.
    - session: Object providing upload_data(path, bucket=..., key_prefix=...).
    - bucket (str): Passed to session.upload_data as bucket.
    - prefix (str): Passed to session.upload_data as key_prefix.
    - feature_type (str): Feature label used in the folder name, e.g. 'tf' or 'tf-idf'.
    - data_type (str): Dataset label used in the folder and file name, e.g. 'train'.
    - corpus_size (int): Corpus size used in the folder name.
    - n_gram_size (int): n-gram size used in the folder name.
    """
    import shutil

    feature_size = X.shape[1]

    print('upload {}-{}-{}-{}-{}'.format(data_type, feature_type, corpus_size, feature_size, n_gram_size))

    df = pd.concat([pd.DataFrame(y), pd.DataFrame(X)], axis=1)
    data_path = '{}/{}-{}-{}-{}-{}'.format('tmp', data_type, feature_type, corpus_size, feature_size, n_gram_size)

    try:
        # tmp folder is used for local file creation and then from there upload to s3;
        # makedirs creates 'tmp' and the dataset folder in one step
        os.makedirs(data_path, exist_ok=True)

        df.to_csv('{}/{}.csv'.format(data_path, data_type), index=False, header=False)

        # upload to s3
        session.upload_data('tmp', bucket=bucket, key_prefix=prefix)
    finally:
        # remove local files even when the upload fails, otherwise the next call
        # would upload the stale files again together with its own dataset
        shutil.rmtree('tmp', ignore_errors=True)

In [None]:
def create_dataset(train_df, test_df, validation_df, session, bucket, prefix, corpus_size, feature_size, n_gram_size):
    """
    Build TF and TF-IDF features for all three datasets and upload them to S3.

    For each of the training, test and validation dataset two files are uploaded, one
    with the TF and one with the TF-IDF features, each together with the values of
    column 'class' of the corresponding dataframe.

    Args:
    - train_df (dataframe): Training document corpus.
    - test_df (dataframe): Test document corpus.
    - validation_df (dataframe): Validation document corpus.
    - session, bucket, prefix: Forwarded unchanged to upload_data.
    - corpus_size (int): Forwarded to upload_data.
    - feature_size (int): Forwarded to build_feature_matrix.
    - n_gram_size (int): Forwarded to build_feature_matrix and upload_data.
    """

    # build features
    features = build_feature_matrix(train_df, test_df, validation_df, feature_size, n_gram_size)
    train_tf, train_tf_idf, test_tf, test_tf_idf, validation_tf, validation_tf_idf = features

    # one entry per dataset: (data type, class values, TF matrix, TF-IDF matrix)
    datasets = [
        ('train', train_df['class'].values, train_tf, train_tf_idf),
        ('test', test_df['class'].values, test_tf, test_tf_idf),
        ('validation', validation_df['class'].values, validation_tf, validation_tf_idf),
    ]

    # upload to s3, TF first and TF-IDF second for every dataset
    for data_type, labels, tf_matrix, tf_idf_matrix in datasets:
        for feature_type, matrix in (('tf', tf_matrix), ('tf-idf', tf_idf_matrix)):
            upload_data(labels, matrix, session, bucket, prefix, feature_type, data_type, corpus_size, n_gram_size)

## Save data to S3 storage

In [None]:
# define some SageMaker base parameters: the session and its default bucket
# are passed to create_dataset further below for the upload
sagemaker_session = Session()
default_bucket = sagemaker_session.default_bucket()
print('SageMaker session {}'.format(sagemaker_session))
print('SageMaker default bucket {}'.format(default_bucket))

In [None]:
# import corpus data from data/corpus-5000.csv and preview the first two rows
corpus = pd.read_csv('{}/{}'.format('data', 'corpus-5000.csv'))
corpus.head(2)

In [None]:
# split corpus data: 60% training, 20% test, 20% validation
# (split_data takes the rows in their existing order, without shuffling)
corpus_training, corpus_test, corpus_validation = split_data(corpus, 0.6, 0.2, 0.2)

In [None]:
# create datasets and upload to s3: n-gram sizes 1 and 2, each with a
# feature size of 500, 1000 and 5000
for n_gram_size in (1, 2):
    for feature_size in (500, 1000, 5000):
        create_dataset(corpus_training, corpus_test, corpus_validation, sagemaker_session, default_bucket,
                       'data', 5000, feature_size, n_gram_size)