# Feature engineering

Feature engineering converts documents (sequences of words) into numerical features that machine learning algorithms can process.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Load the corpus from data/corpus.csv into a dataframe and preview its first rows.
corpus = pd.read_csv('{}/{}'.format('data', 'corpus.csv'))
corpus.head()

In [None]:
def count_features(values, keys):
    """
    Rank the features by their summed values.

    Args:
    values: (numpy array) Matrix that is summed along axis 0, i.e. one total per column.
    keys: (iterable) Feature names in the column order of values.

    Returns:
    (list of tuples) Pairs of (summed value, key), sorted in descending order.
    """
    totals_per_feature = values.sum(axis=0)
    # tuples are compared by total first, ties are resolved by the key
    return sorted(zip(totals_per_feature, keys), reverse=True)

In [None]:
def calculate_term_frequency(X):
    """
    Normalize every row of X by its row sum.

    Args:
    X: (numpy array) Occurrence matrix, each row is treated as one document.

    Returns:
    (numpy array) Float matrix with the shape of X. Rows whose sum is not positive are copied unchanged.
    """
    normalized = np.zeros(shape=X.shape)
    for row_index, row in enumerate(X):
        row_total = row.sum()
        # a row without any counts cannot be normalized, keep it as it is
        normalized[row_index] = row / row_total if row_total > 0 else row
    return normalized

In [None]:
def test_calculate_term_frequency():
    """
    Check calculate_term_frequency against hand-computed relative frequencies.

    The last document contains no counts at all and therefore has to stay all zero.

    Raises:
    AssertionError: If the computed term frequencies deviate from the expected values.
    """
    X = np.array([[4, 2, 7, 4, 9, 1, 0],
                  [6, 3, 2, 0, 0, 8, 1],
                  [0, 0, 0, 0, 0, 0, 0]])
    Y = calculate_term_frequency(X)
    Y_result = np.array([[4/27, 2/27, 7/27, 4/27, 9/27, 1/27, 0/27],
                         [6/20, 3/20, 2/20, 0/20, 0/20, 8/20, 1/20],
                         [0, 0, 0, 0, 0, 0, 0]])
    # compare with a tolerance: exact equality of floats depends on rounding details of the implementation
    equal_arrays = Y.shape == Y_result.shape and np.allclose(Y, Y_result)
    assert equal_arrays, 'test_calculate_term_frequency: check of test failed'
    print('all test_calculate_term_frequency tests passed')

In [None]:
# Run the check of calculate_term_frequency; a failing check raises an AssertionError.
test_calculate_term_frequency()

In [None]:
def calculate_term_frequency_inverse_document_frequency(X):
    """
    Calculate the term frequency inverse document frequency (TF-IDF) matrix.

    The term frequency of every document (row of X) is weighted elementwise with the inverse document frequency
    idf = 1 + log(number of documents / number of documents that contain the n-gram).

    Args:
    X: (numpy array) Occurrence matrix with one row per document and one column per n-gram.

    Returns:
    (numpy array) TF-IDF matrix with the same shape as X. Columns of n-grams that occur in no document are 0.
    """
    term_frequency = calculate_term_frequency(X)

    # count an n-gram only once per document; the comparison builds a new array, so term_frequency
    # itself is not overwritten and can be used for the final product
    document_frequency = (term_frequency > 0).sum(axis=0) # sum over all documents

    # inverse document frequency (idf) is the number of documents in the corpus (here X.shape[0]),
    # divided by the number of documents where a word appears;
    # n-grams that appear in no document keep an idf of 0 to avoid a division by zero
    inverse_document_frequency = np.zeros(shape=document_frequency.shape)
    occurring = document_frequency > 0
    inverse_document_frequency[occurring] = 1 + np.log(X.shape[0] / document_frequency[occurring])

    # broadcasting applies the idf vector elementwise to every document (row)
    return term_frequency * inverse_document_frequency

In [None]:
def build_vocabulary(df, feature_size, n_gram_size):
    """
    Extract the n-gram vocabulary and the occurrence matrix from the processed texts.

    Args:
    df: (dataframe) Corpus with the processed texts available as column 'processed_text'.
    feature_size: (int) Passed as max_features to the CountVectorizer.
    n_gram_size: (int) Only n-grams of exactly this size are extracted.

    Returns:
    feature_vocabulary: (list) Feature names in the column order of the feature matrix.
    feature_matrix: (numpy array) Dense result of fitting the CountVectorizer to the processed texts.
    """
    corpus = df['processed_text'].values
    vectorizer = CountVectorizer(max_features=feature_size, ngram_range=(n_gram_size, n_gram_size))
    feature_matrix = vectorizer.fit_transform(corpus)

    # get_feature_names was removed in scikit-learn 1.2, older versions do not offer get_feature_names_out
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_vocabulary = vectorizer.get_feature_names_out().tolist() # feature order
    else:
        feature_vocabulary = vectorizer.get_feature_names() # feature order
    return feature_vocabulary, feature_matrix.toarray()

In [None]:
def build_feature(df, feature_size, n_gram_size):
    """
    Build the features necessary for machine learning.

    Args:
    df: (dataframe) Corpus that is handed over to build_vocabulary, which reads the texts from the
    column 'processed_text'.
    feature_size: (int) Number of n-grams extracted from the corpus, handed over to build_vocabulary.
    n_gram_size: (int) Defines the n-gram size (contiguous sequence of items with length n).

    Returns:
    feature_vocabulary: (list of n-grams) The vocabulary returned by build_vocabulary.
    feature_occurance: (numpy array) The occurrence matrix of the n-grams returned by build_vocabulary,
    its rows are treated as documents and its columns as n-grams.
    feature_statistics: (list of tuples) Pairs of (summed occurrence, n-gram), sorted in descending order.
    term_freq: (numpy array) Term frequency matrix.
    term_freq_inverse_doc_freq: (numpy array) Matrix according to the term frequency inverse document
    frequency model.
    """
    vocabulary_list, occurrence_matrix = build_vocabulary(df, feature_size, n_gram_size)

    # all remaining features are derived from the occurrence matrix
    return (
        vocabulary_list,
        occurrence_matrix,
        count_features(occurrence_matrix, vocabulary_list),
        calculate_term_frequency(occurrence_matrix),
        calculate_term_frequency_inverse_document_frequency(occurrence_matrix),
    )

# Test case

Define a test case with the 10 most frequent words in the corpus and an n-gram size of 1.

In [None]:
# Build the features for the test case: feature_size=10 and n_gram_size=1.
vocabulary, occurance, statistics, term_freq, term_freq_inverse_doc_freq = build_feature(corpus, 10, 1)

In [None]:
# Show the (summed occurrence, n-gram) pairs of the test case, sorted in descending order.
statistics

# Create feature datasets

In [None]:
def data_split_and_save(df, data_prefix, tf, tf_idf, number_documents, number_features, n_gram_size):
    """
    Perform training, test and validation split and save every split as a CSV file.

    Training is used to train the model, test is used for hyperparameter tuning and validation is the final
    validation of the model. Splitting is done according to 60% training data, 20% test data and 20% validation
    data. The split is performed once for the tf matrix and once for the tf-idf matrix with the same random state.

    Args:
    df: (dataframe) Corpus with the class labels available as column 'class'.
    data_prefix: (str) Folder in which the dataset folders are created.
    tf: (numpy array) Term frequency matrix.
    tf_idf: (numpy array) Term frequency inverse document frequency matrix.
    number_documents: (int) Number of documents used for the folder name, 0 is written as 'all'.
    number_features: (int) Number of features used for the folder name.
    n_gram_size: (int) N-gram size used for the folder name.

    Returns:
    None. The files train.csv, test.csv and validation.csv are written without header and index into the folders
    train-*, test-* and validation-* below data_prefix, the class label is the first column. In the folder name
    the feature type (tf or tf-idf) is followed by three numbers: the first number defines the number of
    documents (rows), the second number defines the number of features (columns), and the third number defines
    the n-gram.
    """

    # indicate all entries in path with all
    documents_label = 'all' if number_documents == 0 else number_documents

    # the class labels are identical for both feature types, therefore extract them only once
    y = df['class'].values # numpy

    feature_matrices = [('tf', tf), ('tf-idf', tf_idf)]

    for feature, X in feature_matrices:
        print('split and save {}-{}-{}-{}...'.format(feature, documents_label, number_features, n_gram_size), end='')

        # first split 60% as training data, rest 40% as test which will be split later 20% test, 20% validation
        X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.60, random_state=1)

        # 50% / 50% split validation and test data
        X_test, X_validation, y_test, y_validation = train_test_split(X_rest, y_rest, train_size=0.50, random_state=1)

        splits = [
            ('train', X_train, y_train),
            ('test', X_test, y_test),
            ('validation', X_validation, y_validation),
        ]

        for split_name, X_split, y_split in splits:
            data_path = '{}/{}-{}-{}-{}-{}'.format(
                data_prefix, split_name, feature, documents_label, number_features, n_gram_size)

            # makedirs also creates a missing data_prefix folder and accepts an already existing folder
            os.makedirs(data_path, exist_ok=True)

            split_df = pd.concat([pd.DataFrame(y_split), pd.DataFrame(X_split)], axis=1)
            split_df.to_csv('{}/{}.csv'.format(data_path, split_name), index=False, header=False)

        print('done')

In [None]:
def create_feature_data(df, data_prefix, number_documents, number_features, n_gram_size):
    """
    Build the features for (a part of) the corpus and hand them over to data_split_and_save.

    Args:
    df: (dataframe) Complete corpus.
    data_prefix: (str) Handed over to data_split_and_save.
    number_documents: (int) Number of leading documents that are analyzed, if 0 all data is used.
    number_features: (int) Handed over to build_feature as feature size.
    n_gram_size: (int) Handed over to build_feature as n-gram size.
    """
    # 0 selects the complete corpus, any other value only the leading rows
    selected_rows = df if number_documents == 0 else df.iloc[:number_documents]
    subset = selected_rows.copy()

    # only the tf and tf-idf matrices are handed over for saving
    _, _, _, tf, tf_idf = build_feature(subset, number_features, n_gram_size)
    data_split_and_save(subset, data_prefix, tf, tf_idf, number_documents, number_features, n_gram_size)

In [None]:
# Create the datasets of the first 5000 documents for every combination of n-gram size and feature size.
for n_gram in (1, 2):
    for feature_count in (500, 1000, 5000):
        create_feature_data(corpus, 'data', 5000, feature_count, n_gram)