# Feature engineering

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Parameters: file locations used by the cells below
data_path = 'data'  # directory the CSV files are read from / written to
data_set = 'clean_document.csv'  # input corpus, loaded with pd.read_csv below
term_frequency_data_set = 'term_frequency.csv'  # output file name handed to save_document

In [None]:
# Load the corpus from <data_path>/<data_set> and preview its first rows.
complete_corpus = pd.read_csv('{}/{}'.format(data_path, data_set))
complete_corpus.head()

In [None]:
def count_features(values, keys):
    """
    Rank features by their total count over all rows.

    Args:
    values: (numpy array) Count matrix; the counts are summed along axis 0 (one total per column).
    keys: (sequence) Feature names, one per column of values.

    Returns:
    (list of tuples) (total, key) pairs sorted in descending order, i.e. by total first and by key
    for equal totals.
    """
    ranked = [(total, key) for total, key in zip(values.sum(axis=0), keys)]
    ranked.sort(reverse=True)
    return ranked

In [None]:
def calculate_term_frequency(X):
    """
    Normalise every row of a count matrix by its row sum (term frequency).

    Args:
    X: (2-D numpy array or array-like) Occurrence counts, one row per document and one column per term.

    Returns:
    (numpy array of floats, same shape as X) Each row divided by its own sum. Rows whose sum is not
    greater than zero (e.g. documents containing none of the terms) are returned unchanged, which
    avoids a division by zero.
    """
    counts = np.asarray(X)
    row_totals = counts.sum(axis=1, keepdims=True)

    # Start from a float copy of the input so rows that are skipped by `where` keep their values.
    term_frequency_array = counts.astype(float)
    np.divide(counts, row_totals, out=term_frequency_array, where=row_totals > 0)
    return term_frequency_array

In [None]:
def test_calculate_term_frequency():
    """
    Check calculate_term_frequency against hand-computed values.

    The last row is all zeros to cover the branch for documents without any counted term.
    Raises an AssertionError if the result deviates from the expected matrix.
    """
    X = np.array([[4, 2, 7, 4, 9, 1, 0],
                  [6, 3, 2, 0, 0, 8, 1],
                  [0, 0, 0, 0, 0, 0, 0]])
    Y = calculate_term_frequency(X)
    Y_result = np.array([[4/27, 2/27, 7/27, 4/27, 9/27, 1/27, 0/27],
                         [6/20, 3/20, 2/20, 0/20, 0/20, 8/20, 1/20],
                         [0, 0, 0, 0, 0, 0, 0]])
    assert Y.shape == Y_result.shape, 'test_calculate_term_frequency: check of test failed'
    # Compare with a tolerance: exact equality on floats breaks on harmless rounding differences.
    assert np.allclose(Y, Y_result), 'test_calculate_term_frequency: check of test failed'
    print('all test_calculate_term_frequency tests passed')

In [None]:
# Run the check for calculate_term_frequency; it raises an AssertionError on failure.
test_calculate_term_frequency()

In [None]:
def calculate_term_frequency_inverse_document_frequency(X):
    """
    Weight the term frequencies of a count matrix by the inverse document frequency (tf-idf).

    Args:
    X: (2-D numpy array) Occurrence counts, one row per document and one column per term.

    Returns:
    (numpy array of floats, same shape as X) tf * idf, where tf comes from calculate_term_frequency
    and idf = 1 + log(number of documents / number of documents containing the term).
    """
    term_frequency = calculate_term_frequency(X)
    number_of_documents = term_frequency.shape[0]

    # Count an n-gram only once per document. The comparison creates a new boolean array, so the
    # term frequency matrix itself is not modified and can still be used for the final product.
    document_frequency = (term_frequency > 0).sum(axis=0)

    # Terms that occur in no document would cause a division by zero. Their term frequency is zero
    # everywhere, so the idf value chosen for them (1) does not influence the result.
    inverse_document_frequency = np.ones(document_frequency.shape)
    seen = document_frequency > 0
    inverse_document_frequency[seen] += np.log(number_of_documents / document_frequency[seen])

    # Broadcasting applies the per-term idf vector to every document row.
    return term_frequency * inverse_document_frequency

In [None]:
def build_vocabulary(df, feature_size, n_gram_size):
    """
    Extract the n-gram vocabulary and the occurrence matrix from a corpus.

    Args:
    df: (dataframe) Corpus with the processed texts in column 'processed_text'.
    feature_size: (int) Passed to CountVectorizer as max_features.
    n_gram_size: (int) Only n-grams of exactly this size are extracted.

    Returns:
    feature_vocabulary: (list of n-grams) Feature names in the column order of the matrix.
    feature_matrix: (numpy array) Dense occurrence counts produced by the vectorizer.
    """
    corpus = df['processed_text'].values
    vectorizer = CountVectorizer(max_features=feature_size, ngram_range=(n_gram_size, n_gram_size))
    feature_matrix = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available
    # and convert the returned array to a list so the return type stays the same.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_vocabulary = vectorizer.get_feature_names_out().tolist() # feature order
    else:
        feature_vocabulary = vectorizer.get_feature_names() # feature order
    return feature_vocabulary, feature_matrix.toarray()

In [None]:
def build_feature(df, feature_size, n_gram_size):
    """
    Build the features necessary for machine learning. 
    
    Args:
    df: (dataframe) Corpus with processed texts saved as feature 'processed_text' and its corresponding class 
    available as 'class'.
    feature_size: (int) Defines the number of top n-grams extracted from the corpus.
    n_gram_size: (int) Defines the n-gram size (contiguous sequence of items with length n).
    
    Returns:
    feature_vocabulary: (list of n-grams) The vocabulary with the top feature_size n-grams.
    feature_occurance: (numpy array) The occurrence matrix of n-grams, one row per document of the corpus 
    and one column per n-gram of the vocabulary.
    feature_statistics: (list of tuples) (total count, n-gram) pairs sorted in descending order of the count.
    term_freq: (numpy array) Term frequency matrix.
    term_freq_inverse_doc_freq: (numpy array) Matrix according to term frequency inverse document frequency model.
    """
    # Use the dataframe handed in by the caller, not a notebook-level global.
    feature_vocabulary, feature_occurance = build_vocabulary(df, feature_size, n_gram_size)
    feature_statistics = count_features(feature_occurance, feature_vocabulary)
    term_freq = calculate_term_frequency(feature_occurance)
    term_freq_inverse_doc_freq = calculate_term_frequency_inverse_document_frequency(feature_occurance)
    return feature_vocabulary, feature_occurance, feature_statistics, term_freq, term_freq_inverse_doc_freq

# Test case

Define a test case using the 10 most frequent words in the corpus and an n-gram size of 1.

In [None]:
# Build all feature matrices for the loaded corpus with feature_size=10 and n_gram_size=1.
vocabulary, occurance, statistics, term_freq, term_freq_inverse_doc_freq = build_feature(complete_corpus, 10, 1)

In [None]:
# Show the vocabulary returned by build_feature.
vocabulary

In [None]:
# Show the occurrence matrix returned by build_feature.
occurance

In [None]:
# Show the sorted (count, n-gram) statistics returned by build_feature.
statistics

In [None]:
# Show the term frequency matrix returned by build_feature.
term_freq

In [None]:
# Show the tf-idf matrix returned by build_feature.
term_freq_inverse_doc_freq

# Create datasets

In [None]:
# Wrap the term frequency matrix in a DataFrame with one column per n-gram.
# term_freq and vocabulary are the names assigned from build_feature above.
temp_term_frequency = pd.DataFrame(term_freq, columns=vocabulary)
temp_term_frequency.head()

In [None]:
# Put the 'class' column of the loaded corpus in front of the term frequency features.
# The two frames are aligned on their row index.
term_frequency = pd.concat([complete_corpus['class'], temp_term_frequency], axis=1)
term_frequency.head()

In [None]:
def save_document(df, data_path, data_set):
    """
    Save a dataframe without duplicated rows as a CSV file.

    Args:
    df: (dataframe) Data to save. The dataframe itself is not modified.
    data_path: (str) Directory the file is written to.
    data_set: (str) Name of the CSV file.
    """
    # drop_duplicates() returns a new dataframe, so the caller's data keeps all of its rows.
    deduplicated = df.drop_duplicates()
    deduplicated.to_csv('{}/{}'.format(data_path, data_set), index=False)
    print('{} saved'.format(data_set))

In [None]:
# Write the labelled term frequency table to <data_path>/<term_frequency_data_set>.
save_document(term_frequency, data_path, term_frequency_data_set)