# Feature engineering

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# parameters
data_path = 'data'
data_set = 'clean_document.csv'
term_frequency_data_set = 'term_frequency.csv'

In [18]:
complete_corpus = pd.read_csv('{}/{}'.format(data_path, data_set))
complete_corpus.head()

Unnamed: 0,class,processed_text
0,0,wto chief debat trump ralli support trade head...
1,0,catalonia find friend among eu leader european...
2,1,shock fed govern grant disabl statu benefit sp...
3,1,climat scammer al gore utterli embarrass expla...
4,1,lol leftist ca congresswoman tonight debat deb...


In [19]:
def count_features(values, keys):
    summed_values = values.sum(axis=0)
    zipped_dictionary = zip(summed_values, keys)
    sorted_zipped_dictionary = sorted(zipped_dictionary, reverse=True)
    return sorted_zipped_dictionary

In [75]:
def calculate_term_frequency(X):
    term_frequency_array = np.zeros(shape=X.shape)
    iteration = 0
    for array in X:
        if array.sum() > 0:
            term_frequency_array[iteration] = array / array.sum()
        else:
            term_frequency_array[iteration] = array
        iteration += 1
    return term_frequency_array        

In [125]:
def calculate_term_frequency_inverse_document_frequency(X):
    inverse_document_frequency = np.zeros(shape=X.shape)
    term_frequency = calculate_term_frequency(X)
    corpus_sum = term_frequency[term_frequency > 0] = 1 # count an n-gram only once per document
    inverse_document_frequency_single_array = 1 + np.log(corpus_sum.sum(axis=0)) # sum over all documents
    iteration = 0
    for array in X:
        inverse_document_frequency[iteration] = inverse_document_frequency_single_array
        iteration += 1
        
    term_frequency_inverse_document_frequency = term_frequency * inverse_document_frequency
    return term_frequency_inverse_document_frequency

In [85]:
def build_vocabulary(df, feature_size, n_gram_size):
    corpus = df['processed_text'].values
    vectorizer = CountVectorizer(max_features=feature_size, ngram_range=(n_gram_size, n_gram_size))
    feature_matrix = vectorizer.fit_transform(corpus)
    feature_vocabulary = vectorizer.get_feature_names() # feature order
    return feature_vocabulary, feature_matrix.toarray()

In [126]:
def build_feature(df, feature_size, n_gram_size):
    """
    Build the features necessary for machine learning. 
    
    Args:
    df: (dataframe) Corpus with processed texts saved as feature 'processed_text' and its corresponding class 
    available as 'class'.
    feature_size: (int) Defines the number of top n-grams extracted from the corpus.
    n_gram_size: (int) Defines the n-gram size (contiguous sequence of items with length n).
    
    Returns:
    feature_vocabulary: (list of n-grams) The vocabulary with the top feature_size n-grams.
    feature_occurance: (numpy array) The occurance matrix of n-grams, number of rows = feature_size, 
    number of columns = corpus length.
    feature_statistics: (dictionary) Dictionary with size feature_size sorted according to top n-gram.
    term_freq: (numpy array) Term frequency matrix
    term_freq_inverse_doc_freq: (numpy array) Maxrix according to term frequency inverse document frequency model.
    """
    feature_vocabulary, feature_occurance = build_vocabulary(complete_corpus, feature_size, n_gram_size)
    feature_statistics = count_features(feature_occurance, feature_vocabulary)
    term_freq = calculate_term_frequency(feature_occurance)
    term_freq_inverse_doc_freq = calculate_term_frequency_inverse_document_frequency(feature_occurance)
    return feature_vocabulary, feature_occurance, feature_statistics, term_freq, term_freq_inverse_doc_freq

# Test case

Define a test case with 10 most used words in the corups and a n-gram of 1.

In [127]:
vocabulary, occurance, statistics, term_freq, term_freq_inverse_doc_freq = build_feature(complete_corpus, 10, 1)

AttributeError: 'int' object has no attribute 'sum'

In [115]:
vocabulary

['clinton',
 'one',
 'peopl',
 'presid',
 'said',
 'say',
 'state',
 'trump',
 'would',
 'year']

In [116]:
occurance

array([[ 0,  0,  3, ...,  4,  5,  4],
       [ 0,  1,  0, ...,  0,  0,  0],
       [ 0,  2,  0, ...,  0,  1,  1],
       ...,
       [ 0,  0,  8, ...,  0,  1,  1],
       [ 0,  1,  0, ..., 11,  0,  0],
       [ 0,  3,  3, ...,  0,  3,  1]])

In [117]:
statistics

[(649, 'trump'),
 (586, 'said'),
 (272, 'would'),
 (270, 'state'),
 (221, 'presid'),
 (202, 'year'),
 (193, 'clinton'),
 (191, 'peopl'),
 (185, 'one'),
 (174, 'say')]

In [118]:
term_freq

array([[0.        , 0.        , 0.10714286, ..., 0.14285714, 0.17857143,
        0.14285714],
       [0.        , 0.125     , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.25      , 0.        , ..., 0.        , 0.125     ,
        0.125     ],
       ...,
       [0.        , 0.        , 0.44444444, ..., 0.        , 0.05555556,
        0.05555556],
       [0.        , 0.03448276, 0.        , ..., 0.37931034, 0.        ,
        0.        ],
       [0.        , 0.11111111, 0.11111111, ..., 0.        , 0.11111111,
        0.03703704]])

In [119]:
idf_matrix = term_freq

In [120]:
idf_matrix[idf_matrix > 0] = 1

In [121]:
idf = 1 + np.log(idf_matrix.sum(axis=0))

In [122]:
idf

array([4.09104245, 5.52178858, 5.40671925, 5.62497281, 6.03043792,
       5.61512052, 5.59511985, 5.58496748, 5.56434819, 5.57471098])

# Build term frequency dataset

In [None]:
temp_term_frequency = pd.DataFrame(term_frequency, columns=feature_vocabulary)
temp_term_frequency.head()

In [None]:
term_frequency = pd.concat([document['class'], temp_term_frequency], axis=1)
term_frequency.head()

In [None]:
def save_document(df, data_path, data_set):
    df.drop_duplicates(inplace=True)
    df.to_csv('{}/{}'.format(data_path, data_set), index=False)
    print('{} saved'.format(data_set))

In [None]:
save_document(term_frequency, data_path, term_frequency_data_set)