# Feature engineering

In [69]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# parameters: location of the input CSV (joined as '<data_path>/<data_set>' when loading)
data_path = 'data'  # directory that holds the data set
data_set = 'clean_document.csv'  # file name of the data set inside data_path

In [5]:
# Read the data set into a DataFrame and preview its first rows.
document = pd.read_csv(f'{data_path}/{data_set}')
document.head()

In [76]:
def order_vocabulary(dictionary):
    """Return the ``(value, key)`` pairs of *dictionary* in descending order.

    Pairs are compared as whole tuples, so entries with equal values are
    ordered by key (also descending).
    """
    value_key_pairs = [(value, key) for key, value in dictionary.items()]
    value_key_pairs.sort(reverse=True)
    return value_key_pairs

In [78]:
def calculate_term_frequency(X):
    """Normalize every row of a count matrix so that it sums to 1.

    Each row is divided by its own sum. Rows whose sum is not positive
    (e.g. a document containing none of the selected terms) are left as
    they are instead of being divided by zero.

    Parameters
    ----------
    X : array-like, 2-D
        One row per document, one column per term (dense counts).

    Returns
    -------
    numpy.ndarray
        A new float array with the same shape as ``X``; the input is not
        modified.
    """
    # Always work on a float copy: keeps the caller's data untouched and
    # gives a consistent dtype even when every row is all zeros.
    frequencies = np.array(X, dtype=float)
    if frequencies.size == 0:
        # Nothing to normalize; return as-is so the input shape is preserved.
        return frequencies

    row_totals = frequencies.sum(axis=1, keepdims=True)
    # Divide in place, but only the rows with a positive total; the other
    # rows keep their original values.
    np.divide(frequencies, row_totals, out=frequencies, where=row_totals > 0)
    return frequencies

In [81]:
def build_vocabulary(df, feature_size):
    """Fit a bag-of-words vocabulary on ``df['processed_text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``processed_text`` column with one document per row.
    feature_size : int
        Passed to ``CountVectorizer(max_features=...)`` to cap the number
        of terms kept in the vocabulary.

    Returns
    -------
    sorted_vocabulary : list of (int, str)
        ``(column index, term)`` pairs, highest index first.
        NOTE: the integer is the term's column index in the count matrix
        (``CountVectorizer.vocabulary_``), not how often the term occurs,
        so this ordering does not rank terms by importance.
    feature_vocabulary : list of str
        Terms in column order of ``term_frequency``.
    term_frequency : numpy.ndarray
        Dense document-term matrix with each row normalized by
        ``calculate_term_frequency``.
    """
    corpus = df['processed_text'].values
    vectorizer = CountVectorizer(max_features=feature_size)
    X = vectorizer.fit_transform(corpus)

    # vocabulary_ maps term -> column index; sort the pairs by that index.
    sorted_vocabulary = order_vocabulary(vectorizer.vocabulary_)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available and keep returning a plain list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        feature_vocabulary = vectorizer.get_feature_names()

    term_frequency = calculate_term_frequency(X.toarray())
    return sorted_vocabulary, feature_vocabulary, term_frequency

In [82]:
# Build the vocabulary features with the vocabulary capped at 20 terms.
sorted_vocabulary, feature_vocabulary, term_frequency = build_vocabulary(document, 20)

In [83]:
# (column index, term) pairs, highest index first; the integers are positions, not counts.
sorted_vocabulary

[(19, 'year'),
 (18, 'would'),
 (17, 'trump'),
 (16, 'told'),
 (15, 'state'),
 (14, 'say'),
 (13, 'said'),
 (12, 'republican'),
 (11, 'report'),
 (10, 'presid'),
 (9, 'peopl'),
 (8, 'one'),
 (7, 'new'),
 (6, 'like'),
 (5, 'law'),
 (4, 'hous'),
 (3, 'govern'),
 (2, 'donald'),
 (1, 'clinton'),
 (0, 'also')]

In [84]:
# Terms in feature (column) order.
feature_vocabulary

['also',
 'clinton',
 'donald',
 'govern',
 'hous',
 'law',
 'like',
 'new',
 'one',
 'peopl',
 'presid',
 'report',
 'republican',
 'said',
 'say',
 'state',
 'told',
 'trump',
 'would',
 'year']

In [87]:
# One row per document; each row is divided by its own sum (all-zero rows are left as zeros).
term_frequency

array([[0.03030303, 0.        , 0.03030303, ..., 0.12121212, 0.15151515,
        0.12121212],
       [0.        , 0.        , 0.0625    , ..., 0.        , 0.        ,
        0.        ],
       [0.07142857, 0.        , 0.        , ..., 0.        , 0.07142857,
        0.07142857],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.04545455,
        0.04545455],
       [0.        , 0.        , 0.03225806, ..., 0.35483871, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.09677419,
        0.03225806]])