# Feature engineering

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# parameters
# Directory (relative path) that holds both the input and the output CSV files.
data_path = 'data'
# Input file name; it is read from data_path into `document`.
data_set = 'clean_document.csv'
# Output file name for the term-frequency table; it is written under data_path.
term_frequency_data_set = 'term_frequency.csv'

In [None]:
# Load the input CSV from <data_path>/<data_set>.
# Later cells read its 'processed_text' and 'class' columns.
document = pd.read_csv('{}/{}'.format(data_path, data_set))
document.head()  # preview the first rows

In [None]:
def order_vocabulary(values, keys):
    """Pair each value with its key and return the pairs in descending order.

    The result is a list of ``(value, key)`` tuples. Tuples are compared as a
    whole, so ties on the value are broken by the key (also descending).
    If the inputs differ in length, the extra items of the longer one are
    ignored.
    """
    pairs = list(zip(values, keys))
    pairs.sort(reverse=True)
    return pairs

In [None]:
def calculate_term_frequency(X):
    """Normalise each row of a count matrix so that it sums to 1.

    Parameters
    ----------
    X : array-like of shape (n_documents, n_terms)
        Raw term counts, one row per document. It is not modified.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``. Rows whose total is not
        positive are returned unchanged instead of being divided by zero.

    Raises
    ------
    ValueError
        If ``X`` is non-empty and not two-dimensional.
    """
    counts = np.asarray(X, dtype=float)
    if counts.size == 0:
        # Nothing to normalise; keep the (empty) shape that was passed in.
        return counts.copy()
    if counts.ndim != 2:
        raise ValueError(
            'X must be 2-dimensional, got {} dimension(s)'.format(counts.ndim)
        )

    row_totals = counts.sum(axis=1, keepdims=True)
    # Start from a copy so rows skipped by `where` keep their original values.
    frequencies = counts.copy()
    np.divide(counts, row_totals, out=frequencies, where=row_totals > 0)
    return frequencies

In [None]:
def build_vocabulary(df, feature_size):
    """Fit a bag-of-words vocabulary on ``df['processed_text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'processed_text'`` column with one document per row.
    feature_size : int
        Passed to ``CountVectorizer(max_features=...)`` to cap the vocabulary.

    Returns
    -------
    tuple
        ``(feature_vocabulary, counts)``: a list of terms in column order,
        and the dense document-term count matrix from the vectorizer.
    """
    # Missing documents (NaN) make CountVectorizer raise; treat them as empty
    # text so they simply produce an all-zero row.
    # NOTE(review): NaN presumably comes from empty CSV fields - confirm.
    corpus = df['processed_text'].fillna('').values
    vectorizer = CountVectorizer(max_features=feature_size)
    X = vectorizer.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # Convert to a list so the return type matches the legacy API.
        feature_vocabulary = vectorizer.get_feature_names_out().tolist()
    else:
        feature_vocabulary = vectorizer.get_feature_names()  # feature order
    return feature_vocabulary, X.toarray()

In [None]:
# Fit the vocabulary (capped at 500 terms) and get the dense count matrix
# for the 'processed_text' column of `document`.
feature_vocabulary, feature_occurance = build_vocabulary(document, 500)

In [None]:
# Sum the counts down each column (axis=0) to get one total per term, then
# list the (total, term) pairs from highest to lowest.
sorted_vocabulary = order_vocabulary(feature_occurance.sum(axis=0), feature_vocabulary)
sorted_vocabulary  # display the ranked vocabulary

In [None]:
# Turn the raw counts into per-row relative frequencies (still a NumPy array here).
term_frequency = calculate_term_frequency(feature_occurance)

# Build term frequency dataset

In [None]:
# Wrap the frequency matrix in a DataFrame with one column per vocabulary term.
temp_term_frequency = pd.DataFrame(term_frequency, columns=feature_vocabulary)
temp_term_frequency.head()  # preview the first rows

In [None]:
# Put the 'class' column of `document` in front of the term columns; concat
# with axis=1 aligns the two pieces on their index.
# NOTE: this rebinds `term_frequency` from the NumPy array to a DataFrame, so
# re-running the earlier DataFrame-building cell afterwards sees the new value.
term_frequency = pd.concat([document['class'], temp_term_frequency], axis=1)
term_frequency.head()  # preview the first rows

In [None]:
def save_document(df, data_path, data_set):
    """Write the de-duplicated rows of ``df`` to ``<data_path>/<data_set>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist. It is left unmodified.
    data_path : str
        Directory to write into; it must already exist.
    data_set : str
        Name of the CSV file to create inside ``data_path``.

    The file is written without the index column, and a confirmation line
    is printed once the write has finished.
    """
    # drop_duplicates() without inplace=True returns a new frame, so the
    # caller's DataFrame is not silently changed as a side effect of saving.
    deduplicated = df.drop_duplicates()
    deduplicated.to_csv('{}/{}'.format(data_path, data_set), index=False)
    print('{} saved'.format(data_set))

In [None]:
# Persist the class + term-frequency table to <data_path>/<term_frequency_data_set>.
save_document(term_frequency, data_path, term_frequency_data_set)