# Feature Engineering

In [None]:
from utilities.data_factory import *
from utilities.models import *
from utilities.pca_helper import *
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.ticker
import seaborn as sns

In [None]:
# Load the 0050 dataset as the worked example for the cells that follow.
example_file_name = 'step1_0050.csv.pklz'
df = read_one_file(input_file_name=example_file_name)
# Column selectors supplied by the project helpers (presumably lists of the
# raw label / raw feature column names -- confirm in utilities.data_factory).
labels_col = get_raw_labels()
features_col = get_raw_features()

___

### Correlation Analysis

In [None]:
# Pairwise correlation of the selected feature columns, rounded to four
# decimals and drawn as a heatmap whose colour scale is centred on zero.
features_df = df[features_col].copy()
corr_df = features_df.corr().round(4)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(30, 15), dpi=80)
sns.heatmap(
    corr_df,
    annot=False,  # cell values are not drawn on the map
    fmt='.4f',
    center=0,
    annot_kws={"fontsize": 2},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Features Correlation')
heatmap_fig.tight_layout()
plt.show()

___

### PCA Stability Analysis

In [None]:
# Grid of subplots: one row per input file, one column per interval.
# Each panel plots the 'n_components' column of the stability result against
# its 'start_date' column (presumably the number of principal components
# needed to reach `target_thres` explained variance -- confirm in pca_helper).
interval_list = [5, 10, 20, 40]
files_list = ['step1_0050.csv.pklz', 'step1_2330.csv.pklz', 'step1_2603.csv.pklz']
target_thres = 0.99
res_dict = {}
fig = plt.figure(figsize=(40, 20), dpi=80)
n_rows, n_cols = len(files_list), len(interval_list)
for row_idx, one_file in enumerate(files_list):
    df = DataFactory(input_file_name=one_file).df
    per_file_results = res_dict[one_file] = {}
    for col_idx, one_interval in enumerate(interval_list):
        res_df = get_pca_components_stability(
            input_df=df,
            input_interval=one_interval,
            input_thres=target_thres,
        )
        per_file_results[one_interval] = res_df
        log_info(f'Got all result of {one_file} - {one_interval} - {target_thres}')
        # Subplot positions are 1-based and filled row by row.
        ax = fig.add_subplot(n_rows, n_cols, row_idx * n_cols + col_idx + 1)
        ax.plot(res_df['start_date'], res_df['n_components'])
        ax.set_title(f'{one_file} | {one_interval} days | {target_thres * 100}% PCA Components #', fontsize=14)
        # Keep the x axis readable: at most 5 bins, with the edge ticks pruned.
        ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(prune='both', nbins=5))
        ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# One subplot per input file: plot the 'PC1'..'PC<n>' columns and the 'total'
# column of the explained-variance stability result against 'test_date', for
# a fixed interval and a fixed number of principal components.
one_interval = 7
files_list = ['step1_0050.csv.pklz', 'step1_2330.csv.pklz', 'step1_2603.csv.pklz']
target_n_components = 10
res_dict = {}
fig = plt.figure(figsize=(40, 30), dpi=80)
for subplot_idx, one_file in enumerate(files_list, start=1):
    df = DataFactory(input_file_name=one_file).df
    res_df = get_pca_explained_var_stability(input_df=df,
                                             input_interval=one_interval,
                                             input_n_components=target_n_components)
    # res_dict starts empty, so the per-file dict must be created before it
    # can be indexed by interval (plain double indexing raises KeyError).
    res_dict.setdefault(one_file, {})[one_interval] = res_df
    # Report the parameter this cell actually uses (the component count),
    # not the variance threshold left over from a different cell.
    log_info(f'Got all result of {one_file} - {one_interval} - {target_n_components}')
    # Keep the Axes handle: the tick locator below must be set on the Axes,
    # the pyplot module itself has no `xaxis` attribute.
    ax = fig.add_subplot(len(files_list), 1, subplot_idx)
    # target_n_components is an int, so iterate over range(n) directly.
    for i in range(target_n_components):
        ax.plot(res_df['test_date'], res_df[f'PC{i + 1}'], label=f'PC{i + 1}')
    ax.plot(res_df['test_date'], res_df['total'], label='Total')
    ax.set_title(f'{one_file} | {one_interval} days | {target_n_components} PCA Components Explained Variance', fontsize=14)
    # Keep the x axis readable: at most 5 bins, with the edge ticks pruned.
    locator = matplotlib.ticker.MaxNLocator(prune='both', nbins=5)
    ax.xaxis.set_major_locator(locator)
    ax.grid(True)
    # Every line carries a label; show them so the curves can be told apart.
    ax.legend()
plt.tight_layout()
plt.show()