# Visualization

This notebook visualizes the results of the weekly analysis. For XGBoost and RandomForest models, it additionally visualizes the feature importances.

In [None]:
import seaborn as sns
import pickle
import numpy as np
import pandas as pd
import importlib, sys
import os
import matplotlib.pyplot as plt
# Step up one directory before the `src` imports below. Presumably the notebook
# lives in a subfolder of the project root, which the relative `data/...` paths
# used later would also rely on — TODO confirm.
os.chdir('..')
from src.train_Ensemble import filter_by_feature_type
from src.visualize import plot_recall_at_k, plot_feature_importances, convert_clf_dict_to_df, \
                                          extract_base_clf_probs, calculate_recalls_at_k, plot_weekly_analysis, \
                                          extract_feature_importances, aggregate_feature_importances, \
                                          plot_aggregated_over_weeks, calculate_weight_ratio, plot_ratio

### Load

In [None]:
# Load the pickled results.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only open result and feature-matrix files that come from a trusted source.
result_path = 'data/results/'
file_name = 'results.pickle'
with open(result_path + file_name, 'rb') as f:
    classifier_dict = pickle.load(f)
# The run configuration is read from the entry with the smallest key.
config = classifier_dict[min(classifier_dict.keys())]['config']
# Pick the feature matrix file: the LSTM model reads the preprocessed matrix,
# every other model reads the raw one.
if config['model_name'] == 'LSTM':
    feature_matrix_path, feature_matrix_file_name = 'data/processed/', 'feature_matrix_prep.pickle'
else:
    feature_matrix_path, feature_matrix_file_name = 'data/raw/', 'feature_matrix.pickle'
with open(feature_matrix_path + feature_matrix_file_name, 'rb') as f:
    feature_matrix = pickle.load(f)
# Output directory handed to the plotting helpers in the cells below.
path = 'data/visualizations/'
# Non-LSTM models only use the columns matching the configured feature type.
if config['model_name'] != 'LSTM':
    feature_matrix = filter_by_feature_type(feature_matrix, feature_type=config['feature_type'])

### Top probabilities

In [None]:
# Turn classifier_dict into a DataFrame holding the listed metrics.
metrics = ['y_test', 'y_prob', 'confusion_matrix', 'info', 'model']
df_data = convert_clf_dict_to_df(classifier_dict, feature_matrix, metrics, config)

# The late-fusion steps apply only to non-LSTM models; evaluate the condition
# once and reuse it for both the probability extraction and the ratio plot.
uses_late_fusion = config['late_fusion_flag'] and config['model_name'] != 'LSTM'

# With late fusion, also pull out the probabilities of the base classifiers.
if uses_late_fusion:
    df_data = extract_base_clf_probs(df_data)

# Recall at k, computed with k_max=150.
recalls, num_of_samples, list_k = calculate_recalls_at_k(df_data, k_max=150)

# Sum of the '# of escalation flags' column, passed to the recall plot.
total_escalation_flags = int(df_data['# of escalation flags'].sum())
plot_recall_at_k(
    recalls,
    list_k,
    num_of_samples,
    total_escalation_flags,
    title="Recall at k",
    path=path,
)

# Ratio plot (late fusion only).
if uses_late_fusion:
    plot_ratio(df_data, path)

# Plot the weekly analysis once per value of k.
for k in (5, 10, 20, 50, 100):
    plot_weekly_analysis(df_data, "Weekly Analysis", k, path=path)

### Feature importances

In [None]:
# Feature importances are only extracted and plotted for non-LSTM models.
if config['model_name'] != 'LSTM':
    # Importances extracted from the classifiers.
    # Shape per the original notes: (num_weeks, num_feat).
    feature_importances = extract_feature_importances(
        df_data, feature_matrix, classifier_dict, config
    )
    # Aggregated importances used for the over-weeks plot.
    # Shape per the original notes: (num_feat per week, num_weeks).
    feature_importances_over_weeks = aggregate_feature_importances(
        feature_importances, feature_matrix
    )

    # Columns of the feature matrix that are dropped before listing features.
    list_not_feat_cols = ['pred_time', 'escalation_flag', 'customer']
    # Remaining column names, i.e. the features (num_features).
    features = feature_matrix.drop(columns=list_not_feat_cols).columns

    # Unique feature names after cutting the last five characters of each
    # column name (num_feat per week, per the original notes).
    trimmed_feature_names = [feat[:-5] for feat in features]
    uniq_features = np.unique(trimmed_feature_names)

    # Mean importance along axis 0, giving one value per feature (num_feat).
    feature_importances_over_feat = np.mean(feature_importances, axis=0)

    # Plot the mean importance of every feature.
    plot_feature_importances(
        feature_importances_over_feat,
        features,
        title='Feature importances aggregated over types | Mean',
        path=path,
    )
    # Plot the aggregated importances over the weeks.
    plot_aggregated_over_weeks(
        df_data, feature_importances_over_weeks, uniq_features, path
    )