In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import shap
import matplotlib.pyplot as plt
import pickle as pkl


### Load Data

Load csv.

Features are the emotions; the target variable is 'is_questionable'.

Retain only users with at least one emotion.

Drop rows where "is_questionable" = NA.

In [None]:
# Location of the per-user emotion table, relative to the notebook's directory.
usr_emo_lean_path = os.path.join("..", "..", "data", "processed", "usr_emo_lean.gz")

# Read the table, discard every row that has a missing value in ANY column,
# then keep only users with at least one emotion (n_emo > 0).
# NOTE(review): dropna() is not restricted to 'is_questionable' — confirm that
# dropping on all columns is intended.
ur_df = (
    pd.read_csv(usr_emo_lean_path)
    .dropna()
    .query('n_emo > 0')
)

Besides the thresholds, the summary contains:
- Number of accounts
- The percentage of 'Questionable' accounts
- Number of 'Questionable' accounts
- Number of correctly classified 'Questionable' accounts
- Number of accounts classified as 'Questionable'
- Precision and Recall

As the minimum number of comments decreases, the proportion of Questionable accounts decreases too,
and so do precision and recall.
The number of emotions involved affects precision and recall much less than the minimum number of comments does.

Browse the results directory and plot the bar and beeswarm plots for each experiment.

In [None]:
# Directory that holds one sub-directory per SHAP experiment.
shap_result_path = os.path.join("..","..","output","shap_results")

# Thresholds of the experiment to analyse: users with at least 8 comments
# and at least 1 emotion.
TARGET_MIN_COMMENTS = 8
TARGET_MIN_EMO = 1

for exp_dir in os.listdir(shap_result_path):
    if not os.path.isdir(os.path.join(shap_result_path, exp_dir)):
        continue
    print(exp_dir)
    if exp_dir == "plots":
        continue
    # Fields 0 and 3 of the underscore-separated directory name are parsed as
    # the minimum-comments and minimum-emotions thresholds of the experiment.
    name_parts = exp_dir.split('_')
    (min_comments, min_emo) = (int(name_parts[0]), int(name_parts[3]))
    if min_comments != TARGET_MIN_COMMENTS or min_emo != TARGET_MIN_EMO:
        continue
    # Rebuild the feature matrix and binary target for the selected thresholds.
    df = ur_df.query("n_comments >= @min_comments and n_emo >= @min_emo")
    Y = np.where(df['is_questionable'].astype(int)>0,1,0)
    X = df[['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']]
    # Load the SHAP values stored for the selected experiment.
    # NOTE(security): pickle.load can execute arbitrary code; only open
    # shap.pkl files produced by this project.
    with open(os.path.join(shap_result_path, exp_dir, 'shap.pkl'), 'rb') as f:
        my_shap_values = pkl.load(f)
    break
else:
    # Without this, a missing experiment would only surface later as a
    # NameError (or, worse, silently reuse stale kernel variables).
    raise FileNotFoundError(
        f"No SHAP experiment with min_comments={TARGET_MIN_COMMENTS} and "
        f"min_emo={TARGET_MIN_EMO} found under {shap_result_path!r}"
    )


In [None]:
# Feature importance: absolute SHAP values averaged over axis 1 and then
# summed over axis 0, leaving one value per feature.
# (presumably axis 0 indexes the classes and axis 1 the samples — TODO confirm)
importance_values = np.sum(np.mean(np.abs(my_shap_values), axis=1), axis=0)
print(importance_values)
print(X.columns)

# sort_values returns a new frame, so its result must be kept for the table
# (and the CSV written below) to be ordered by decreasing importance.
plot_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance_values})
    .sort_values('Importance', ascending=False)
)
print(plot_importance)
plot_importance.to_csv('../../output/plot_importance.csv', index=False)

In [None]:
# function that modifies SHAP summary's colors
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
def get_new_cmp(color01, color04):
    """Build a 256-step colormap that fades linearly from one colour to another.

    Args:
        color01: ``(R, G, B)`` triple on the 0-255 scale; first entry of the map.
        color04: ``(R, G, B)`` triple on the 0-255 scale; last entry of the map.

    Returns:
        A ``ListedColormap`` named ``'BlueRed'`` with 256 fully opaque RGBA rows
        interpolated channel by channel between the two colours.

    Raises:
        ValueError: if either colour does not have exactly three components.
    """
    RGB_val = 255  # scale of the incoming channel values
    n = 256        # number of entries in the resulting colormap

    # Unpacking enforces three components per colour; alpha is fixed at 1.
    r0, g0, b0 = color01
    r1, g1, b1 = color04
    start_rgba = np.array([r0 / RGB_val, g0 / RGB_val, b0 / RGB_val, 1.0])
    end_rgba = np.array([r1 / RGB_val, g1 / RGB_val, b1 / RGB_val, 1.0])

    # The gradient is computed directly with NumPy instead of going through
    # matplotlib.cm.get_cmap, which was removed in Matplotlib 3.9.
    # linspace with array endpoints interpolates every channel: shape (n, 4).
    gradient = np.linspace(start_rgba, end_rgba, n)

    return ListedColormap(gradient, name='BlueRed')


In [None]:
# choose colors
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
# Colour pickers for the two ends of the SHAP colour scale.
# The defaults are hex strings because hex_to_rgb only parses hexadecimal
# input: a named colour such as 'blue' makes the plotting cell fail when the
# notebook is run top to bottom without touching the widgets.
w_high = widgets.ColorPicker(
    concise=False,
    description='Pick a color',
    value='#071a5d',
    disabled=False
)
w_low = widgets.ColorPicker(
    concise=False,
    description='Pick a color',
    value='#cedfeb',
    disabled=False
)
def hex_to_rgb(value):
    """Convert a hexadecimal colour string into a tuple of integer channels.

    Args:
        value: Hex colour with or without leading ``#`` characters, e.g.
            ``'#071a5d'``. Three-digit shorthand (``'#fff'``) is expanded by
            doubling each digit. Longer strings are split into three
            equally wide channels.

    Returns:
        A 3-tuple of ints, one per channel.

    Raises:
        ValueError: if the string (after removing ``#``) is empty, its length
            is not a multiple of three, or it contains non-hex characters
            (named colours such as ``'blue'`` are not supported).
    """
    digits = value.lstrip('#')
    if len(digits) == 3:
        # Shorthand notation: 'abc' stands for 'aabbcc'.
        digits = ''.join(ch * 2 for ch in digits)
    width, remainder = divmod(len(digits), 3)
    if width == 0 or remainder:
        raise ValueError(f"expected a hex colour such as '#1a2b3c', got {value!r}")
    return tuple(int(digits[i:i + width], 16) for i in range(0, len(digits), width))


In [None]:
# Display the picker whose value becomes the end (last) colour of the colormap.
# #071a5d is presumably the hex value to select here — TODO confirm.
w_high

In [None]:
# Display the picker whose value becomes the start (first) colour of the colormap.
# #cedfeb is presumably the hex value to select here — TODO confirm.
w_low

In [None]:
# Size of the second entry of my_shap_values, the one drawn in the summary plot
# (presumably one row per user — TODO confirm).
len(my_shap_values[1])

In [None]:
# Draw the SHAP summary plot for the second entry of my_shap_values without
# showing it yet, so that its colours can be replaced first.
plt.figure()
shap.summary_plot(
    my_shap_values[1],
    X.values,
    feature_names=X.columns,
    show=False,
)

# Colormap running from the "low" picker colour to the "high" picker colour.
low_rgb = hex_to_rgb(w_low.value)
high_rgb = hex_to_rgb(w_high.value)
newcmp = get_new_cmp(low_rgb, high_rgb)

# Walk two levels down from the current figure and recolour every artist
# that exposes set_cmap.
for child in plt.gcf().get_children():
    for artist in child.get_children():
        if not hasattr(artist, "set_cmap"):
            continue
        artist.set_cmap(newcmp)

# Optional export of the recoloured figure:
#plt.savefig("shap_summary.pdf",dpi=300)
#plt.savefig("shap_summary.png",dpi=300)

plt.show()
