In [2]:
import numpy as np
import pandas as pd


In [5]:
X = pd.read_parquet("/home/cara/Documents/reddit_analyses/thread-size/Test_outputs/conspiracy_train_X.parquet")
y = pd.read_parquet("/home/cara/Documents/reddit_analyses/thread-size/Test_outputs/conspiracy_train_Y.parquet")['log_thread_size']

In [6]:
CLASS_BIN_EDGES = {
    3: [0.5],
    4: [1 / 3, 2 / 3],
}

In [8]:
classes = 4

In [10]:
bin_edges = [np.log(1) - 1e-3]
bin_edges.append(np.log(2) - 1e-3)
bin_edges.extend(
    [y[y > np.log(1)].quantile(x) for x in CLASS_BIN_EDGES[classes]]
)
bin_edges.append(y.max() + 1e-3)

In [12]:
np.exp(bin_edges)

array([  0.9990005 ,   1.998001  ,   7.        ,  22.        ,
       927.92746365])

In [78]:
y_bins = pd.cut(y, bins=bin_edges, labels=False, include_lowest=True)

In [82]:
thread_classes = {}

for i in range(1, 27):
    thread_classes[i] = y_bins[y == np.log(i)].unique()[0]

df = pd.DataFrame.from_dict(thread_classes, orient='index', columns=["class_name"])
df.index.name = "thread_size"
display(df)

Unnamed: 0_level_0,class_name
thread_size,Unnamed: 1_level_1
1,0
2,1
3,1
4,1
5,1
6,1
7,1
8,2
9,2
10,2


In [80]:
df.class_name.unique()

array([0, 1, 2, 3])

In [83]:
for i in df.class_name.unique():
    thread_size_vals = df[df.class_name == i].index
    print(f"Class {i}: [{thread_size_vals.min()}, {thread_size_vals.max()}]")

Class 0: [1, 1]
Class 1: [2, 7]
Class 2: [8, 22]
Class 3: [23, 26]


In [108]:
y_bin_counts = y_bins.value_counts().sort_index()
y_bin_counts.index.name = "Class"
thread_size_bins = [round(np.exp(x)) for x in bin_edges]
i = 0
bin_ranges = [[thread_size_bins[i], thread_size_bins[i]]]
for i in range(1, len(thread_size_bins)-1):
    to_append = [bin_ranges[i-1][1]+1, thread_size_bins[i+1]]
    bin_ranges.append(to_append)
bin_count_df = pd.DataFrame(
    {"Range": bin_ranges, "Threads": y_bin_counts}
)

In [109]:
bin_count_df

Unnamed: 0_level_0,Range,Threads
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[1, 1]",115
1,"[2, 7]",241
2,"[8, 22]",218
3,"[23, 928]",226


In [3]:
out_dir = "/home/cara/Documents/reddit_analyses/thread-size/Test_outputs"
for subreddit in ['politics', 'crypto']:
    root_dir = f"/home/cara/Documents/reddit_analyses/thread-size/Outputs/Preprocessing/{subreddit}"
    for filename in [f"{subreddit}_test_X.parquet", f"{subreddit}_test_Y.parquet", f"{subreddit}_train_X.parquet", f"{subreddit}_train_Y.parquet"]:
        df = pd.read_parquet(f"{root_dir}/{filename}").head(500)
        df.to_parquet(f"{out_dir}/{filename}")


In [1]:
import pandas as pd
import numpy as np
import joblib

In [20]:
cm_dict = joblib.load("/home/cara/Documents/reddit_analyses/thread-size/Test_outputs/2_Thread_Size/conspiracy/4_model/model_1/model_data/test_confusion_matrix_data.jl")

In [21]:
def get_correct_classes(cm):
    class_correct = []
    for i in range(0,len(cm)):
        class_correct.append(cm[i,i]/sum(cm[i,:]))
    return class_correct

def get_true_class_counts(cm):
    class_counts = []
    for i in range(0, len(cm)):
        class_counts.append(sum(cm[i,:]))
    return class_counts

def get_predicted_class_counts(cm):
    class_counts = []
    for i in range(0, len(cm)):
        class_counts.append(sum(cm[:,i]))
    return class_counts

def get_predicted_class_ratios_df(cm_dict):
    pred_class_ratios = {}
    for k, cm in cm_dict.items():
        pred_class_ratios[k] = get_predicted_class_counts(cm)/cm_dict['CM'].sum()
    true_class_ratios = get_true_class_counts(cm_dict['CM'])/cm_dict['CM'].sum()
    return pred_class_ratios, true_class_ratios
    

In [22]:
pred_class_ratios, true_class_ratios = get_predicted_class_ratios_df(cm_dict)

In [23]:
pred_class_ratios

{'CM': array([0.   , 0.38 , 0.185, 0.435]),
 'lower': array([0.      , 0.241125, 0.091875, 0.27375 ]),
 'upper': array([0.     , 0.51575, 0.2805 , 0.56625]),
 'mean': array([0.     , 0.38675, 0.18375, 0.4295 ]),
 'std': array([0.        , 0.08037691, 0.05740983, 0.08647887])}

In [24]:
def get_predicted_class_ratios_df(cm_dict):
    pred_class_ratios = {}
    for k, cm in cm_dict.items():
        pred_class_ratios[k] = get_predicted_class_counts(cm)/cm_dict['CM'].sum()
    true_class_ratios = get_true_class_counts(cm_dict['CM'])/cm_dict['CM'].sum()
    df = pd.concat([pd.DataFrame(pred_class_ratios),pd.DataFrame(true_class_ratios)], axis=1).rename(columns={
        "CM": "predicted",
        0: "true"
    })
    return df

In [25]:
get_predicted_class_ratios_df(cm_dict)

Unnamed: 0,predicted,lower,upper,mean,std,true
0,0.0,0.0,0.0,0.0,0.0,0.14
1,0.38,0.241125,0.51575,0.38675,0.080377,0.34
2,0.185,0.091875,0.2805,0.18375,0.05741,0.26
3,0.435,0.27375,0.56625,0.4295,0.086479,0.26


In [38]:
def misclassified(i, j, cm_dict):
    """Class i misclassified as class j in given confusion matrix cm.

    Args:
        i int: integer associated with class
        j int: integer associated with class
        cm pd.DataFrame: confusion matrix
    """
    true_class_counts = get_true_class_counts(cm_dict["CM"])[i]
    misclassified_dict = {
        f"misclassified": np.sum(cm_dict['CM'][i, j])/true_class_counts,
    }
    for m in ["lower", "upper"]:
        misclassified_dict[f"misclassified_{m}"] = np.sum(cm_dict[m][i, j])/true_class_counts
    return misclassified_dict

In [58]:
df = (pd.DataFrame.from_dict(misclassified(0,1, cm_dict),orient='index').T)
df['Origin'] = 0
df['Destination'] = 1
df

Unnamed: 0,misclassified,misclassified_lower,misclassified_upper,Origin,Destination
0,0.357143,0.195536,0.501786,0,1


In [45]:
len(cm_dict['CM'])

4

In [59]:
rows = []
for i in range(0,len(cm_dict['CM'])):
    for j in [x for x in range(0, len(cm_dict['CM'])) if x != i]:
        df = pd.DataFrame.from_dict(misclassified(i,j, cm_dict),orient='index').T
        df['Origin'] = i
        df['Destination'] = j
        rows.append(df)

In [64]:
pd.concat(rows).reset_index(drop=True)[['Origin', 'Destination', 'misclassified', 'misclassified_lower', 'misclassified_upper']]

Unnamed: 0,Origin,Destination,misclassified,misclassified_lower,misclassified_upper
0,0,1,0.357143,0.195536,0.501786
1,0,2,0.107143,0.0,0.178571
2,0,3,0.535714,0.302679,0.733036
3,1,0,0.0,0.0,0.0
4,1,2,0.147059,0.051103,0.184191
5,1,3,0.382353,0.248529,0.509926
6,2,0,0.0,0.0,0.0
7,2,1,0.365385,0.228846,0.480769
8,2,3,0.442308,0.325962,0.557692
9,3,0,0.0,0.0,0.0


In [65]:
import pandas as pd
import numpy
import joblib
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [66]:
shap_exp_dict = joblib.load("/home/cara/Documents/reddit_analyses/thread-size/Test_outputs/1_Thread_Start/conspiracy/4_model/model_5/shap_explainer.jl")

In [67]:
X = shap_exp_dict["X_test"]
shap_exp = shap_exp_dict["explainer"]
shap_vals = shap_exp(X)

In [72]:
pd.DataFrame(shap_vals.values, columns=X.columns)

Unnamed: 0,question_ratio,encoded_author,avg_word_length,subject_length,hour
0,0.530410,-0.130665,-0.438208,0.076502,-0.127650
1,-0.239536,-0.025147,-0.396594,-0.511986,0.649893
2,0.002010,0.454964,-0.121515,0.853430,0.272883
3,0.048221,-0.093034,0.418219,-1.468706,0.135414
4,0.703397,-0.029730,0.963111,-0.409357,0.299801
...,...,...,...,...,...
195,-0.352225,0.123636,-0.573684,0.674257,-0.148908
196,1.234176,0.310022,0.136743,0.816985,0.040226
197,0.136550,0.454469,0.305125,0.875259,-0.067563
198,1.234176,0.310022,0.136743,0.816985,0.040226
