In [41]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Model Evaluation with Three Cosine Similarity Metrics

We evaluate the model's predictions under three variants of cosine similarity (Unadjusted, Centered, and Normalized), each with and without the Global Baseline Estimate (GBE). In the code below, the Centered variant is selected with `Cosine="Adjusted"`. Please see our Report for more information.

## Load the Utility and Similarity Matrices

In [42]:
# Per-layer frequency tables. The "Gender_Code" column is dropped right after
# loading, so only "Member_Life_ID" and the remaining columns stay in each frame.
df_layer_1, df_layer_2, df_layer_3 = (
    pd.read_parquet(path, engine="pyarrow").drop(columns="Gender_Code")
    for path in (
        "../data/processed/100-layer-frequencies_layer_1.parquet",
        "../data/processed/100-layer-frequencies_layer_2.parquet",
        "../data/processed/100-layer-frequencies_layer_3.parquet",
    )
)


In [43]:
# Neighbour tables used for the unadjusted-cosine runs: one
# (distances, indices) pair per layer, read in the same order as before.
df_item_distances_1, df_item_indices_1 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/item_distances_layer_1.parquet",
        "../data/similarity_matrices/item_indices_layer_1.parquet",
    )
)

df_item_distances_2, df_item_indices_2 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/item_distances_layer_2.parquet",
        "../data/similarity_matrices/item_indices_layer_2.parquet",
    )
)

df_item_distances_3, df_item_indices_3 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/item_distances_layer_3.parquet",
        "../data/similarity_matrices/item_indices_layer_3.parquet",
    )
)


In [37]:
# Neighbour tables used for the Cosine="Adjusted" (centered cosine) runs: one
# (distances, indices) pair per layer, read in the same order as before.
df_adjusted_item_distances_1, df_adjusted_item_indices_1 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/adjusted_item_distances_layer_1.parquet",
        "../data/similarity_matrices/adjusted_item_indices_layer_1.parquet",
    )
)
df_adjusted_item_distances_2, df_adjusted_item_indices_2 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/adjusted_item_distances_layer_2.parquet",
        "../data/similarity_matrices/adjusted_item_indices_layer_2.parquet",
    )
)
df_adjusted_item_distances_3, df_adjusted_item_indices_3 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/adjusted_item_distances_layer_3.parquet",
        "../data/similarity_matrices/adjusted_item_indices_layer_3.parquet",
    )
)


In [44]:
# Neighbour tables used for the Cosine="Normalized" runs: one
# (distances, indices) pair per layer, read in the same order as before.
df_normalized_item_distances_1, df_normalized_item_indices_1 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/normalized_item_distances_layer_1.parquet",
        "../data/similarity_matrices/normalized_item_indices_layer_1.parquet",
    )
)
df_normalized_item_distances_2, df_normalized_item_indices_2 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/normalized_item_distances_layer_2.parquet",
        "../data/similarity_matrices/normalized_item_indices_layer_2.parquet",
    )
)
df_normalized_item_distances_3, df_normalized_item_indices_3 = (
    pd.read_parquet(path, engine="pyarrow")
    for path in (
        "../data/similarity_matrices/normalized_item_distances_layer_3.parquet",
        "../data/similarity_matrices/normalized_item_indices_layer_3.parquet",
    )
)


## Set up the Evaluator

In [46]:
def evaluate(
    df,
    Cosine="Normal",
    indices=df_item_indices_1,
    distances=df_item_distances_1,
    n_test_patients=30_000,
    random_state=404,
    holdout_fraction=0.3,
):
    """Score item-based frequency predictions on a random sample of patients.

    A random sample of patients is drawn from ``df``. For every sampled
    patient, a leading slice of their positive-frequency ICD classes is
    selected, and each selected frequency is predicted as the
    similarity-weighted average of that patient's values on the class's
    neighbours: once corrected with the Global Baseline Estimate (GBE) and
    once without it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Member_Life_ID`` column; every other column is
        treated as one ICD class.
    Cosine : str, default "Normal"
        ``"Adjusted"`` subtracts each patient's mean from that patient's
        values; ``"Normalized"`` min-max scales each patient's values. Any
        other value (including the default) leaves the values untouched.
    indices, distances : pandas.DataFrame
        Both are looked up by ICD class label. A row of ``indices`` holds
        integer positions into the ICD-class axis of ``df``; the same row of
        ``distances`` holds the position-aligned distances, which are turned
        into similarities as ``1 - distance``. The defaults are bound once,
        when this definition is executed, so re-loading the layer-1 frames
        afterwards does not change them.
    n_test_patients : int, default 30_000
        Number of patients to sample. ``DataFrame.sample`` raises if ``df``
        has fewer patients than this.
    random_state : int, default 404
        Seed passed to ``DataFrame.sample``.
    holdout_fraction : float, default 0.3
        For a patient with more than one positive class, only the first
        ``round(count * holdout_fraction)`` positive classes are scored.

    Returns
    -------
    tuple of (list, list, list)
        ``(y_true, y_pred, y_pred_raw)``, aligned element by element:
        the observed value, the prediction with GBE, and the prediction
        without GBE. Both predictions are clipped at zero from below.

    Notes
    -----
    The baseline terms are computed from the sampled values *before* the
    ``Cosine`` transformation, while ``y_true`` and the neighbour values are
    read *after* it.
    NOTE(review): confirm this mix of scales is intended for the
    "Adjusted"/"Normalized" variants.
    """
    # Transpose so that patients are columns and ICD classes are rows.
    orig_df = df.set_index("Member_Life_ID").transpose()
    orig_df.columns.name = ""
    icd_labels = orig_df.index

    # Random sample of patients to evaluate on. The original comment describes
    # this as the 30% side of a 70/30 split.
    # NOTE(review): that only holds if df has 100,000 patients - confirm.
    test_df = orig_df.sample(
        n=n_test_patients, axis="columns", random_state=random_state
    )

    # Baseline estimate components.
    global_mean = np.nanmean(test_df)
    icd_mean = test_df.mean(axis=1)
    patient_mean = test_df.mean(axis=0)

    deviation_of_patient = patient_mean - global_mean
    deviation_of_icd = icd_mean - global_mean

    # Optional per-patient transformation for the cosine variant under test.
    if Cosine == "Adjusted":
        test_df = test_df - patient_mean
    elif Cosine == "Normalized":
        test_df = (test_df - test_df.min()) / (test_df.max() - test_df.min())

    y_true = []
    y_pred = []
    y_pred_raw = []
    for target_ID in test_df:

        # Keep only the ICD classes with a positive value for this patient.
        test_vector = test_df[target_ID]
        test_vector = test_vector[test_vector > 0]

        # A single positive class is scored as-is; with more than one, only
        # the leading fraction is scored. An empty vector skips the patient.
        if len(test_vector) > 1:
            test_vector = test_vector.iloc[
                : round(len(test_vector) * holdout_fraction)
            ]

        # Baseline estimate of every ICD class for this patient. It depends
        # only on the patient, so it is computed once per patient.
        baseline_estimate = (
            global_mean + deviation_of_patient[target_ID] + deviation_of_icd
        )

        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for ICD_Class, real_freq in test_vector.items():
            # Translate stored positions into ICD class labels.
            similar_ICD_indices = [
                icd_labels[position] for position in indices.loc[ICD_Class].tolist()
            ]
            similar_ICD_distances = distances.loc[ICD_Class].tolist()

            # Remove the ICD class from its own neighbour list, together with
            # the distance stored at the same position.
            if ICD_Class in similar_ICD_indices:
                own_position = similar_ICD_indices.index(ICD_Class)
                similar_ICD_indices.pop(own_position)
                similar_ICD_distances.pop(own_position)

            similar_ICD_similarity = [1 - x for x in similar_ICD_distances]

            numerator = 0
            numerator_raw = 0
            denominator = 0
            for neighbour, similarity in zip(
                similar_ICD_indices, similar_ICD_similarity
            ):
                neighbour_freq = test_df.loc[neighbour, target_ID]

                # Neighbour value minus its baseline estimate (GBE variant).
                adjusted_r = neighbour_freq - baseline_estimate[neighbour]
                numerator = numerator + similarity * adjusted_r
                numerator_raw = numerator_raw + similarity * neighbour_freq
                denominator = denominator + similarity

            if denominator > 0:
                predicted_freq = numerator / denominator + baseline_estimate[ICD_Class]
                predicted_freq_raw = numerator_raw / denominator
            else:
                # No usable similarity mass: fall back to the baseline (GBE)
                # and to 0 (raw) instead of dividing by zero.
                predicted_freq = baseline_estimate[ICD_Class]
                predicted_freq_raw = 0

            # Predictions are clipped at zero from below.
            if predicted_freq < 0:
                predicted_freq = 0
            if predicted_freq_raw < 0:
                predicted_freq_raw = 0

            y_true.append(real_freq)
            y_pred.append(predicted_freq)
            y_pred_raw.append(predicted_freq_raw)

    return (y_true, y_pred, y_pred_raw)


## Evaluation

### Layer 1

In [47]:
# Layer 1, unadjusted cosine. The neighbour tables are passed explicitly, like
# in every other evaluate() call in this notebook, instead of relying on the
# default arguments that were captured when evaluate() was defined.
y_true, y_pred, y_pred_raw = evaluate(
    df_layer_1,
    indices=df_item_indices_1,
    distances=df_item_distances_1,
)

In [49]:
print("------------------ Unadjusted Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw)))
print("MAE:", mean_absolute_error(y_true, y_pred), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw))


------------------ Unadjusted Cosine ------------------
RMSE: 1.5989534623430735 (GBE)
RMSE: 1.6054705363037605
MAE: 1.2065819980562873 (GBE)
MAE: 1.1553044186300971


In [None]:
# Layer 1, centered cosine: score against the "adjusted" neighbour tables.
(y_true, y_pred_adjusted, y_pred_raw_adjusted) = evaluate(
    df=df_layer_1,
    distances=df_adjusted_item_distances_1,
    indices=df_adjusted_item_indices_1,
    Cosine="Adjusted",
)


In [None]:
print("------------------ Centered Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_adjusted)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw_adjusted)))
print("MAE:", mean_absolute_error(y_true, y_pred_adjusted), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw_adjusted))


In [None]:
# Layer 1, normalized cosine: score against the "normalized" neighbour tables.
(y_true, y_pred_normalized, y_pred_raw_normalized) = evaluate(
    df=df_layer_1,
    distances=df_normalized_item_distances_1,
    indices=df_normalized_item_indices_1,
    Cosine="Normalized",
)


In [None]:
print("------------------ Normalized Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_normalized)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw_normalized)))
print("MAE:", mean_absolute_error(y_true, y_pred_normalized), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw_normalized))


### Layer 2

In [None]:
# Layer 2, unadjusted cosine: score against the plain neighbour tables.
(y_true, y_pred, y_pred_raw) = evaluate(
    df=df_layer_2,
    distances=df_item_distances_2,
    indices=df_item_indices_2,
)


In [None]:
print("------------------ Unadjusted Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw)))
print("MAE:", mean_absolute_error(y_true, y_pred), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw))

In [None]:
# Layer 2, centered cosine: score against the "adjusted" neighbour tables.
(y_true, y_pred_adjusted, y_pred_raw_adjusted) = evaluate(
    df=df_layer_2,
    distances=df_adjusted_item_distances_2,
    indices=df_adjusted_item_indices_2,
    Cosine="Adjusted",
)


In [None]:
print("------------------ Centered Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_adjusted)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw_adjusted)))
print("MAE:", mean_absolute_error(y_true, y_pred_adjusted), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw_adjusted))

In [None]:
# Layer 2, normalized cosine: score against the "normalized" neighbour tables.
(y_true, y_pred_normalized, y_pred_raw_normalized) = evaluate(
    df=df_layer_2,
    distances=df_normalized_item_distances_2,
    indices=df_normalized_item_indices_2,
    Cosine="Normalized",
)


In [None]:
print("------------------ Normalized Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_normalized)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw_normalized)))
print("MAE:", mean_absolute_error(y_true, y_pred_normalized), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw_normalized))

### Layer 3

In [None]:
# Layer 3, unadjusted cosine: score against the plain neighbour tables.
(y_true, y_pred, y_pred_raw) = evaluate(
    df=df_layer_3,
    distances=df_item_distances_3,
    indices=df_item_indices_3,
)


In [None]:
print("------------------ Unadjusted Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw)))
print("MAE:", mean_absolute_error(y_true, y_pred), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw))

In [None]:
# Layer 3, centered cosine: score against the "adjusted" neighbour tables.
(y_true, y_pred_adjusted, y_pred_raw_adjusted) = evaluate(
    df=df_layer_3,
    distances=df_adjusted_item_distances_3,
    indices=df_adjusted_item_indices_3,
    Cosine="Adjusted",
)


In [None]:
print("------------------ Centered Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_adjusted)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw_adjusted)))
print("MAE:", mean_absolute_error(y_true, y_pred_adjusted), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw_adjusted))

In [None]:
# Layer 3, normalized cosine: score against the "normalized" neighbour tables.
(y_true, y_pred_normalized, y_pred_raw_normalized) = evaluate(
    df=df_layer_3,
    distances=df_normalized_item_distances_3,
    indices=df_normalized_item_indices_3,
    Cosine="Normalized",
)


In [None]:
print("------------------ Normalized Cosine ------------------")

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
# Rows tagged "(GBE)" use the predictions that include the Global Baseline
# Estimate; untagged rows use the raw similarity-weighted predictions.
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_normalized)), "(GBE)")
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred_raw_normalized)))
print("MAE:", mean_absolute_error(y_true, y_pred_normalized), "(GBE)")
print("MAE:", mean_absolute_error(y_true, y_pred_raw_normalized))