In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

from util import (
    SIMILARITY_TABLE_DATA_DIR,
)


# Generate Similarity Matrices

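Generate the user-user and item-item similarity matrices for each layer that we need for our analysis. For the item-item matrices, three variants are generated: unadjusted, adjusted, and normalized cosine similarity. Note that the cached values are cosine *distances* (1 − cosine similarity) to the nearest neighbours, so a smaller value means a more similar neighbour.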

The output is written to `../data/similarity_matrices`. For a given dataset, this notebook only needs to be run once.

In [3]:
# Read the per-layer frequency tables (layers 1-3) with the pyarrow engine and
# discard the "Gender_Code" column, which is not used in this notebook.
layer_1_raw, layer_2_raw, layer_3_raw = (
    pd.read_parquet(parquet_path, engine="pyarrow").drop(columns="Gender_Code")
    for parquet_path in (
        "../data/processed/100-layer-frequencies_layer_1.parquet",
        "../data/processed/100-layer-frequencies_layer_2.parquet",
        "../data/processed/100-layer-frequencies_layer_3.parquet",
    )
)


In [4]:
# The goal is a top-10 ranking per user/item. Every user/item also shows up in
# its own neighbour list (nothing is more similar to it than itself), so one
# extra slot is requested: 10 real neighbours + 1 self-match.
num_neighbor = 10 + 1

## Prepare Similarity Matrices

### Item-item

In [6]:
def flip_df(df_raw):
    """Return the item-oriented (transposed) view of a raw layer table.

    The 'Member_Life_ID' column becomes the index and the frame is then
    transposed, so the remaining columns of ``df_raw`` (the ICD classes in
    this notebook) become the rows and each member ID becomes a column.
    The name of the resulting columns axis is set to the empty string.

    ``df_raw`` itself is not modified.
    """
    by_member = df_raw.set_index('Member_Life_ID')
    flipped = by_member.T
    # After the transpose the columns axis would be named 'Member_Life_ID';
    # replace that with an empty name.
    flipped.columns.name = ''
    return flipped

#### Unadjusted Cosine Similarity

In [7]:
# Item-oriented view of every layer: rows are the former columns of the raw
# table, columns are member IDs. These untouched values feed the unadjusted
# cosine calculation.
df_item_1, df_item_2, df_item_3 = map(
    flip_df, (layer_1_raw, layer_2_raw, layer_3_raw)
)

#### Adjusted Cosine Similarity

In [8]:
# Mean of every column of the item frames, i.e. one mean per member (patient)
# taken over all item rows.
patient_mean_1, patient_mean_2, patient_mean_3 = (
    item_frame.mean(axis="index")
    for item_frame in (df_item_1, df_item_2, df_item_3)
)

# Centre each member's column on that member's own mean before the cosine
# computation.
df_adjusted_item_1 = df_item_1.sub(patient_mean_1, axis="columns")
df_adjusted_item_2 = df_item_2.sub(patient_mean_2, axis="columns")
df_adjusted_item_3 = df_item_3.sub(patient_mean_3, axis="columns")

#### Normalized Cosine Similarity

In [9]:
# Min-max scale every column (one column per member) of the item frames:
# (value - column min) / (column max - column min).
# NOTE(review): a column whose values are all identical has a zero range and
# would come out as NaN here -- confirm the data never contains such a column.
df_normalized_item_1, df_normalized_item_2, df_normalized_item_3 = (
    item_frame.sub(item_frame.min()).div(item_frame.max() - item_frame.min())
    for item_frame in (df_item_1, df_item_2, df_item_3)
)

### User-user

In [None]:
# User-oriented view of every layer: one row per member, indexed by
# 'Member_Life_ID', with the remaining raw columns as features.
df_user_1, df_user_2, df_user_3 = (
    layer_raw.set_index('Member_Life_ID')
    for layer_raw in (layer_1_raw, layer_2_raw, layer_3_raw)
)

## Calculate Item-Item and User-User Distances

In [10]:
def calculate(df, num_neighbor):
    """Find the nearest rows of ``df`` to each of its own rows, by cosine metric.

    A brute-force scikit-learn ``NearestNeighbors`` model is fitted on the
    values of ``df`` and then queried with those same values.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per entity to compare; only ``df.values`` is used.
    num_neighbor : int
        Number of neighbours to return for every row.

    Returns
    -------
    tuple
        ``(distances, indices)`` as returned by ``kneighbors``: two arrays
        with one row per row of ``df`` and ``num_neighbor`` columns. The
        distances are cosine distances (smaller means more similar) and the
        indices are row positions within ``df``.
    """
    feature_matrix = df.values
    searcher = NearestNeighbors(metric="cosine", algorithm="brute").fit(
        feature_matrix
    )
    # The query set is the fitted set itself, so each row is also a candidate
    # neighbour of itself.
    return searcher.kneighbors(feature_matrix, n_neighbors=num_neighbor)


### Item-Item Similarity - Unadjusted Cosine

In [14]:
# Layer 1, unadjusted cosine: nearest item rows for every item in df_item_1.
item_distance_1, item_indices_1 = calculate(df_item_1, num_neighbor)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_item_1 index) and one column per neighbour rank, labelled "0", "1", ...
df_item_distance_1, df_item_indices_1 = (
    pd.DataFrame(
        neighbor_array,
        index=df_item_1.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (item_distance_1, item_indices_1)
)


In [15]:
# Layer 2, unadjusted cosine: nearest item rows for every item in df_item_2.
item_distance_2, item_indices_2 = calculate(df_item_2, num_neighbor)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_item_2 index) and one column per neighbour rank, labelled "0", "1", ...
df_item_distance_2, df_item_indices_2 = (
    pd.DataFrame(
        neighbor_array,
        index=df_item_2.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (item_distance_2, item_indices_2)
)


In [16]:
# Layer 3, unadjusted cosine: nearest item rows for every item in df_item_3.
item_distance_3, item_indices_3 = calculate(df_item_3, num_neighbor)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_item_3 index) and one column per neighbour rank, labelled "0", "1", ...
df_item_distance_3, df_item_indices_3 = (
    pd.DataFrame(
        neighbor_array,
        index=df_item_3.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (item_distance_3, item_indices_3)
)


### Item-Item Similarity - Adjusted Cosine

In [17]:
# Layer 1, adjusted cosine: nearest item rows computed on the mean-centred
# frame df_adjusted_item_1.
adjusted_item_distance_1, adjusted_item_indices_1 = calculate(
    df_adjusted_item_1, num_neighbor
)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_adjusted_item_1 index) and one column per neighbour rank ("0", "1", ...).
df_adjusted_item_distance_1, df_adjusted_item_indices_1 = (
    pd.DataFrame(
        neighbor_array,
        index=df_adjusted_item_1.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (adjusted_item_distance_1, adjusted_item_indices_1)
)


In [18]:
# Layer 2, adjusted cosine: nearest item rows computed on the mean-centred
# frame df_adjusted_item_2.
adjusted_item_distance_2, adjusted_item_indices_2 = calculate(
    df_adjusted_item_2, num_neighbor
)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_adjusted_item_2 index) and one column per neighbour rank ("0", "1", ...).
# Both frames must be built from the layer-2 results computed just above; the
# indices table in particular must use adjusted_item_indices_2 (not the
# layer-1 array), otherwise the layer-2 indices would not match the layer-2
# distances.
df_adjusted_item_distance_2, df_adjusted_item_indices_2 = (
    pd.DataFrame(
        neighbor_array,
        index=df_adjusted_item_2.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (adjusted_item_distance_2, adjusted_item_indices_2)
)


In [19]:
# Layer 3, adjusted cosine: nearest item rows computed on the mean-centred
# frame df_adjusted_item_3.
adjusted_item_distance_3, adjusted_item_indices_3 = calculate(
    df_adjusted_item_3, num_neighbor
)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_adjusted_item_3 index) and one column per neighbour rank ("0", "1", ...).
df_adjusted_item_distance_3, df_adjusted_item_indices_3 = (
    pd.DataFrame(
        neighbor_array,
        index=df_adjusted_item_3.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (adjusted_item_distance_3, adjusted_item_indices_3)
)


### Item-Item Similarity - Normalized Cosine

In [38]:
# Layer 1, normalized cosine: nearest item rows computed on the min-max scaled
# frame df_normalized_item_1.
normalized_item_distance_1, normalized_item_indices_1 = calculate(
    df_normalized_item_1, num_neighbor
)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_normalized_item_1 index) and one column per neighbour rank ("0", "1", ...).
df_normalized_item_distance_1, df_normalized_item_indices_1 = (
    pd.DataFrame(
        neighbor_array,
        index=df_normalized_item_1.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (normalized_item_distance_1, normalized_item_indices_1)
)


In [39]:
# Layer 2, normalized cosine: nearest item rows computed on the min-max scaled
# frame df_normalized_item_2.
normalized_item_distance_2, normalized_item_indices_2 = calculate(
    df_normalized_item_2, num_neighbor
)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_normalized_item_2 index) and one column per neighbour rank ("0", "1", ...).
df_normalized_item_distance_2, df_normalized_item_indices_2 = (
    pd.DataFrame(
        neighbor_array,
        index=df_normalized_item_2.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (normalized_item_distance_2, normalized_item_indices_2)
)


In [40]:
# Layer 3, normalized cosine: nearest item rows computed on the min-max scaled
# frame df_normalized_item_3.
normalized_item_distance_3, normalized_item_indices_3 = calculate(
    df_normalized_item_3, num_neighbor
)

# Wrap both result arrays the same way: one row per item (labelled with the
# df_normalized_item_3 index) and one column per neighbour rank ("0", "1", ...).
df_normalized_item_distance_3, df_normalized_item_indices_3 = (
    pd.DataFrame(
        neighbor_array,
        index=df_normalized_item_3.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (normalized_item_distance_3, normalized_item_indices_3)
)


### User-User Similarity

In [None]:
# Layer 1, user-user: nearest member rows for every member in df_user_1.
user_distance_1, user_indices_1 = calculate(df_user_1, num_neighbor)

# Wrap both result arrays the same way: one row per member (labelled with the
# df_user_1 index) and one column per neighbour rank, labelled "0", "1", ...
df_user_distance_1, df_user_indices_1 = (
    pd.DataFrame(
        neighbor_array,
        index=df_user_1.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (user_distance_1, user_indices_1)
)

In [None]:
# Layer 2, user-user: nearest member rows for every member in df_user_2.
user_distance_2, user_indices_2 = calculate(df_user_2, num_neighbor)

# Wrap both result arrays the same way: one row per member (labelled with the
# df_user_2 index) and one column per neighbour rank, labelled "0", "1", ...
df_user_distance_2, df_user_indices_2 = (
    pd.DataFrame(
        neighbor_array,
        index=df_user_2.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (user_distance_2, user_indices_2)
)

In [None]:
# Layer 3, user-user: nearest member rows for every member in df_user_3.
user_distance_3, user_indices_3 = calculate(df_user_3, num_neighbor)

# Wrap both result arrays the same way: one row per member (labelled with the
# df_user_3 index) and one column per neighbour rank, labelled "0", "1", ...
df_user_distance_3, df_user_indices_3 = (
    pd.DataFrame(
        neighbor_array,
        index=df_user_3.index.tolist(),
        columns=list(map(str, range(num_neighbor))),
    )
    for neighbor_array in (user_distance_3, user_indices_3)
)

## Cache the Matrices!

### Item-Item - Unadjusted Cosine

In [None]:
# Cache the layer-1 unadjusted item-item tables (distances, then indices) as
# parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_item_distance_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/item_distances_layer_1.parquet",
    ),
    (
        df_item_indices_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/item_indices_layer_1.parquet",
    ),
):
    _table.to_parquet(_target)

In [None]:
# Cache the layer-2 unadjusted item-item tables (distances, then indices) as
# parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_item_distance_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/item_distances_layer_2.parquet",
    ),
    (
        df_item_indices_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/item_indices_layer_2.parquet",
    ),
):
    _table.to_parquet(_target)

In [None]:
# Cache the layer-3 unadjusted item-item tables (distances, then indices) as
# parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_item_distance_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/item_distances_layer_3.parquet",
    ),
    (
        df_item_indices_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/item_indices_layer_3.parquet",
    ),
):
    _table.to_parquet(_target)

### Item-Item - Adjusted Cosine

In [20]:
# Cache the layer-1 adjusted-cosine item-item tables (distances, then indices)
# as parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_adjusted_item_distance_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/adjusted_item_distances_layer_1.parquet",
    ),
    (
        df_adjusted_item_indices_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/adjusted_item_indices_layer_1.parquet",
    ),
):
    _table.to_parquet(_target)

In [21]:
# Cache the layer-2 adjusted-cosine item-item tables (distances, then indices)
# as parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_adjusted_item_distance_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/adjusted_item_distances_layer_2.parquet",
    ),
    (
        df_adjusted_item_indices_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/adjusted_item_indices_layer_2.parquet",
    ),
):
    _table.to_parquet(_target)

In [22]:
# Cache the layer-3 adjusted-cosine item-item tables (distances, then indices)
# as parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_adjusted_item_distance_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/adjusted_item_distances_layer_3.parquet",
    ),
    (
        df_adjusted_item_indices_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/adjusted_item_indices_layer_3.parquet",
    ),
):
    _table.to_parquet(_target)

### Item-Item - Normalized Cosine

In [41]:
# Cache the layer-1 normalized-cosine item-item tables (distances, then
# indices) as parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_normalized_item_distance_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/normalized_item_distances_layer_1.parquet",
    ),
    (
        df_normalized_item_indices_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/normalized_item_indices_layer_1.parquet",
    ),
):
    _table.to_parquet(_target)

In [42]:
# Cache the layer-2 normalized-cosine item-item tables (distances, then
# indices) as parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_normalized_item_distance_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/normalized_item_distances_layer_2.parquet",
    ),
    (
        df_normalized_item_indices_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/normalized_item_indices_layer_2.parquet",
    ),
):
    _table.to_parquet(_target)

In [43]:
# Cache the layer-3 normalized-cosine item-item tables (distances, then
# indices) as parquet files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_normalized_item_distance_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/normalized_item_distances_layer_3.parquet",
    ),
    (
        df_normalized_item_indices_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/normalized_item_indices_layer_3.parquet",
    ),
):
    _table.to_parquet(_target)

### User-User Similarity

In [None]:
# Cache the layer-1 user-user tables (distances, then indices) as parquet
# files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_user_distance_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/user_distances_layer_1.parquet",
    ),
    (
        df_user_indices_1,
        f"{SIMILARITY_TABLE_DATA_DIR}/user_indices_layer_1.parquet",
    ),
):
    _table.to_parquet(_target)

In [None]:
# Cache the layer-2 user-user tables (distances, then indices) as parquet
# files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_user_distance_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/user_distances_layer_2.parquet",
    ),
    (
        df_user_indices_2,
        f"{SIMILARITY_TABLE_DATA_DIR}/user_indices_layer_2.parquet",
    ),
):
    _table.to_parquet(_target)

In [None]:
# Cache the layer-3 user-user tables (distances, then indices) as parquet
# files under SIMILARITY_TABLE_DATA_DIR.
for _table, _target in (
    (
        df_user_distance_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/user_distances_layer_3.parquet",
    ),
    (
        df_user_indices_3,
        f"{SIMILARITY_TABLE_DATA_DIR}/user_indices_layer_3.parquet",
    ),
):
    _table.to_parquet(_target)