# Frequent Items

## Initialization
- Load cleaned data and user data with clusters

In [13]:
import pandas as pd
import numpy as np
import os
from sklearn.neighbors import NearestNeighbors
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
from mlxtend.frequent_patterns import fpgrowth
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mlxtend.preprocessing import TransactionEncoder
from collections import defaultdict
from itertools import combinations
import networkx as nx
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

In [14]:
file_path_songs = "Data/df_combined.csv"
df_songs = pd.read_csv(file_path_songs)
file_path_clusters = "Data/df_users.csv"
df_users = pd.read_csv(file_path_clusters)

## Compute market basket data frames
- A market basket data frame is computed for each cluster
- Only users with more than 50 total listenings are included
- Each basket includes at most the users 100 most played songs
- Songs which does not occur in more than 10 different baskets are excluded

We generate 15 separate DataFrames stored as `df_basket_1`, `df_basket_2`, ..., `df_basket_15`.

In [None]:
# Calculate the total play count for each user
user_total_playcount = df_users.groupby('user_id')['playcount'].sum().reset_index()
user_total_playcount = user_total_playcount[user_total_playcount['playcount'] >= 50]

# Filter the original user data to include only users with at least 50 total plays
df_users_filtered = df_users[df_users['user_id'].isin(user_total_playcount['user_id'])]

# Process each cluster to generate individual DataFrames
cluster_dataframes = {}  # Dictionary to hold the dataframes for each cluster

for cluster_id in sorted(df_users_filtered['most_played_cluster'].unique()[:15]):  # Ensure max 15 clusters
    # Filter the data for the current cluster
    cluster_data = df_users_filtered[df_users_filtered['most_played_cluster'] == cluster_id]

    # Sort songs for each user by play count and select the 100 most played songs
    def get_top_songs(user_data):
        return user_data.nlargest(100, 'playcount')  # Adjust to desired number of top songs

    # Apply the top song selection for each user
    df_top_songs_cluster = cluster_data.groupby('user_id').apply(get_top_songs).reset_index(drop=True)

    # Create a basket for each user with their top songs
    df_basket_cluster = df_top_songs_cluster.groupby('user_id')['track_id'].apply(list).reset_index()
    df_basket_cluster.rename(columns={'track_id': 'basket'}, inplace=True)

    # Assign the DataFrame to a variable dynamically
    cluster_dataframes[f"df_basket_{cluster_id}"] = df_basket_cluster

    # Optionally, save to a named variable
    globals()[f"df_basket_{cluster_id}"] = df_basket_cluster

In [None]:
# Function to filter songs occurring in more than 10 baskets
def filter_songs(df_basket, min_basket_count=10):
    """
    Filters out songs from the baskets that occur in fewer than `min_basket_count` baskets.
    
    Parameters:
    - df_basket (pd.DataFrame): DataFrame with columns `user_id` and `basket`.
    - min_basket_count (int): Minimum number of baskets a song must appear in to be retained.

    Returns:
    - pd.DataFrame: Filtered DataFrame with updated baskets.
    """
    # Flatten all baskets into a single list to count song occurrences
    all_songs = [song for basket in df_basket['basket'] for song in basket]
    song_counts = Counter(all_songs)

    # Identify songs that occur in more than `min_basket_count` baskets
    frequent_songs = {song for song, count in song_counts.items() if count > min_basket_count}

    # Filter each user's basket to retain only frequent songs
    df_basket['basket'] = df_basket['basket'].apply(lambda basket: [song for song in basket if song in frequent_songs])

    # Remove rows with empty baskets
    df_basket = df_basket[df_basket['basket'].map(len) > 0].reset_index(drop=True)

    return df_basket

# Apply the function to each df_basket_# DataFrame
for cluster_id in cluster_dataframes.keys():
    cluster_dataframes[cluster_id] = filter_songs(cluster_dataframes[cluster_id])

    # Optionally, save back to dynamically created variables
    globals()[cluster_id] = cluster_dataframes[cluster_id]

## Compute association rules and support
- Support and association rules are computed for each basket data frame
- The minimum support used is 2%
- Association rules are computed using the Apriori algorithm to generate frequent single items, dublets, and triplets

In [None]:
# Function to calculate support for itemsets
def calculate_support(data, itemsets):
    itemset_counts = defaultdict(int)
    for basket in data['basket']:
        for itemset in itemsets:
            if set(itemset).issubset(set(basket)):
                itemset_counts[itemset] += 1
    total_baskets = len(data)
    support = {itemset: count / total_baskets for itemset, count in itemset_counts.items()}
    return support

# Apriori Algorithm for Size 2 and 3
def apriori_pairs_and_triplets(data, min_support):
    frequent_itemsets = {}

    # Start with single items
    items = set(item for basket in data['basket'] for item in basket)
    single_itemsets = [(item,) for item in items]
    
    # Calculate support for single items
    support = calculate_support(data, single_itemsets)
    frequent_singles = [itemset for itemset, sup in support.items() if sup >= min_support]
    frequent_itemsets.update({itemset: support[itemset] for itemset in frequent_singles})
    
    print(f"Frequent Single Items: {frequent_singles}")
    
    # Generate pairs (size 2)
    pairs = list(combinations(set(item for itemset in frequent_singles for item in itemset), 2))
    pair_support = calculate_support(data, pairs)
    frequent_pairs = [itemset for itemset, sup in pair_support.items() if sup >= min_support]
    frequent_itemsets.update({itemset: pair_support[itemset] for itemset in frequent_pairs})
    
    print(f"Frequent Pairs: {frequent_pairs}")
    
    # Generate triplets (size 3)
    triplets = list(combinations(set(item for itemset in frequent_pairs for item in itemset), 3))
    triplet_support = calculate_support(data, triplets)
    frequent_triplets = [itemset for itemset, sup in triplet_support.items() if sup >= min_support]
    frequent_itemsets.update({itemset: triplet_support[itemset] for itemset in frequent_triplets})
    
    print(f"Frequent Triplets: {frequent_triplets}")
    
    return frequent_itemsets

## Generate results (freq. items and support)
The frequent items and item sets with corresponding support are stored in a CSV file for each cluster.

In [None]:
df_baskets = {f"df_basket_{i}": globals()[f"df_basket_{i}"] for i in range(1, 15)}

min_support = 0.02

for i, (name, data) in enumerate(df_baskets.items(), start=1):
    print(f"Processing {name}...")
    frequent_itemsets = apriori_pairs_and_triplets(data, min_support)
    
    # Convert frequent itemsets to a DataFrame
    results_df = pd.DataFrame(
        [{"itemset": itemset, "support": support} for itemset, support in frequent_itemsets.items()]
    )

Processing df_basket_1...
Frequent Single Items: [('TRRXGAK128EF349F1A',), ('TRRKGRC128F932D8F0',), ('TRWZFIC128F933BCA3',), ('TRNEWWX128F9336A1F',), ('TRUFTBY128F93450B8',), ('TRCPXID128F92D5D3C',), ('TRLUKKL128F4284EEA',), ('TRIJLQJ128E078F6F1',), ('TROYOWO12903CACF51',), ('TRUNCXA12903CDAA07',), ('TRBEFXF128F4263CE9',), ('TRONYHY128F92C9D11',), ('TRAZIQK12903CCFB3A',), ('TRRGWHY128F93043DB',), ('TRUDNRB128F42598CA',), ('TRBNYBX128F422EC61',), ('TRUWANM128F1485EE2',), ('TRSUNIO128F92DD214',), ('TRFKZHE128F149F632',), ('TRVZILF128F42748EF',), ('TRRGKHJ128F92F64DA',), ('TRBHVEV128F425C018',), ('TRMEBVU128F92F64DB',), ('TRCQFJI128F4284EEE',), ('TRFTUIW128E0784B9F',), ('TRQJLCO128F42BCC0A',), ('TRYFDNR128F4260C5E',), ('TRPDNZQ128F92C1E98',), ('TRXHXZJ128F92DD518',), ('TREXRIW128EF3434B7',), ('TRETTHQ128F428294E',), ('TRERZDK128F42B3222',), ('TRIXKCB128F424EA32',), ('TRZNAHL128F9327D5A',), ('TRIYBSB128F14B0259',), ('TRWCIAX128F42925BD',), ('TRFNGJS128F92F9EEE',), ('TRFWGOJ128E0780C8B',), 

In [21]:
# Save to a CSV file
for i in range(15):
    csv_filename = f"frequent_itemsets_basket_{i}.csv"
    output_path = f"Data/Results/{csv_filename}"
    results_df.to_csv(output_path, index=False)

## Visualizing the results: Final songs recommendations
- Heatmap showing the support between pairs of songs (duplets)
- Network graph showing triplets

In [24]:
# Directories for input and output
input_dir = "Data/Results"
output_dir_heatmaps = "Data/Heatmaps"
output_dir_graphs = "Data/Graphs"
os.makedirs(output_dir_heatmaps, exist_ok=True)
os.makedirs(output_dir_graphs, exist_ok=True)

# Function to process a single file for heatmap and graph
def process_csv(file_path, id_to_name):
    # Load duplets and their support values
    df = pd.read_csv(file_path)
    duplets = {
        tuple(item.strip() for item in row["itemset"].strip("()").replace("'", "").split(", ")): row["support"]
        for _, row in df.iterrows()
    }

    # Convert duplets' track_ids to names, handling missing keys
    duplets_with_names = {
        tuple(id_to_name.get(item, f"Unknown({item})") for item in duplet): support
        for duplet, support in duplets.items()
    }

    # Filter out pairs including 'Revelry' and ensure only pairs are included
    filtered_dublets = {
        pair: support for pair, support in duplets_with_names.items()
        if 'Revelry' not in pair and len(pair) == 2
    }

    # Prepare data for heatmap
    songs = list(set([item for pair in filtered_dublets.keys() for item in pair]))
    heatmap_data = pd.DataFrame(0, index=songs, columns=songs)

    for (song1, song2), support in filtered_dublets.items():
        heatmap_data.loc[song1, song2] = support
        heatmap_data.loc[song2, song1] = support  # Symmetric pairs

    # Plot heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        heatmap_data,
        annot=False,
        cmap="Blues",
        cbar_kws={'label': 'Support'}
    )
    plt.title(f"Song Pair Support Heatmap - {os.path.basename(file_path)}", fontsize=16)
    plt.xticks(rotation=45, ha='right', fontsize=10)
    plt.yticks(fontsize=10)
    plt.tight_layout()

    # Save heatmap
    heatmap_path = os.path.join(output_dir_heatmaps, f"{os.path.basename(file_path).replace('.csv', '')}_heatmap.png")
    plt.savefig(heatmap_path)
    plt.close()
    print(f"Heatmap saved to {heatmap_path}")

    # Create the full graph for triplets
    triplets = {
        tuple(item.strip() for item in row["itemset"].strip("()").replace("'", "").split(", ")): row["support"]
        for _, row in df.iterrows()
    }
    triplets_with_names = [
        tuple(id_to_name.get(item, f"Unknown({item})") for item in triplet)
        for triplet in triplets.keys()
    ]

    G = nx.Graph()
    for triplet in triplets_with_names:
        for pair in combinations(triplet, 2):
            G.add_edge(pair[0], pair[1])

    # Identify connected components (clusters)
    groups = [G.subgraph(c).copy() for c in nx.connected_components(G)]

    # Plot and save each cluster graph
    for i, group in enumerate(groups):
        plt.figure(figsize=(10, 6))
        pos = nx.spring_layout(group, seed=42)
        nx.draw_networkx_nodes(group, pos, node_size=1000, node_color='skyblue', alpha=0.9)
        nx.draw_networkx_edges(group, pos, width=2, edge_color='gray', alpha=0.6)
        nx.draw_networkx_labels(group, pos, font_size=10, font_color='black')
        plt.title(f"Group {i + 1} - {os.path.basename(file_path)}", fontsize=14)
        plt.axis('off')

        # Save each cluster plot
        graph_path = os.path.join(output_dir_graphs, f"{os.path.basename(file_path).replace('.csv', '')}_group_{i + 1}.png")
        plt.savefig(graph_path)
        plt.close()
        print(f"Graph saved to {graph_path}")

# Define the mapping of track IDs to names, normalizing keys
id_to_name = {track_id.strip(): name for track_id, name in df_songs.set_index('track_id')['name'].to_dict().items()}

# Process each CSV file in the input directory
for file_name in os.listdir(input_dir):
    if file_name.endswith(".csv"):  # Process only CSV files
        file_path = os.path.join(input_dir, file_name)
        print(f"Processing {file_name}...")
        process_csv(file_path, id_to_name)

Processing frequent_itemsets_basket_7.csv...
Heatmap saved to Data/Heatmaps/frequent_itemsets_basket_7_heatmap.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_1.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_2.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_3.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_4.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_5.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_6.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_7.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_8.png
Graph saved to Data/Graphs/frequent_itemsets_basket_7_group_9.png
Processing frequent_itemsets_basket_6.csv...
Heatmap saved to Data/Heatmaps/frequent_itemsets_basket_6_heatmap.png
Graph saved to Data/Graphs/frequent_itemsets_basket_6_group_1.png
Graph saved to Data/Graphs/frequent_itemsets_basket_6_group_2.png
Graph saved to Data/Graphs/frequent_itemsets

## Evaluating the results: How similar are songs with high support