In [5]:
import os

import pandas as pd

# Read the four tab-separated source tables. tags.dat is the only file that is
# decoded explicitly as ISO-8859-1; the others use the pandas default encoding.
artists = pd.read_csv('dataset/artists.dat', sep='\t')
tags = pd.read_csv('dataset/tags.dat', sep='\t', encoding='ISO-8859-1')
user_artists = pd.read_csv('dataset/user_artists.dat', sep='\t')
user_tags = pd.read_csv('dataset/user_taggedartists.dat', sep='\t')

# Shapes before filtering, to compare against the shapes printed further down.
print(user_artists.shape, user_tags.shape)

# Step 1: keep only interactions whose artistID is present in the artists table.
valid_artist_ids = set(artists['id'])
plays_have_known_artist = user_artists['artistID'].isin(valid_artist_ids)
tags_have_known_artist = user_tags['artistID'].isin(valid_artist_ids)
user_artists = user_artists[plays_have_known_artist]
user_tags = user_tags[tags_have_known_artist]

# Step 2: of the remaining tag assignments, keep only those whose tagID is
# present in the tags table.
valid_tag_ids = set(tags['tagID'])
tags_have_known_tag = user_tags['tagID'].isin(valid_tag_ids)
user_tags = user_tags[tags_have_known_tag]

# Shapes after filtering.
print(user_artists.shape, user_tags.shape)

# Step 3: persist the filtered interaction tables next to the raw data.
user_artists.to_csv("dataset/cleaned_user_artists.csv", index=False)
user_tags.to_csv("dataset/cleaned_user_tags.csv", index=False)

# In the recorded run below, user_artists kept all 92,834 rows and user_tags
# went from 186,479 to 184,941 rows, so the raw data was already quite clean.

(92834, 3) (186479, 6)
(92834, 3) (184941, 6)


In [11]:
# Step 1: Define the remapping function with consistent mappings
def analyze_and_remap_ids(df, column, id_name, existing_mapping=None):
    """Remap the IDs in ``df[column]`` through a mapping and print a summary.

    When ``existing_mapping`` is None, a new mapping is built by sorting the
    column's unique values and numbering them 0, 1, 2, ... in that order.
    Otherwise the supplied mapping is applied unchanged, which lets several
    frames share one ID space.

    Args:
        df: DataFrame containing ``column``. It is not modified; the
            remapping is applied to a copy.
        column: Name of the ID column to remap.
        id_name: Label used in the printed summary and in error messages.
        existing_mapping: Optional ``{old_id: new_id}`` dict to reuse.

    Returns:
        ``(remapped_df, mapping)``: a copy of ``df`` whose ``column`` holds the
        mapped IDs, and the mapping that was applied.

    Raises:
        ValueError: if ``column`` contains values with no entry in the mapping.
    """
    if existing_mapping is None:
        sorted_ids = sorted(df[column].unique())
        mapping = {old_id: new_id for new_id, old_id in enumerate(sorted_ids)}
    else:
        mapping = existing_mapping  # Reuse the caller's mapping as-is

    mapped = df[column].map(mapping)

    # Series.map yields NaN for values that are not keys of the mapping. Fail
    # loudly here rather than letting NaN turn the column into floats and
    # surface later as an unrelated error.
    unmapped = mapped.isna().to_numpy()
    if unmapped.any():
        missing = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"{id_name}: {len(missing)} value(s) in column '{column}' have no "
            f"entry in the ID mapping (first few: {missing[:10]})"
        )

    # Work on a copy so the caller's frame (possibly a filtered slice of
    # another frame) is never written to in place.
    df = df.copy()
    df[column] = mapped

    # Print analysis
    total_ids = len(mapping)
    if mapped.empty:
        # No rows: there is no ID range to analyse.
        min_id = max_id = None
        gaps = 0
    else:
        min_id = int(mapped.min())
        max_id = int(mapped.max())
        # Every present ID lies in [min_id, max_id], so the number of absent
        # integers is the range size minus the number of distinct IDs.
        gaps = (max_id - min_id + 1) - int(mapped.nunique())
    print(f"{id_name} ID Analysis:")
    print(f" - Min ID: {min_id}")
    print(f" - Max ID: {max_id}")
    print(f" - Total Unique IDs: {total_ids}")
    print(f" - Number of Gaps in the Range: {gaps}")

    return df, mapping

def verify_mapping_consistency(df, column, mapping, id_name):
    """Check a remapped ID column against the mapping that produced it.

    Two checks run in order; the first failure prints an error and returns
    False:

    1. Every distinct value in ``df[column]`` is one of ``0 .. len(mapping)-1``.
    2. No two keys of ``mapping`` share the same target value.

    Args:
        df: DataFrame holding the remapped column.
        column: Name of the remapped ID column.
        mapping: The ``{old_id: new_id}`` dict that was applied.
        id_name: Label used in the printed messages.

    Returns:
        True when both checks pass, False otherwise.
    """
    allowed = set(range(len(mapping)))
    observed = set(df[column].unique())

    # Anything observed but not allowed is an out-of-range ID.
    stray = observed - allowed
    if stray:
        print(f"Error: {id_name} contains IDs outside the expected range!")
        print(f"Unexpected IDs: {stray}")
        return False

    # Fewer distinct targets than keys means two old IDs collapsed together.
    distinct_targets = set(mapping.values())
    if len(distinct_targets) < len(mapping):
        print(f"Error: {id_name} mapping is not bijective!")
        return False

    print(f"{id_name} mapping is correct and consistent.")
    return True

# Step 2: Remap artist IDs. The mapping is built from the artists table and
# then reused for both interaction tables so all three share one ID space.
artists, artist_id_mapping = analyze_and_remap_ids(artists, "id", "Artist")
user_artists, _ = analyze_and_remap_ids(user_artists, "artistID", "User-Artist (Artist ID)", artist_id_mapping)
user_tags, _ = analyze_and_remap_ids(user_tags, "artistID", "User-Tag (Artist ID)", artist_id_mapping)

# Step 3: Remap tag IDs. The mapping is built from the tags table and reused
# for the user-tag interactions.
tags, tag_id_mapping = analyze_and_remap_ids(tags, "tagID", "Tag")
user_tags, _ = analyze_and_remap_ids(user_tags, "tagID", "User-Tag (Tag ID)", tag_id_mapping)

# Step 4: Remap user IDs. There is no users table here, so the mapping is built
# from the user-artist interactions and reused for the user-tag interactions.
user_artists, user_id_mapping = analyze_and_remap_ids(user_artists, "userID", "User-Artist (User ID)")
user_tags, _ = analyze_and_remap_ids(user_tags, "userID", "User-Tag (User ID)", user_id_mapping)

# Step 5: Validate mappings. Each check prints its own diagnosis; the assert
# stops the cell before anything is written if a check fails.
assert verify_mapping_consistency(artists, "id", artist_id_mapping, "Artist"), "Artist ID validation failed"
assert verify_mapping_consistency(user_artists, "artistID", artist_id_mapping, "User-Artist (Artist ID)"), "User-Artist artist ID validation failed"
assert verify_mapping_consistency(user_tags, "artistID", artist_id_mapping, "User-Tag (Artist ID)"), "User-Tag artist ID validation failed"

assert verify_mapping_consistency(tags, "tagID", tag_id_mapping, "Tag"), "Tag ID validation failed"
assert verify_mapping_consistency(user_tags, "tagID", tag_id_mapping, "User-Tag (Tag ID)"), "User-Tag tag ID validation failed"

assert verify_mapping_consistency(user_artists, "userID", user_id_mapping, "User-Artist (User ID)"), "User-Artist user ID validation failed"
assert verify_mapping_consistency(user_tags, "userID", user_id_mapping, "User-Tag (User ID)"), "User-Tag user ID validation failed"

# Step 6: Save the remapped tables. to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs("dataset/remapped", exist_ok=True)
artists.to_csv("dataset/remapped/artists.csv", index=False)
tags.to_csv("dataset/remapped/tags.csv", index=False)
user_artists.to_csv("dataset/remapped/user_artists.csv", index=False)
user_tags.to_csv("dataset/remapped/user_tags.csv", index=False)

Artist ID Analysis:
 - Min ID: 0
 - Max ID: 17631
 - Total Unique IDs: 17632
 - Number of Gaps in the Range: 0
User-Artist (Artist ID) ID Analysis:
 - Min ID: 0
 - Max ID: 17631
 - Total Unique IDs: 17632
 - Number of Gaps in the Range: 0
User-Tag (Artist ID) ID Analysis:
 - Min ID: 0
 - Max ID: 17630
 - Total Unique IDs: 17632
 - Number of Gaps in the Range: 5498
Tag ID Analysis:
 - Min ID: 0
 - Max ID: 11945
 - Total Unique IDs: 11946
 - Number of Gaps in the Range: 0
User-Tag (Tag ID) ID Analysis:
 - Min ID: 0
 - Max ID: 11944
 - Total Unique IDs: 11946
 - Number of Gaps in the Range: 2227
User-Artist (User ID) ID Analysis:
 - Min ID: 0
 - Max ID: 1891
 - Total Unique IDs: 1892
 - Number of Gaps in the Range: 0
User-Tag (User ID) ID Analysis:
 - Min ID: 0
 - Max ID: 1891
 - Total Unique IDs: 1892
 - Number of Gaps in the Range: 1
Artist mapping is correct and consistent.
User-Artist (Artist ID) mapping is correct and consistent.
User-Tag (Artist ID) mapping is correct and consistent