In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.config import DATA_RAW

In [None]:
def _read_tsv(path):
    """Read a tab-separated file into a DataFrame, leaving every other option at pandas' default."""
    return pd.read_csv(path, sep="\t")


# Load each raw table from the DATA_RAW directory.
tracks = _read_tsv(f"{DATA_RAW}/tracks.dat")
artists = _read_tsv(f"{DATA_RAW}/artists.dat")
tags = _read_tsv(f"{DATA_RAW}/tags.dat")
user_artists = _read_tsv(f"{DATA_RAW}/user_artists.dat")
user_tagged = _read_tsv(f"{DATA_RAW}/user_taggedtracks.dat")

# The cell's value is a tuple holding the first rows of three of the tables.
(tracks.head(), artists.head(), tags.head())

In [None]:
# Report the number of rows in each loaded table.
print("Tracks:", len(tracks))
print("Artists:", len(artists))
print("Tags:", len(tags))
# "\u2013" is an en dash. It is written as an escape so the label survives
# any re-encoding of the notebook file (the previous literal was mojibake).
print("User\u2013Artist Interactions:", len(user_artists))
print("Tagged Tracks:", len(user_tagged))


In [None]:
# Total the "weight" column per artistID and keep the 20 largest totals.
weight_per_artist = user_artists.groupby("artistID")["weight"].sum()
heaviest = weight_per_artist.sort_values(ascending=False).head(20).reset_index()

# Attach each artist's name by matching artistID against artists.id
# (merge's default join, so IDs absent from `artists` are dropped).
artist_names = artists[["id", "name"]]
top_artists = heaviest.merge(artist_names, left_on="artistID", right_on="id")

# Horizontal bars: one per artist name, bar length is the summed weight.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_artists, x="weight", y="name", ax=ax)
ax.set_title("Top 20 Most Popular Artists")
plt.show()

In [None]:
# Count how often each tagValue occurs in `tags` and chart the 20 most common.
# NOTE(review): if tags.dat is a lookup table with one row per distinct tag,
# every count here is 1 and the "top 20" is arbitrary; usage frequency would
# then have to come from the tagging events (user_tagged). Confirm the schema.
tag_counts = tags["tagValue"].value_counts().head(20)

fig, ax = plt.subplots(figsize=(12, 6))
tag_counts.plot(kind="bar", ax=ax)
ax.set_title("Most Frequent Tags")
plt.show()

In [None]:
# Number of user_artists rows recorded for each userID.
user_counts = user_artists["userID"].value_counts()

# Histogram of those per-user row counts, split into 50 bins.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(user_counts, bins=50, ax=ax)
ax.set(
    title="User Activity Distribution (#Artists Listened)",
    xlabel="Number of Artists",
    ylabel="User Count",
)
plt.show()