In [48]:
from utils import get_dataframes

# Load the project dataframes through the local `utils` helper.
# NOTE(review): the meaning of the three positional arguments ("", True, False)
# is not visible from this notebook — confirm against utils.get_dataframes.
dataframes, effect_data = get_dataframes("", True, False)

In [49]:
# Keep only the columns needed for the like/dislike analysis.
characters_liking = dataframes["characters"][["name", "likes", "dislikes"]]

# Drop characters whose "likes" entry has length zero, then renumber rows from 0.
has_likes = characters_liking["likes"].str.len().ne(0)
df = characters_liking.loc[has_likes].reset_index(drop=True)

In [50]:
def parse_and_flatten_list(list_str: list[list[str]]) -> list[str]:
    """
    Flatten a list of single-item lists into a flat list of their first elements.

    Example: [["Plüschtiere"], ["Alleinsein"]] -> ["Plüschtiere", "Alleinsein"]

    Args:
        list_str: An iterable of indexable entries (despite the parameter name,
            this is not a string). Only the first element of each entry is kept.

    Returns:
        The first element of every well-formed entry, in input order. Entries
        that are empty or not indexable are reported and skipped. A non-iterable
        input (e.g. None or NaN from a missing cell) is reported and yields [].
    """
    string_list: list[str] = []
    try:
        entries = iter(list_str)
    except TypeError as e:
        print("An exception occurred:", e)
        return string_list

    for sub in entries:
        try:
            string_list.append(sub[0])
        except (LookupError, TypeError) as e:
            # Skip only the malformed entry; one bad sub-list must not discard
            # every other item that belongs to the same row.
            print("An exception occurred:", e)
    return string_list


# Flatten both preference columns from nested single-item lists to flat lists.
# NOTE: this step is not idempotent. Running it a second time on the already
# flattened columns would index into each string and keep only its first
# character, so rebuild `df` before re-running this cell.
for column in ("likes", "dislikes"):
    df[column] = df[column].apply(parse_and_flatten_list)
df

Unnamed: 0,name,likes,dislikes
0,Amlin,"[Plüschtiere, Alleinsein, Ordnung, Kontrolle, ...","[Faule Personen, Emotionale Personen, Festivit..."
1,Ar-Merer,"[Gerichtsprozesse, Gewinnen, Aufmerksamkeit se...","[Gruppenarbeiten, Unrecht haben, Familie, Wide..."
2,Avila,"[Pärchen beobachten, Flauschige Tiere, Kamille...","[Streit, Pöbelnde Personen]"
3,Burdig,"[Apfelwein, Karl, Gute Gespräche mit Kunden, F...","[Voreingenommene Personen, Schmarotzer, Aufdri..."
4,Carmesine,"[Glaube, Kräutertee, Katzen, Ruhe]","[Unordnung, Unruhe, Insekten, Spät aufstehen]"
5,Cordelia,"[Ehre, Stärke beweisen, Ausritte zu Pferd, Hag...","[Krieg, Saufgelage, Deserteure]"
6,Fubuki,"[Süßigkeiten, Süßspeisen, Große Klingenwaffen,...","[Hitze, Bitteres Essen, Wie ein Kind behandelt..."
7,Grenze,"[Stärke, Mut, Duelle, Kontrolle, Anerkennung]","[Menschen, Schwächlinge, Übermut, Erinnerungen..."
8,Hindrik,"[Holz, Regen, Wald, Natur, Selbstständigkeit, ...","[Menschenmengen, Tiere schlachten, Holzsplitte..."
9,Iddra,"[Geld, Katzen, Orangen, Macht, Untergebene, Ro...","[Ungehorsamkeit, Widerworte, Arroganz, Unsaube..."


In [52]:
# Gather every distinct like/dislike across all characters.
unique_items = {
    item
    for column in ("likes", "dislikes")
    for items in df[column]
    for item in items
}

# Sort so that each item keeps the same index on every run. The iteration order
# of a set of strings can change between kernel restarts (string hashing is
# randomised per process), which would silently reshuffle the item vectors and
# the cluster labels computed from this list.
all_items = sorted(unique_items)
print(f"Found {len(all_items)} unique items.")

Found 191 unique items.


In [None]:
import numpy as np
import spacy

# The German pipeline must be installed beforehand, e.g. via
# `python -m spacy download de_core_news_lg`.
nlp = spacy.load("de_core_news_lg")

# Embed every item. `Doc.vector` is a single fixed-length vector for the whole
# phrase; its dimensionality depends on the loaded model.
item_vectors = np.array([nlp(item).vector for item in all_items])
print("Vectors shape:", item_vectors.shape)  # (num_items, vector_dim)

In [None]:
from sklearn.cluster import KMeans

k = 10  # number of clusters; adjust to taste
# An integer random_state makes the centroid initialisation deterministic for
# a given input matrix.
kmeans = KMeans(n_clusters=k, random_state=42).fit(item_vectors)

# One cluster id per row of item_vectors, in the same order; shape (num_items,).
labels = kmeans.labels_