In [2]:
import pandas as pd

df = pd.read_csv('steam_tags.csv')
df

Unnamed: 0,tags
0,Indie
1,Singleplayer
2,Action
3,Adventure
4,Casual
...,...
2562,IndieMars
2563,IndieMagic
2564,IndieLore-Rich
2565,IndieLinear


In [3]:
#import rest of the libraries to execute this .py
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
tqdm.pandas()

model = SentenceTransformer('all-MiniLM-L6-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2

#creates a new column with the encoded text
df['text_vector_'] = df['tags'].progress_apply(lambda x : model.encode(x).tolist())
df

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
100%|██████████| 2567/2567 [00:28<00:00, 89.87it/s] 


Unnamed: 0,tags,text_vector_
0,Indie,"[0.053279560059309006, -0.09138833731412888, 0..."
1,Singleplayer,"[0.001120690256357193, -0.05869673565030098, 0..."
2,Action,"[-0.04115905985236168, 0.008662102743983269, -..."
3,Adventure,"[0.0013454982545226812, 0.08579752594232559, 0..."
4,Casual,"[-0.015388733707368374, -0.012919720262289047,..."
...,...,...
2562,IndieMars,"[0.047055501490831375, -0.0933956727385521, 0...."
2563,IndieMagic,"[0.08697230368852615, -0.0611506812274456, 0.0..."
2564,IndieLore-Rich,"[0.033941738307476044, -0.07493843883275986, -..."
2565,IndieLinear,"[-0.0227375328540802, -0.08573731034994125, 0...."


In [4]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
import numpy as np
import pandas as pd
import warnings
import hdbscan

warnings.simplefilter(action='ignore') #SUPPRESS ALL WARNINGS

def clustering_hdbscan(df):
    #clustering
    cluster = hdbscan.HDBSCAN(
        min_cluster_size=2,
        alpha=0.3,
        leaf_size=5,
        metric='euclidean',                      
        cluster_selection_method='eom'
    ).fit([x[0] for x in df.values])
    cluster_labels = cluster.labels_
    return cluster_labels

def clustering_function_affinity(df):
    # damping_value = 0.9
    affinity_propagation = AffinityPropagation()
    # display(df)
    cluster_labels = affinity_propagation.fit_predict([x[0] for x in df.values])
    n_clusters = len(np.unique(cluster_labels))
    return cluster_labels

def clustering_function_kmeans(df, n_clusters=5):
    while True:
        try:
            kmeans = KMeans(n_clusters=n_clusters)
            # display(df)
            cluster_labels = kmeans.fit_predict([x[0] for x in df.values])
            cluster_labels = kmeans.labels_
            return cluster_labels
        except:
            n_clusters -= 1

def hierarchical_clustering(kmeans_depth, df, depth):
    """
    kmeans_depth is a list that contains the number of clusters to use for kmeans at each level of depth
    Ex. kmeans_depth[0] is the number of clusters used for depth 1
    Ex. kmeans_depth[1] is the number of clusters used for each subcluster in depth 2
    """

    df = pd.DataFrame(df)

    # Function to drop columns containing the word "label"
    def drop_columns_containing_label(df, keyword='label'):
        columns_to_drop = [col for col in df.columns if keyword.lower() in col.lower()]
        df_dropped = df.drop(columns=columns_to_drop)
        return df_dropped

    # add the first cluster_labels
    depth_ = 1
    df[f'labels_{depth_}'] = clustering_function_kmeans(drop_columns_containing_label(df), n_clusters=kmeans_depth[depth_-1])
    # df[f'labels_{depth_}'] = clustering_hdbscan(drop_columns_containing_label(df))

    # add the following cluster_labels until depth is reached
    for _ in range(depth-1):
        list1 = list()
        for g in df.groupby([x for x in df.columns if 'labels' in x]):
            depth_ = len([x for x in df.columns if 'labels' in x]) + 1
            df_ = g[1]
            df_[f'labels_{depth_}'] = clustering_function_affinity(drop_columns_containing_label(df_))
            list1.append(df_)
        df = pd.concat(list1)
        df

    return df

In [5]:
# clustering
df_topics = hierarchical_clustering(kmeans_depth=[200], df=df['text_vector_'], depth=3)
df_topics['categories_eng'] = df.iloc[df_topics.index]['tags']

cols = ['labels_1', 'labels_2', 'labels_3']
df_topics

  File "c:\Users\ardit\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1465,"[0.00679501099511981, 0.11885671317577362, -0....",0,0,0,EducationDesign & Illustration
1930,"[0.01816725917160511, -0.020321648567914963, -...",0,1,0,EducationAnimation & Modeling
703,"[-0.024891158565878868, -0.07615001499652863, ...",1,0,0,SimulationSingleplayer
1032,"[-0.019346315413713455, -0.007711109705269337,...",1,0,1,SimulationArena Shooter
1237,"[-0.02915360778570175, 0.040611688047647476, -...",1,0,2,SimulationNaval Combat
...,...,...,...,...,...
529,"[0.023982785642147064, 0.09959866106510162, -0...",198,1,0,Cycling
634,"[-0.022003713995218277, 0.05222572386264801, -...",198,1,1,ATV
75,"[-0.006742512807250023, 0.039412710815668106, ...",199,0,0,Management
111,"[-0.00582854263484478, 0.06645624339580536, -0...",199,1,0,Resource Management


In [6]:
counter = 0
for df_ in df_topics.groupby(['labels_1', 'labels_2', 'labels_3']):
    display(df_[1])
    counter += 1
    if counter == 30: break

Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1465,"[0.00679501099511981, 0.11885671317577362, -0....",0,0,0,EducationDesign & Illustration


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1930,"[0.01816725917160511, -0.020321648567914963, -...",0,1,0,EducationAnimation & Modeling


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
703,"[-0.024891158565878868, -0.07615001499652863, ...",1,0,0,SimulationSingleplayer


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1032,"[-0.019346315413713455, -0.007711109705269337,...",1,0,1,SimulationArena Shooter
1510,"[-0.011868858709931374, 0.04921221733093262, -...",1,0,1,SimulationHero Shooter
1573,"[-0.015150941908359528, 0.03222223371267319, -...",1,0,1,SimulationTop-Down Shooter
1605,"[-0.00815537292510271, 0.0024146963842213154, ...",1,0,1,SimulationThird-Person Shooter


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1237,"[-0.02915360778570175, 0.040611688047647476, -...",1,0,2,SimulationNaval Combat


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1048,"[-0.024791916832327843, 0.0016261069104075432,...",1,1,0,SimulationPinball


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
61,"[-0.017763419076800346, -0.059603627771139145,...",2,0,0,Visual Novel


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
371,"[0.04115910828113556, -0.07427249848842621, 0....",2,1,0,IndieVisual Novel
474,"[-0.0020717610605061054, 0.0037615334149450064...",2,1,0,CasualVisual Novel
586,"[0.014427917078137398, -0.09553801268339157, 0...",2,1,0,IndieInteractive Fiction
1057,"[0.01532629132270813, -0.06249205395579338, 0....",2,1,0,IndieDark Fantasy
1736,"[0.047733016312122345, -0.07112820446491241, -...",2,1,0,IndieVillain Protagonist
2466,"[0.025852752849459648, -0.0773003026843071, 0....",2,1,0,IndieComic Book


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
483,"[-0.10134214162826538, 0.0025913002900779247, ...",2,1,1,Based On A Novel


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2061,"[-0.0076070972718298435, 0.10626905411481857, ...",3,0,0,AdventureHidden Object


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2013,"[0.0005366362747736275, 0.05592409893870354, -...",3,1,0,AdventureTop-Down


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
868,"[-0.05542542785406113, 0.04941685497760773, 0....",3,1,1,AdventurePlatformer
1097,"[-0.07470724731683731, 0.0011902316473424435, ...",3,1,1,AdventureMultiplayer
2372,"[-0.016168612986803055, 0.11478349566459656, -...",3,1,1,AdventureCartoony
2415,"[-0.04287803918123245, 0.09324570000171661, -0...",3,1,1,AdventureCombat


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
72,"[-0.046611737459897995, 0.0686585083603859, -0...",4,0,0,Tactical


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1211,"[-0.11242984980344772, 0.09316850453615189, -0...",4,1,0,StrategyAuto Battler


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
559,"[-0.05099041759967804, -0.0365036316215992, -0...",4,1,1,StrategyVR
1036,"[-0.02660423330962658, 0.03930097073316574, -0...",4,1,1,StrategyNaval Combat
1631,"[-0.03707116097211838, 0.03551387041807175, -0...",4,1,1,StrategyNaval
1633,"[-0.03435828164219856, 0.06634855270385742, -0...",4,1,1,StrategyDrama


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
622,"[-0.1038614958524704, 0.017301885411143303, -0...",4,2,0,StrategyIdler
947,"[-0.037820298224687576, 0.022725917398929596, ...",4,2,0,StrategyCats
981,"[-0.057531699538230896, 0.07827965915203094, -...",4,2,0,StrategyTanks
1639,"[-0.0748552605509758, 0.024112505838274956, -0...",4,2,0,StrategyDark Fantasy
1920,"[-0.10613927990198135, 0.07796458154916763, -0...",4,2,0,StrategyDark
1935,"[-0.08879886567592621, 0.003158039413392544, -...",4,2,0,StrategyLinear


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2072,"[-0.05747454985976219, 0.07347848266363144, -0...",4,3,0,StrategyShop Keeper


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
258,"[-0.009479526430368423, -0.014515984803438187,...",5,0,0,SportsSports
719,"[-0.02519829571247101, -0.013761961832642555, ...",5,0,0,SportsVR
1300,"[0.041649699211120605, 0.04606030136346817, -0...",5,0,0,SportsFlight
1552,"[-0.055406779050827026, -0.0041481428779661655...",5,0,0,SportsCollectathon
1625,"[-0.0301741361618042, 0.008230865001678467, 0....",5,0,0,Sports2D
2408,"[-0.05017208307981491, 0.04514428600668907, -0...",5,0,0,SportsFantasy
2419,"[-0.007273668423295021, -0.03471531346440315, ...",5,0,0,SportsSpace
2439,"[-0.050811853259801865, 0.059244126081466675, ...",5,0,0,SportsSnow
2455,"[-0.02926877699792385, 0.03284667059779167, -0...",5,0,0,SportsRunner
2460,"[-2.528719778638333e-05, 0.05712643265724182, ...",5,0,0,SportsShort


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
325,"[-0.04182858392596245, 0.010790975764393806, 0...",5,1,0,SportsAction


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
376,"[0.020362842828035355, 0.0440136194229126, -0....",5,1,1,SportsCasual
524,"[-0.03176463022828102, 0.13497547805309296, 0....",5,1,1,SportsAdventure
2383,"[0.0025774117093533278, 0.11816635727882385, -...",5,1,1,SportsNature
2399,"[0.010485419072210789, 0.11339229345321655, -0...",5,1,1,SportsCute


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
1553,"[-0.05994141846895218, 0.053088486194610596, -...",5,2,0,SportsCo-op


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2397,"[-0.05597424507141113, 0.020889559760689735, -...",5,2,1,SportsCozy


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2413,"[0.024134401232004166, 0.0012138790916651487, ...",5,3,0,SportsOpen World


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2421,"[0.009149710647761822, 0.04854391887784004, -0...",5,4,0,SportsSplit Screen


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2382,"[-0.051146816462278366, 0.028620941564440727, ...",5,5,0,SportsMusic


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
418,"[-0.07196696102619171, 0.05813470482826233, -0...",5,5,1,SportsIndie
1061,"[0.011126547120511532, 0.03918803110718727, -0...",5,5,1,SportsArcade
2373,"[0.047000862658023834, 0.04858908802270889, -0...",5,5,1,SportsLogic
2375,"[-0.002150478772819042, 0.07934700697660446, -...",5,5,1,SportsMechs
2376,"[0.00019084702944383025, 0.04828477278351784, ...",5,5,1,SportsGore
2377,"[-0.013949261978268623, 0.04222911596298218, 0...",5,5,1,SportsMemes
2431,"[-0.02093004807829857, 0.05348697677254677, 0....",5,5,1,Sportse-sports
2454,"[-0.007246903609484434, 0.04908309131860733, -...",5,5,1,SportsRogue-lite


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
810,"[0.00650976225733757, 0.10537339746952057, -0....",5,6,0,SportsTennis
1526,"[0.027465784922242165, 0.07371950149536133, -0...",5,6,0,SportsSurvival


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2384,"[0.010979237034916878, 0.03651013225317001, -0...",5,6,1,SportsOffroad
2452,"[-0.007726949639618397, 0.07962332665920258, -...",5,6,1,SportsRetro
2534,"[-0.022693322971463203, 0.07562263309955597, -...",5,6,1,SportsAnime


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
2457,"[0.002668068977072835, 0.07559897005558014, -0...",5,7,0,SportsScore Attack


Unnamed: 0,text_vector_,labels_1,labels_2,labels_3,categories_eng
938,"[-0.02513885870575905, -0.09530197829008102, 0...",6,0,0,IndieIdler
