In [3]:
import json
import pandas as pd

In [4]:
champion_file = '../data/championFull.json'
with open(champion_file) as f:
    champs = json.load(f) 
# what we need is champs['data']  champs['keys'] 


## Prepare data


In [4]:
champ_id_name = champs['keys'] 
all_champion_data = champs['data']
champion_item = []

for k,v in all_champion_data.items():
    champion_dict = {}
    champion_dict['id'] = v['key']
    champion_dict['name'] = v['name']
    champion_dict['title'] = v['title']
    champion_dict['tags'] = v['tags']
    champion_dict['attack_sta'] = v['info']['attack']
    champion_dict['defense_sta'] = v['info']['defense']
    champion_dict['magic_sta'] = v['info']['magic']
    champion_dict['difficulty_sta'] = v['info']['difficulty']
    
    spell = v['spells']
    spell_info = []
    for i in spell:
        spell_info.append(i['description'])
    
    spell_desc = ' '.join(spell_info)
    champion_dict['ability_desc'] = spell_desc
    
    champion_item.append(champion_dict)

In [5]:
columns = ['id','name','title','tags','attack_sta','defense_sta','magic_sta','difficulty_sta','ability_desc']
champ_pd = pd.DataFrame(champion_item)
champ_pd.head()

Unnamed: 0,id,name,title,tags,attack_sta,defense_sta,magic_sta,difficulty_sta,ability_desc
0,266,Aatrox,the Darkin Blade,"[Fighter, Tank]",8,4,3,4,"Aatrox slams his greatsword down, dealing phys..."
1,103,Ahri,the Nine-Tailed Fox,"[Mage, Assassin]",3,4,8,5,"Ahri sends out and pulls back her orb, dealing..."
2,84,Akali,the Rogue Assassin,[Assassin],5,3,8,7,"Akali throws out five kunai, dealing damage ba..."
3,12,Alistar,the Minotaur,"[Tank, Support]",6,9,5,7,"Alistar smashes the ground, dealing damage to ..."
4,32,Amumu,the Sad Mummy,"[Tank, Mage]",2,6,8,3,"Amumu tosses a sticky bandage at a target, stu..."


## Make clustering

In [6]:
#1. cluster based on tags
tags = list(champ_pd.tags)
tag_set = set()
for i in tags:
    for j in i:
        tag_set.add(j)

# build a cluster based on tag
cluster_dict = {}
for i in tag_set:
    cluster_dict.setdefault(i,[])
    
for index,row in champ_pd.iterrows():
    tags = row['tags']
    name = row['name']
    for each_tag in tags:
        cluster_dict[each_tag].append(name)  
        
for tag,champs in cluster_dict.items():
    print(f'**{tag}**:{champs}')
    print('='*117)

**Marksman**:['Aphelios', 'Ashe', 'Azir', 'Caitlyn', 'Corki', 'Draven', 'Ezreal', 'Graves', 'Jayce', 'Jhin', 'Jinx', "Kai'Sa", 'Kalista', 'Kennen', 'Kindred', "Kog'Maw", 'Lucian', 'Miss Fortune', 'Quinn', 'Samira', 'Senna', 'Sivir', 'Teemo', 'Tristana', 'Twitch', 'Varus', 'Vayne', 'Xayah']
**Fighter**:['Aatrox', 'Blitzcrank', 'Camille', 'Darius', 'Diana', 'Dr. Mundo', 'Ekko', 'Elise', 'Fiora', 'Fizz', 'Gangplank', 'Garen', 'Gnar', 'Gragas', 'Hecarim', 'Illaoi', 'Irelia', 'Jarvan IV', 'Jax', 'Jayce', 'Kayle', 'Kayn', 'Kled', 'Lee Sin', 'Lillia', 'Malphite', 'Master Yi', 'Wukong', 'Mordekaiser', 'Nasus', 'Nautilus', 'Nocturne', 'Nunu & Willump', 'Olaf', 'Ornn', 'Pantheon', 'Poppy', 'Qiyana', 'Rammus', "Rek'Sai", 'Renekton', 'Rengar', 'Riven', 'Rumble', 'Ryze', 'Sejuani', 'Sett', 'Shyvana', 'Singed', 'Sion', 'Skarner', 'Swain', 'Taric', 'Thresh', 'Trundle', 'Tryndamere', 'Udyr', 'Urgot', 'Vi', 'Volibear', 'Warwick', 'Xin Zhao', 'Yasuo', 'Yone', 'Yorick', 'Zac']
**Tank**:['Aatrox', 'Alista

In [7]:
#2. cluster based on statistics 
stats_pd = pd.DataFrame({'name':champ_pd.name,'attack':champ_pd.attack_sta,'defense':champ_pd.defense_sta,\
                         'magic':champ_pd.magic_sta,'diff':champ_pd.difficulty_sta})
print(stats_pd.head(),end='\n\n')


stats_pd.attack = (stats_pd.attack - stats_pd.attack.describe()['mean'])/ stats_pd.attack.describe()['std']
stats_pd.defense = (stats_pd.defense - stats_pd.defense.describe()['mean'])/ stats_pd.defense.describe()['std']
stats_pd.magic = (stats_pd.magic - stats_pd.magic.describe()['mean'])/ stats_pd.magic.describe()['std']
stats_pd['diff'] = (stats_pd['diff'] - stats_pd['diff'].describe()['mean'])/ stats_pd['diff'].describe()['std']
print(stats_pd.head(),end='\n\n')

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=10).fit(stats_pd[['attack','defense','magic','diff']])
cluster_res = kmeans.labels_
cluster_res
cluster_kmeans = {}
for index,cluster_id in enumerate(cluster_res):
    name = stats_pd.iloc[index]['name']
    cluster_kmeans[cluster_id] = cluster_kmeans.get(cluster_id,[])
    cluster_kmeans[cluster_id].append(name)
for tag,champs in cluster_kmeans.items():
    print(f'**{tag}**:{champs}')
    print('='*117) 

      name  attack  defense  magic  diff
0   Aatrox       8        4      3     4
1     Ahri       3        4      8     5
2    Akali       5        3      8     7
3  Alistar       6        9      5     7
4    Amumu       2        6      8     3

      name    attack   defense     magic      diff
0   Aatrox  0.899647 -0.325225 -0.864117 -0.849768
1     Ahri -0.887810 -0.325225  0.953177 -0.370981
2    Akali -0.172827 -0.806685  0.953177  0.586593
3  Alistar  0.184664  2.082075 -0.137200  0.586593
4    Amumu -1.245301  0.637695  0.953177 -1.328555

**1**:['Aatrox', 'Ashe', 'Camille', 'Darius', 'Fiora', 'Garen', 'Graves', 'Illaoi', 'Kindred', 'Master Yi', 'Miss Fortune', 'Wukong', 'Nocturne', 'Olaf', 'Pantheon', 'Quinn', "Rek'Sai", 'Renekton', 'Sett', 'Shyvana', 'Sivir', 'Skarner', 'Tristana', 'Trundle', 'Tryndamere', 'Varus', 'Vi', 'Warwick', 'Xayah', 'Xin Zhao']
**4**:['Ahri', 'Amumu', 'Brand', "Cho'Gath", 'Diana', 'Fizz', 'Ivern', 'Janna', 'Jax', 'Karma', 'Kayle', 'Kennen', 'Lissandra

In [8]:
#3. clusetr based on skill descrpitions

In [9]:
chap_desc_pd =  pd.DataFrame({'name':champ_pd.name,'skill_desc':champ_pd.ability_desc})
chap_desc_pd.head()

Unnamed: 0,name,skill_desc
0,Aatrox,"Aatrox slams his greatsword down, dealing phys..."
1,Ahri,"Ahri sends out and pulls back her orb, dealing..."
2,Akali,"Akali throws out five kunai, dealing damage ba..."
3,Alistar,"Alistar smashes the ground, dealing damage to ..."
4,Amumu,"Amumu tosses a sticky bandage at a target, stu..."


In [67]:
import spacy
import time
import multiprocessing
import gensim
from gensim.models import Word2Vec

In [53]:
nlp = spacy.load('en_core_web_sm')
nlp.Defaults.stop_words |= {"dealing",'damage','deal','enemy','damages'}

all_tokens = []
for index, row in chap_desc_pd.iterrows():
    desc_tokens = []
    doc = nlp(row['skill_desc'])
    for token in doc:
        lexeme = nlp.vocab[token.text]
        if lexeme.is_stop == False and lexeme.is_punct == False:
            desc_tokens.append(lexeme.text)
    all_tokens.append(desc_tokens)
        
chap_desc_pd['tokens'] = all_tokens
chap_desc_pd.head()
# chap_desc_pd.tokens.describe()['top']  check the top words 

Unnamed: 0,name,skill_desc,tokens
0,Aatrox,"Aatrox slams his greatsword down, dealing phys...","[Aatrox, slams, greatsword, physical, swing, t..."
1,Ahri,"Ahri sends out and pulls back her orb, dealing...","[Ahri, sends, pulls, orb, magic, way, true, wa..."
2,Akali,"Akali throws out five kunai, dealing damage ba...","[Akali, throws, kunai, based, bonus, Attack, A..."
3,Alistar,"Alistar smashes the ground, dealing damage to ...","[Alistar, smashes, ground, nearby, enemies, to..."
4,Amumu,"Amumu tosses a sticky bandage at a target, stu...","[Amumu, tosses, sticky, bandage, target, stunn..."


In [63]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
w2v_model = Word2Vec(min_count=1,
                     window=4,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)
sentences = list(chap_desc_pd.tokens)

t = time.time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))

#获取model里面的说有关键词
keys=w2v_model.wv.vocab.keys()

#获取词对于的词向量
wordvector=[]
for key in keys:
    wordvector.append(w2v_model[key])
len(wordvector)

Time to build vocab: 0.01 mins




2189

In [77]:
# use doc2Vec 关于doc2Vec的使用？？？
from gensim.models import Doc2Vec

doc_label = list(chap_desc_pd['name'])
setences = list(chap_desc_pd.tokens)


LabeledSentence1 = gensim.models.doc2vec.TaggedDocument
all_content_train = []
j=0
for em in setences:
    all_content_train.append(LabeledSentence1(em,[doc_label[j]]))
    j+=1
print('Number of texts processed:', j)

d2v_model = Doc2Vec(all_content_train, size = 300, window = 5, min_count = 2, workers=7, dm = 1,alpha=0.025, min_alpha=0.001)
d2v_model.train(all_content_train, total_examples=d2v_model.corpus_count, epochs=10, start_alpha=0.002, end_alpha=-0.016)

Number of texts processed: 151




In [89]:
kmeans_model = KMeans(n_clusters=5, init='k-means++', max_iter=100) 
X = kmeans_model.fit(d2v_model.docvecs.doctag_syn0)
cluster_res2=kmeans_model.labels_

cluster_res2

  


array([1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1,
       1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 3,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3,
       3, 2, 2, 3, 2, 3, 3, 2, 2, 2, 3, 0, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3,
       3, 3, 3, 3, 3, 3, 0, 3, 2, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 4, 0, 4, 3, 0, 0, 3, 4, 4, 3, 4, 3, 4, 4, 0, 4, 4],
      dtype=int32)

In [90]:
cluster_kmeans = {}
for index,cluster_id in enumerate(cluster_res):
    name = stats_pd.iloc[index]['name']
    cluster_kmeans[cluster_id] = cluster_kmeans.get(cluster_id,[])
    cluster_kmeans[cluster_id].append(name)
for tag,champs in cluster_kmeans.items():
    print(f'**{tag}**:{champs}')
    print('='*117) 

**1**:['Aatrox', 'Ashe', 'Camille', 'Darius', 'Fiora', 'Garen', 'Graves', 'Illaoi', 'Kindred', 'Master Yi', 'Miss Fortune', 'Wukong', 'Nocturne', 'Olaf', 'Pantheon', 'Quinn', "Rek'Sai", 'Renekton', 'Sett', 'Shyvana', 'Sivir', 'Skarner', 'Tristana', 'Trundle', 'Tryndamere', 'Varus', 'Vi', 'Warwick', 'Xayah', 'Xin Zhao']
**4**:['Ahri', 'Amumu', 'Brand', "Cho'Gath", 'Diana', 'Fizz', 'Ivern', 'Janna', 'Jax', 'Karma', 'Kayle', 'Kennen', 'Lissandra', 'Lulu', 'Lux', 'Mordekaiser', 'Morgana', 'Nami', 'Nautilus', 'Nunu & Willump', 'Rakan', 'Sona', 'Soraka', 'Sylas', 'Taliyah', 'Teemo', 'Thresh', 'Vladimir', 'Yuumi', 'Zac', 'Ziggs', 'Zilean', 'Zoe']
**0**:['Akali', 'Anivia', 'Annie', 'Aurelion Sol', 'Azir', 'Bard', 'Cassiopeia', 'Ekko', 'Elise', 'Evelynn', 'Fiddlesticks', 'Heimerdinger', 'Karthus', 'Kassadin', 'Katarina', 'LeBlanc', 'Lillia', 'Malzahar', 'Neeko', 'Nidalee', 'Orianna', 'Qiyana', 'Rumble', 'Ryze', 'Swain', 'Syndra', 'Twisted Fate', 'Veigar', "Vel'Koz", 'Viktor', 'Xerath', 'Zyra']
