In [1]:
# Import the necessary libraries:
import os  # For reading files and managing paths
import numpy as np  # For performing mathematical operations
from scipy.sparse import lil_matrix  # For handling sparse matrices
from sklearn.decomposition import TruncatedSVD  # For Singular Value Decomposition (SVD)
from sklearn.metrics.pairwise import cosine_similarity  # For calculating cosine similarity between vectors

In [2]:
# Define the path to the corpus folder and obtain the list of text files
corpus_folder = "./corpus"
file_names = [f for f in os.listdir(corpus_folder) if f.endswith(".txt")]

# Initialize an empty list to store the words from the corpus
corpus = []

# Read each text file in the corpus folder and append the words to the corpus list
for file_name in file_names:
    file_path = os.path.join(corpus_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as corpusFile:
        for linea in corpusFile:
            word_line = linea.strip().split()
            corpus.extend(word_line)

In [3]:
# Function to create a co-occurrence matrix from the corpus with a given window size
def create_co_occurrence_matrix(corpus, window_size=4):
    vocab = set(corpus)  # Create a set of unique words in the corpus
    word2id = {word: i for i, word in enumerate(vocab)}  # Create a word-to-index dictionary for the words
    id2word = {i: word for i, word in enumerate(vocab)}  # Create an index-to-word dictionary for the words
    matrix = lil_matrix((len(vocab), len(vocab)))  # Initialize an empty sparse matrix of size len(vocab) x len(vocab)

    # Iterate through the corpus to fill the co-occurrence matrix
    for i in range(len(corpus)):
        for j in range(max(0, i - window_size), min(len(corpus), i + window_size)):
            if i != j:
                matrix[word2id[corpus[i]], word2id[corpus[j]]] += 1

    return matrix, word2id, id2word

In [4]:
# Function to perform SVD on the co-occurrence matrix and reduce the dimensionality
def perform_svd(matrix, n_components=300):
    n_components = min(n_components, matrix.shape[1] - 1)
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(matrix)

In [5]:
# Function to create word embeddings from the corpus using the co-occurrence matrix and SVD
def create_word_embeddings(corpus):
    matrix, word2id, id2word = create_co_occurrence_matrix(corpus)  # Create the co-occurrence matrix
    word_embeddings = perform_svd(matrix)  # Perform SVD on the matrix
    return word_embeddings, word2id, id2word

In [6]:
# Create the word embeddings from the given corpus
embeddings, word2id, id2word = create_word_embeddings(corpus)

In [7]:
# Function to calculate the cosine similarity between two word vectors
def get_word_similarity(embeddings, word2id, word1, word2):
    word1_vector = embeddings[word2id[word1]]  # Get the vector representation of word1
    word2_vector = embeddings[word2id[word2]]  # Get the vector representation of word2

    # Compute the cosine similarity between the two vectors
    similarity = cosine_similarity(word1_vector.reshape(1, -1), word2_vector.reshape(1, -1))

    return similarity[0][0]

In [8]:
# Example usage: Calculate the similarity between the word embeddings for 'sun' and 'sky'
similarity = get_word_similarity(embeddings, word2id, 'darius', 'katarina')
print(f"The distance between the two words is: {similarity}")

The distance between the two words is: 0.7429412980646812


In [9]:
similarity = get_word_similarity(embeddings, word2id, 'yasuo', 'yone')
print(f"The distance between the two words is: {similarity}")

The distance between the two words is: 0.9033523373831738


In [10]:
similarity = get_word_similarity(embeddings, word2id, 'darius', 'aatrox')
print(f"The distance between the two words is: {similarity}")

The distance between the two words is: 0.9943334314902029


In [11]:
# opening the file in read mode 
my_file = open("champ_list.txt", "r") 

# reading the file 
data = my_file.read() 

# replacing end splitting the text  
# when newline ('\n') is seen. 
champions = data.split(", ") 
print(champions) 

my_file.close() 

['aatrox', 'ahri', 'akali', 'akshan', 'alistar', 'amumu', 'anivia', 'annie', 'aphelios', 'ashe', 'aurelion-sol', 'azir', 'bard', 'bel-veth', 'blitzcrank', 'brand', 'braum', 'briar', 'caitlyn', 'camille', 'cassiopeia', 'cho-gath', 'corki', 'darius', 'diana', 'dr-mundo', 'draven', 'ekko', 'elise', 'evelynn', 'ezreal', 'fiddlesticks', 'fiora', 'fizz', 'galio', 'gangplank', 'garen', 'gnar', 'gragas', 'graves', 'gwen', 'hecarim', 'heimerdinger', 'illaoi', 'irelia', 'ivern', 'janna', 'jarvan-iv', 'jax', 'jayce', 'jhin', 'jinx', 'k-sante', 'kai-sa', 'kalista', 'karma', 'karthus', 'kassadin', 'katarina', 'kayle', 'kayn', 'kennen', 'kha-zix', 'kindred', 'kled', 'kog-maw', 'leblanc', 'lee-sin', 'leona', 'lillia', 'lissandra', 'lucian', 'lulu', 'lux', 'malphite', 'malzahar', 'maokai', 'master-yi', 'milio', 'miss-fortune', 'mordekaiser', 'morgana', 'naafiri', 'nami', 'nasus', 'nautilus', 'neeko', 'nidalee', 'nilah', 'nocturne', 'nunu', 'olaf', 'orianna', 'ornn', 'pantheon', 'poppy', 'pyke', 'qiyan

In [12]:
# Create a list of tuples with champion names and their similarity scores to 'Aatrox'
similarity_scores = [(c, get_word_similarity(embeddings, word2id, "garen", c)) for c in champions]

# Sort the list based on similarity scores in descending order
similarity_scores.sort(key=lambda x: x[1], reverse=True)

# Create a ranking dictionary
ranking = {champion: rank + 1 for rank, (champion, _) in enumerate(similarity_scores)}

# Print the rankings
for champion, rank in ranking.items():
    print(f"Rank {rank}: {champion}")

Rank 1: garen
Rank 2: fiora
Rank 3: rumble
Rank 4: aatrox
Rank 5: darius
Rank 6: jayce
Rank 7: gwen
Rank 8: skarner
Rank 9: olaf
Rank 10: renekton
Rank 11: kayle
Rank 12: lee-sin
Rank 13: warwick
Rank 14: nilah
Rank 15: udyr
Rank 16: shyvana
Rank 17: camille
Rank 18: diana
Rank 19: gragas
Rank 20: gnar
Rank 21: urgot
Rank 22: bel-veth
Rank 23: nasus
Rank 24: gangplank
Rank 25: sett
Rank 26: pantheon
Rank 27: trundle
Rank 28: lillia
Rank 29: rek-sai
Rank 30: volibear
Rank 31: tryndamere
Rank 32: yorick
Rank 33: hecarim
Rank 34: wukong
Rank 35: yasuo
Rank 36: kayn
Rank 37: vi
Rank 38: illaoi
Rank 39: kog-maw
Rank 40: mordekaiser
Rank 41: briar
Rank 42: xin-zhao
Rank 43: irelia
Rank 44: riven
Rank 45: kled
Rank 46: dr-mundo
Rank 47: jax
Rank 48: zilean
Rank 49: leona
Rank 50: katarina
Rank 51: lissandra
Rank 52: aurelion-sol
Rank 53: talon
Rank 54: viktor
Rank 55: fiddlesticks
Rank 56: miss-fortune
Rank 57: sylas
Rank 58: ekko
Rank 59: yone
Rank 60: ahri
Rank 61: braum
Rank 62: pyke
Rank 