In [1]:
# Import the necessary libraries:
import os  # For reading files and managing paths
import numpy as np  # For performing mathematical operations
from scipy.sparse import lil_matrix  # For handling sparse matrices
from sklearn.decomposition import TruncatedSVD  # For Singular Value Decomposition (SVD)
from sklearn.metrics.pairwise import cosine_similarity  # For calculating cosine similarity between vectors

In [2]:
# Define the path to the corpus folder and obtain the list of text files
corpus_folder = "./corpus"
file_names = [f for f in os.listdir(corpus_folder) if f.endswith(".txt")]

# Initialize an empty list to store the words from the corpus
corpus = []

# Read each text file in the corpus folder and append the words to the corpus list
for file_name in file_names:
    file_path = os.path.join(corpus_folder, file_name)
    with open(file_path, "r") as corpusFile:
        for linea in corpusFile:
            word_line = linea.strip().split()
            corpus.extend(word_line)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 490: character maps to <undefined>

In [None]:
# Function to create a co-occurrence matrix from the corpus with a given window size
def create_co_occurrence_matrix(corpus, window_size=4):
    vocab = set(corpus)  # Create a set of unique words in the corpus
    word2id = {word: i for i, word in enumerate(vocab)}  # Create a word-to-index dictionary for the words
    id2word = {i: word for i, word in enumerate(vocab)}  # Create an index-to-word dictionary for the words
    matrix = lil_matrix((len(vocab), len(vocab)))  # Initialize an empty sparse matrix of size len(vocab) x len(vocab)

    # Iterate through the corpus to fill the co-occurrence matrix
    for i in range(len(corpus)):
        for j in range(max(0, i - window_size), min(len(corpus), i + window_size)):
            if i != j:
                matrix[word2id[corpus[i]], word2id[corpus[j]]] += 1

    return matrix, word2id, id2word

In [None]:
# Function to perform SVD on the co-occurrence matrix and reduce the dimensionality
def perform_svd(matrix, n_components=300):
    n_components = min(n_components, matrix.shape[1] - 1)
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(matrix)

In [None]:
# Function to create word embeddings from the corpus using the co-occurrence matrix and SVD
def create_word_embeddings(corpus):
    matrix, word2id, id2word = create_co_occurrence_matrix(corpus)  # Create the co-occurrence matrix
    word_embeddings = perform_svd(matrix)  # Perform SVD on the matrix
    return word_embeddings, word2id, id2word

In [None]:
# Create the word embeddings from the given corpus
embeddings, word2id, id2word = create_word_embeddings(corpus)

In [None]:
# Function to calculate the cosine similarity between two word vectors
def get_word_similarity(embeddings, word2id, word1, word2):
    word1_vector = embeddings[word2id[word1]]  # Get the vector representation of word1
    word2_vector = embeddings[word2id[word2]]  # Get the vector representation of word2

    # Compute the cosine similarity between the two vectors
    similarity = cosine_similarity(word1_vector.reshape(1, -1), word2_vector.reshape(1, -1))

    return similarity[0][0]

In [None]:
# Example usage: Calculate the similarity between the word embeddings for 'sun' and 'sky'
similarity = get_word_similarity(embeddings, word2id, 'Darius', 'Katarina')
print(f"The distance between the two words is: {similarity}")

The distance between the two words is: 0.1576220207772147


In [None]:
similarity = get_word_similarity(embeddings, word2id, 'Ahri', 'Katarina')
print(f"The distance between the two words is: {similarity}")

The distance between the two words is: 0.3043478081958932


In [None]:
similarity = get_word_similarity(embeddings, word2id, 'Darius', 'Aatrox')
print(f"The distance between the two words is: {similarity}")

The distance between the two words is: 0.25354712418326303


In [None]:
champions = ["Aatrox", "Ahri", "Darius", "Katarina"]

for c1 in champions:
    for c2 in champions:
        similarity = get_word_similarity(embeddings, word2id, c1, c2)
        print(f"Similarity between {c1} and {c2}: {similarity}")

Similarity between Aatrox and Aatrox: 1.0
Similarity between Aatrox and Ahri: 0.2797515901192059
Similarity between Aatrox and Darius: 0.25354712418326303
Similarity between Aatrox and Katarina: 0.2797515691418397
Similarity between Ahri and Aatrox: 0.2797515901192059
Similarity between Ahri and Ahri: 0.9999999999999998
Similarity between Ahri and Darius: 0.15762198150904974
Similarity between Ahri and Katarina: 0.3043478081958932
Similarity between Darius and Aatrox: 0.25354712418326303
Similarity between Darius and Ahri: 0.15762198150904974
Similarity between Darius and Darius: 1.0
Similarity between Darius and Katarina: 0.1576220207772147
Similarity between Katarina and Aatrox: 0.2797515691418397
Similarity between Katarina and Ahri: 0.3043478081958932
Similarity between Katarina and Darius: 0.1576220207772147
Similarity between Katarina and Katarina: 0.9999999999999991


In [None]:
# Create a list of tuples with champion names and their similarity scores to 'Aatrox'
similarity_scores = [(c, get_word_similarity(embeddings, word2id, "Ahri", c)) for c in champions]

# Sort the list based on similarity scores in descending order
similarity_scores.sort(key=lambda x: x[1], reverse=True)

# Create a ranking dictionary
ranking = {champion: rank + 1 for rank, (champion, _) in enumerate(similarity_scores)}

# Print the rankings
for champion, rank in ranking.items():
    print(f"Rank {rank}: {champion}")

Rank 1: Ahri
Rank 2: Katarina
Rank 3: Aatrox
Rank 4: Darius
