In [11]:
import pandas as pd
import numpy as np
import re

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

from textacy.preprocessing.remove import accents, brackets, punctuation
from textacy.preprocessing.replace import numbers, urls
from textacy.preprocessing.normalize import whitespace

import os

In [12]:
def clean_page(page):
    # given a page, removes heading, newlines, tabs, etc
    page = re.sub("=+", "", page)
    page = page.replace("\n", "")
    page = page.replace("\t", "")
    page = accents(brackets(page))
    page = urls(page)

    return whitespace(page).lower()

def clean_sentences(s):
        
    pattern = r'[^A-Za-z0-9]+'
    page = re.sub(pattern, '', s)
    return s


  
ps = PorterStemmer()
def prepare_document(doc):
    # given a document, preprocesses and tokenizes it for tfidf

    # clean the document of misc symbols and headings, lowercase it
    doc = clean_page(doc)

    #tokenize by sentence and then by word
    sentences = sent_tokenize(doc)

    #remove punctuation
    sentences = [punctuation(s) for s in sentences]


    # stem every word
    sentences_and_words = [word_tokenize(s) for s in sentences]

    prepared_doc = []
    
    for sent in sentences_and_words:
        stemmed_sentences = []
        for word in sent:
            stemmed_sentences.append(ps.stem(word))
        cleaned_sentence = " ".join(stemmed_sentences)
        prepared_doc.append(cleaned_sentence)
    return " ".join(prepared_doc)


# small function to calculats cosine similarity of all pairs and store
def cosine_similarity(v1, v2):
    numerator = np.dot(v1, v2)
    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))

    return numerator/denom 


def cos_dicts(names, vects):

    #given a set of vectors, create a dict of dicts for cosine similarity
    # This dict of dict structure allows us to index directly into the pair we want
    # The first key will be our desired game
    # and the value for that key will be a dictionary of partner games

    # The inner key will be the second game we wish to seek, and its value will be cosine similarity to our first game

    d = {}
    for name, vect in zip(names, vects):
        cos_sim_by_vect = {}
        for n2, v2 in zip(names, vects):
            if n2 != name:
                cos_sim_by_vect[n2] = cosine_similarity(vect, v2)
        d[name] = cos_sim_by_vect
    return d

def retrieve_top_k_similar(n1, similarity_dict, k):
    inner_dict = similarity_dict[n1]
    # sort the dictionary by value, descending, then retrieve top k values
    return sorted(inner_dict.items(), reverse = True, key = lambda x: x[1])[:k]

In [32]:
ns_bestsellers = pd.read_csv("nintendo_bestsellers.csv")

In [14]:
# We need to remove headings, normalise case and punctuation, newlines
ns_bestsellers.wiki_page.iloc[0]

'Mario Kart 8 Deluxe is a 2017 kart racing game developed and published by Nintendo and released for the Nintendo Switch. The game is an expanded and enhanced re-release of the 2014 game Mario Kart 8. Deluxe follows the same gameplay as Mario Kart 8 and the rest of the Mario Kart series, where players race in go-karts while trying to sabotage each other with items. Players can control one of several characters from the Mario franchise and other Nintendo franchises, with several additional characters being added in Deluxe. The game also introduces a revamped battle mode, featuring five sub-modes and eight battle courses.\nMario Kart 8 Deluxe was first teased in October 2016 during the Nintendo Switch reveal trailer, and was fully revealed during the Nintendo Switch Presentation in January 2017. Deluxe released on April 28, 2017 and was both a critical and commercial success. Critics widely considered it to be the definitive edition of Mario Kart 8, and some considered it the best game i

In [15]:
clean_page(ns_bestsellers.wiki_page.iloc[0])

'mario kart 8 deluxe is a 2017 kart racing game developed and published by nintendo and released for the nintendo switch. the game is an expanded and enhanced re-release of the 2014 game mario kart 8. deluxe follows the same gameplay as mario kart 8 and the rest of the mario kart series, where players race in go-karts while trying to sabotage each other with items. players can control one of several characters from the mario franchise and other nintendo franchises, with several additional characters being added in deluxe. the game also introduces a revamped battle mode, featuring five sub-modes and eight battle courses.mario kart 8 deluxe was first teased in october 2016 during the nintendo switch reveal trailer, and was fully revealed during the nintendo switch presentation in january 2017. deluxe released on april 28, 2017 and was both a critical and commercial success. critics widely considered it to be the definitive edition of mario kart 8, and some considered it the best game in 

In [16]:
# preprocessing for TFIDF

# tokenization, stemming/lemmatization, whitespace/newline removal, etc

#tokenization will be taken care of by nltk as well
# sentence tokenization, followed by word tokenization
# each row in our dataset will be a document, each column will be a word

#We'll have our input data be a single document, so we can re-use the pipeline during our model training


In [17]:
prepare_document(ns_bestsellers.wiki_page.iloc[0])

'mario kart 8 delux is a 2017 kart race game develop and publish by nintendo and releas for the nintendo switch the game is an expand and enhanc re releas of the 2014 game mario kart 8 delux follow the same gameplay as mario kart 8 and the rest of the mario kart seri where player race in go kart while tri to sabotag each other with item player can control one of sever charact from the mario franchis and other nintendo franchis with sever addit charact be ad in delux the game also introduc a revamp battl mode featur five sub mode and eight battl cours mario kart 8 delux wa first teas in octob 2016 dure the nintendo switch reveal trailer and wa fulli reveal dure the nintendo switch present in januari 2017 delux releas on april 28 2017 and wa both a critic and commerci success critic wide consid it to be the definit edit of mario kart 8 and some consid it the best game in the mario kart seri or one of the best race game of all time while some critic the lack of signific new content compar

In [18]:
cleaned_wikis = ns_bestsellers.wiki_page.apply(lambda x: prepare_document(x))

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_wikis = tfidf.fit_transform(cleaned_wikis.tolist())

In [20]:
tfidf_wikis.shape

(73, 7965)

In [21]:
video_game_cos_dict = cos_dicts(ns_bestsellers.Title, tfidf_wikis.toarray())

In [22]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2, random_state=1)
svd.fit(tfidf_wikis)
svd.explained_variance_ratio_

array([0.09999207, 0.16452101])

In [23]:
two_d_projection = svd.transform(tfidf_wikis)

In [24]:
two_d_projection.tolist()

[[0.7071613096928229, -0.004007896242005735],
 [0.8962073116592086, -0.0011420398181093026],
 [0.8539801082311657, -0.000171332435410799],
 [0.8972344842804922, 0.0019192487969316014],
 [0.7718029309511149, -0.0017894416604157394],
 [0.8586778911516929, -0.0064637506656401245],
 [0.01191202343102428, 0.9999237939951644],
 [0.7007918057805415, -0.008874974527806669],
 [0.7668279532516866, -0.007334147033914029],
 [0.6623294477004471, -0.0040418513064617665],
 [0.7678122771972617, -0.0005738176614580829],
 [0.8045348150148822, -0.004815274374010706],
 [0.8089621426457776, -0.00889409297096421],
 [0.6912699984677548, -0.002004981508617416],
 [0.7476728374590481, -0.005600191916575279],
 [0.5804252839860033, -0.0010082536491969885],
 [0.01191202343102428, 0.9999237939951644],
 [0.6465912949246286, -0.0007367636643416352],
 [0.7766298748432107, -0.006530107640287257],
 [0.73298155578316, -0.008829592324671409],
 [0.01191202343102428, 0.9999237939951644],
 [0.7668744549653017, -0.00519120878

In [25]:
ns_bestsellers.Title

0                          Mario Kart 8 Deluxe
1                Animal Crossing: New Horizons
2                   Super Smash Bros. Ultimate
3      The Legend of Zelda: Breath of the Wild
4                     Pokémon Sword and Shield
                        ...                   
68                              Fitness Boxing
69       Fitness Boxing 2: Rhythm and Exercise
70                        Shin Megami Tensei V
71    Story of Seasons: Pioneers of Olive Town
72                             Thief Simulator
Name: Title, Length: 73, dtype: object

In [26]:
two_d_projection.shape

(73, 2)

In [27]:
projected_df = pd.DataFrame(two_d_projection, columns = ["pc1", "pc2"])
projected_df["names"] = ns_bestsellers.Title
projected_df.head()

Unnamed: 0,pc1,pc2,names
0,0.707161,-0.004008,Mario Kart 8 Deluxe
1,0.896207,-0.001142,Animal Crossing: New Horizons
2,0.85398,-0.000171,Super Smash Bros. Ultimate
3,0.897234,0.001919,The Legend of Zelda: Breath of the Wild
4,0.771803,-0.001789,Pokémon Sword and Shield


In [28]:
import plotly.express as px

px.scatter(x= projected_df.pc1, y = projected_df.pc2, hover_data=[projected_df.names])