In [1]:
import pandas as pd
import wikipedia as w
from bs4 import BeautifulSoup
from textacy.preprocessing.remove import accents, brackets, punctuation
from textacy.preprocessing.replace import numbers, urls
from textacy.preprocessing.normalize import whitespace
from nltk.tokenize import sent_tokenize

2022-11-27 17:20:24.104618: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
green_page = w.page(pageid="12460")
purple_page = w.page(pageid = "37948")

In [40]:
import re
def clean_page(page):
    # given a page, removes heading, newlines, tabs, etc
    page = re.sub("=+", "", page)
    page = page.replace("\n", "")
    page = page.replace("\t", "")
    page = accents(brackets(page))
    page = urls(page)

    return whitespace(page).lower()


In [4]:
def clean_sentences(s):
        
    pattern = r'[^A-Za-z0-9]+'
    page = re.sub(pattern, '', s)
    return s

In [5]:
list(map(lambda x: clean_sentences(x), sent_tokenize(clean_page(green_page.content))))
# clean numbers
# clean headers and newlines
# remove cited works



['green is the color between cyan and yellow on the visible spectrum.',
 'it is evoked by light which has a dominant wavelength of roughly – nm.',
 'in subtractive color systems, used in painting and color printing, it is created by a combination of yellow and cyan; in the rgb color model, used on television and computer screens, it is one of the additive primary colors, along with red and blue, which are mixed in different combinations to create all other colors.',
 'by far the largest contributor to green in nature is chlorophyll, the chemical by which plants photosynthesize and convert sunlight into chemical energy.',
 'many creatures have adapted to their green environments by taking on a green hue themselves as camouflage.',
 'several minerals have a green color, including the emerald, which is colored green by its chromium content.during post-classical and early modern europe, green was the color commonly associated with wealth, merchants, bankers and the gentry, while red was re

In [6]:
len(sent_tokenize(clean_page(purple_page.content)))

238

In [7]:
# scraping nintendo switch games table
import requests
best_selling_nintendo = requests.get("https://en.wikipedia.org/wiki/List_of_best-selling_Nintendo_Switch_video_games")
nintendo_soup = BeautifulSoup(best_selling_nintendo.text, 'html.parser')
nintendo_table = nintendo_soup.find_all('table')


In [8]:
switch_games = nintendo_table[1]
headers = [header.text.strip() for header in switch_games.find_all('th')][:8]


In [9]:
headers 

['No.',
 'Title',
 'Copies sold',
 'As of',
 'Release date[a]',
 'Genre(s)',
 'Developer(s)',
 'Publisher(s)']

In [10]:
switch_games

<table class="wikitable plainrowheaders sortable">
<caption>List of best-selling Nintendo Switch video games
</caption>
<tbody><tr>
<th scope="col"><abbr title="Number">No.</abbr>
</th>
<th scope="col">Title
</th>
<th scope="col">Copies sold
</th>
<th scope="col">As of
</th>
<th scope="col">Release date<sup class="reference" id="cite_ref-5"><a href="#cite_note-5">[a]</a></sup>
</th>
<th scope="col">Genre(s)
</th>
<th scope="col">Developer(s)
</th>
<th scope="col">Publisher(s)
</th></tr>
<tr>
<td style="text-align: center;">1
</td>
<th scope="row"><i><a href="/wiki/Mario_Kart_8_Deluxe" title="Mario Kart 8 Deluxe">Mario Kart 8 Deluxe</a></i>
</th>
<td><span data-sort-value="7001484099999999999♠">48.41</span><span class="nowrap"> </span>million<sup class="reference" id="cite_ref-titlesales_4-1"><a href="#cite_note-titlesales-4">[4]</a></sup>
</td>
<td><span data-sort-value="000000002022-09-30-0000" style="white-space:nowrap">September 30, 2022</span>
</td>
<td><span data-sort-value="00000

In [13]:
#data_rows[1].find_all(re.compile("td|th"))

In [14]:
data_rows = switch_games.find_all('tr')
rows = []
for row in data_rows:
    value = row.find_all(re.compile('td|th'))
    beautified_value = [ele.text.strip() for ele in value]
    # Remove data arrays that are empty
    if len(beautified_value) == 0:
        continue
    # deal with numbers that are missing entries
    if len(beautified_value) < len(headers):
        beautified_value = [None] + beautified_value
    rows.append(beautified_value)

In [15]:
rows

[['No.',
  'Title',
  'Copies sold',
  'As of',
  'Release date[a]',
  'Genre(s)',
  'Developer(s)',
  'Publisher(s)'],
 ['1',
  'Mario Kart 8 Deluxe',
  '48.41\xa0million[4]',
  'September 30, 2022',
  'April 28, 2017',
  'Kart racing',
  'Nintendo EPD',
  'Nintendo'],
 ['2',
  'Animal Crossing: New Horizons',
  '40.17\xa0million[4]',
  'September 30, 2022',
  'March 20, 2020',
  'Social simulation',
  'Nintendo EPD',
  'Nintendo'],
 ['3',
  'Super Smash Bros. Ultimate',
  '29.53\xa0million[4]',
  'September 30, 2022',
  'December 7, 2018',
  'Fighting',
  'Bandai Namco StudiosSora Ltd.',
  'Nintendo'],
 ['4',
  'The Legend of Zelda: Breath of the Wild',
  '27.79\xa0million[4]',
  'September 30, 2022',
  'March 3, 2017',
  'Action-adventure',
  'Nintendo EPD',
  'Nintendo'],
 ['5',
  'Pokémon Sword and Shield',
  '25.37\xa0million[4]',
  'September 30, 2022',
  'November 15, 2019',
  'Role-playing',
  'Game Freak',
  'The Pokémon CompanyNintendo'],
 ['6',
  'Super Mario Odyssey',
  '2

In [16]:
import csv

with open('nintendo_bestsellers.csv', 'w', newline="") as output:
    writer = csv.writer(output)
    writer.writerow(headers)
    writer.writerows(rows)

In [17]:
ns_bestsellers = pd.read_csv("nintendo_bestsellers.csv")[1:]

In [18]:
ns_bestsellers.head()

Unnamed: 0,No.,Title,Copies sold,As of,Release date[a],Genre(s),Developer(s),Publisher(s)
1,1,Mario Kart 8 Deluxe,48.41 million[4],"September 30, 2022","April 28, 2017",Kart racing,Nintendo EPD,Nintendo
2,2,Animal Crossing: New Horizons,40.17 million[4],"September 30, 2022","March 20, 2020",Social simulation,Nintendo EPD,Nintendo
3,3,Super Smash Bros. Ultimate,29.53 million[4],"September 30, 2022","December 7, 2018",Fighting,Bandai Namco StudiosSora Ltd.,Nintendo
4,4,The Legend of Zelda: Breath of the Wild,27.79 million[4],"September 30, 2022","March 3, 2017",Action-adventure,Nintendo EPD,Nintendo
5,5,Pokémon Sword and Shield,25.37 million[4],"September 30, 2022","November 15, 2019",Role-playing,Game Freak,The Pokémon CompanyNintendo


In [33]:
# write function that can search each game and return page contents

def retrieve_page(game):
    try:
        print(game)
        return w.page(game).content
    except:
        print(f"Game {game} was not found")
        return "Page Not Found"


game_text = ns_bestsellers["Title"].apply(lambda x: retrieve_page(x))

Mario Kart 8 Deluxe
Animal Crossing: New Horizons
Super Smash Bros. Ultimate
The Legend of Zelda: Breath of the Wild
Pokémon Sword and Shield
Super Mario Odyssey
Super Mario Party
Game Super Mario Party was not found
Pokémon Brilliant Diamond and Shining Pearl
Ring Fit Adventure
Pokémon: Let's Go, Pikachu! and Let's Go, Eevee!
Pokémon Legends: Arceus
New Super Mario Bros. U Deluxe
Splatoon 2
Luigi's Mansion 3
Pokémon Scarlet and Violet
Super Mario 3D World + Bowser's Fury
Super Mario 3D All-Stars
Game Super Mario 3D All-Stars was not found
Mario Party Superstars
Splatoon 3
Super Mario Maker 2
Monster Hunter Rise
Game Monster Hunter Rise was not found
Nintendo Switch Sports
The Legend of Zelda: Link's Awakening
Kirby and the Forgotten Land
Mario Tennis Aces
Clubhouse Games: 51 Worldwide Classics
Donkey Kong Country: Tropical Freeze
Hyrule Warriors: Age of Calamity
Kirby Star Allies
The Legend of Zelda: Skyward Sword HD
Fire Emblem: Three Houses
1-2-Switch
Momotaro Dentetsu: Showa, Heise

In [34]:
ns_bestsellers["wiki_page"] = game_text

In [35]:
ns_bestsellers

Unnamed: 0,No.,Title,Copies sold,As of,Release date[a],Genre(s),Developer(s),Publisher(s),text,wiki_page
1,1,Mario Kart 8 Deluxe,48.41 million[4],"September 30, 2022","April 28, 2017",Kart racing,Nintendo EPD,Nintendo,Page Not Found,Mario Kart 8 Deluxe is a 2017 kart racing game...
2,2,Animal Crossing: New Horizons,40.17 million[4],"September 30, 2022","March 20, 2020",Social simulation,Nintendo EPD,Nintendo,Page Not Found,Animal Crossing: New Horizons is a 2020 social...
3,3,Super Smash Bros. Ultimate,29.53 million[4],"September 30, 2022","December 7, 2018",Fighting,Bandai Namco StudiosSora Ltd.,Nintendo,Page Not Found,Super Smash Bros. Ultimate is a 2018 crossover...
4,4,The Legend of Zelda: Breath of the Wild,27.79 million[4],"September 30, 2022","March 3, 2017",Action-adventure,Nintendo EPD,Nintendo,Page Not Found,The Legend of Zelda: Breath of the Wild is a 2...
5,5,Pokémon Sword and Shield,25.37 million[4],"September 30, 2022","November 15, 2019",Role-playing,Game Freak,The Pokémon CompanyNintendo,Page Not Found,Pokémon Sword and Pokémon Shield are 2019 role...
...,...,...,...,...,...,...,...,...,...,...
69,,Fitness Boxing,1 million[23],"September 8, 2020","December 20, 2018",Exergamerhythm,Imagineer,JP: ImagineerNA/PAL: Nintendo,Page Not Found,Fitness Boxing is an exergaming video game dev...
70,,Fitness Boxing 2: Rhythm and Exercise,1 million[24],"December 9, 2021","December 4, 2020",Exergamerhythm,Imagineer,JP: ImagineerNA/PAL: Nintendo,Page Not Found,Fitness Boxing is an exergaming video game dev...
71,,Shin Megami Tensei V,1 million[25],"April 18, 2022","November 11, 2021",Role-playing,Atlus,JP: AtlusNA: SegaPAL: Nintendo,Page Not Found,Shin Megami Tensei V is a 2021 role-playing vi...
72,,Story of Seasons: Pioneers of Olive Town,1 million[26],"November 18, 2021","February 25, 2021",Simulationrole-playing,Marvelous,Xseed Games,Page Not Found,The Story of Seasons video game series was ori...


In [39]:
# We need to remove headings, normalise case and punctuation, newlines
ns_bestsellers.wiki_page.iloc[0]

'Mario Kart 8 Deluxe is a 2017 kart racing game developed and published by Nintendo and released for the Nintendo Switch. The game is an expanded and enhanced re-release of the 2014 game Mario Kart 8. Deluxe follows the same gameplay as Mario Kart 8 and the rest of the Mario Kart series, where players race in go-karts while trying to sabotage each other with items. Players can control one of several characters from the Mario franchise and other Nintendo franchises, with several additional characters being added in Deluxe. The game also introduces a revamped battle mode, featuring five sub-modes and eight battle courses.\nMario Kart 8 Deluxe was first teased in October 2016 during the Nintendo Switch reveal trailer, and was fully revealed during the Nintendo Switch Presentation in January 2017. A later trailer showcased several of the game\'s features, including the new battle mode. Deluxe released on April 28, 2017, and as of September 2022, it has sold 48.41 million units, making it t

In [41]:
clean_page(ns_bestsellers.wiki_page.iloc[0])

'mario kart 8 deluxe is a 2017 kart racing game developed and published by nintendo and released for the nintendo switch. the game is an expanded and enhanced re-release of the 2014 game mario kart 8. deluxe follows the same gameplay as mario kart 8 and the rest of the mario kart series, where players race in go-karts while trying to sabotage each other with items. players can control one of several characters from the mario franchise and other nintendo franchises, with several additional characters being added in deluxe. the game also introduces a revamped battle mode, featuring five sub-modes and eight battle courses.mario kart 8 deluxe was first teased in october 2016 during the nintendo switch reveal trailer, and was fully revealed during the nintendo switch presentation in january 2017. a later trailer showcased several of the game\'s features, including the new battle mode. deluxe released on april 28, 2017, and as of september 2022, it has sold 48.41 million units, making it the

In [None]:
# preprocessing for TFIDF

# tokenization, stemming/lemmatization, whitespace/newline removal, etc

#tokenization will be taken care of by nltk as well
# sentence tokenization, followed by word tokenization
# each row in our dataset will be a document, each column will be a word

#We'll have our input data be a single document, so we can re-use the pipeline during our model training


In [108]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
  
ps = PorterStemmer()
def prepare_document(doc):
    # given a document, preprocesses and tokenizes it for tfidf

    # clean the document of misc symbols and headings, lowercase it
    doc = clean_page(doc)

    #tokenize by sentence and then by word
    sentences = sent_tokenize(doc)

    #remove punctuation
    sentences = [punctuation(s) for s in sentences]


    # stem every word
    sentences_and_words = [word_tokenize(s) for s in sentences]

    prepared_doc = []
    
    for sent in sentences_and_words:
        stemmed_sentences = []
        for word in sent:
            stemmed_sentences.append(ps.stem(word))
        cleaned_sentence = " ".join(stemmed_sentences)
        prepared_doc.append(cleaned_sentence)
    return " ".join(prepared_doc)


In [109]:
prepare_document(ns_bestsellers.wiki_page.iloc[0])

'mario kart 8 delux is a 2017 kart race game develop and publish by nintendo and releas for the nintendo switch the game is an expand and enhanc re releas of the 2014 game mario kart 8 delux follow the same gameplay as mario kart 8 and the rest of the mario kart seri where player race in go kart while tri to sabotag each other with item player can control one of sever charact from the mario franchis and other nintendo franchis with sever addit charact be ad in delux the game also introduc a revamp battl mode featur five sub mode and eight battl cours mario kart 8 delux wa first teas in octob 2016 dure the nintendo switch reveal trailer and wa fulli reveal dure the nintendo switch present in januari 2017 a later trailer showcas sever of the game s featur includ the new battl mode delux releas on april 28 2017 and as of septemb 2022 it ha sold 48 41 million unit make it the best sell nintendo switch game and one of the best sell game of all time sinc launch delux ha receiv sever softwar 

In [110]:
cleaned_wikis = ns_bestsellers.wiki_page.apply(lambda x: prepare_document(x))

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_wikis = tfidf.fit_transform(cleaned_wikis.tolist())

In [115]:
tfidf_wikis.shape

(73, 7820)

In [144]:
import numpy as np
# small function to calculats cosine similarity of all pairs and store
def cosine_similarity(v1, v2):
    numerator = np.dot(v1, v2)
    denom = np.sqrt(np.sum(np.square(v1))) * np.sqrt(np.sum(np.square(v2)))

    return numerator/denom 


In [145]:
def cos_dicts(names, vects):

    #given a set of vectors, create a dict of dicts for cosine similarity
    # This dict of dict structure allows us to index directly into the pair we want
    # 
    #
    d = {}
    for name, vect in zip(names, vects):
        cos_sim_by_vect = {}
        for n2, v2 in zip(names, vects):
            if n2 != name:
                cos_sim_by_vect[n2] = cosine_similarity(vect, v2)
        d[name] = cos_sim_by_vect
    return d

In [146]:
tfidf_wikis

<73x7820 sparse matrix of type '<class 'numpy.float64'>'
	with 40840 stored elements in Compressed Sparse Row format>

In [148]:
video_game_cos_dict = cos_dicts(ns_bestsellers.Title, tfidf_wikis.toarray())

In [160]:
def retrieve_top_k_similar(n1, similarity_dict, k):
    inner_dict = similarity_dict[n1]
    # sort the dictionary by value, descending, then retrieve top k values
    return sorted(inner_dict.items(), reverse = True, key = lambda x: x[1])[:k]

sorted(video_game_cos_dict["Pokémon Brilliant Diamond and Shining Pearl"].items(), reverse = True, key = lambda x: x[1])

[('Pokémon Legends: Arceus', 0.7664562374981096),
 ('Pokémon Sword and Shield', 0.7117721706272662),
 ('Pokémon Scarlet and Violet', 0.6540293027111254),
 ("Pokémon: Let's Go, Pikachu! and Let's Go, Eevee!", 0.6181862870898288),
 ('Animal Crossing: New Horizons', 0.6139966515987575),
 ('The Legend of Zelda: Breath of the Wild', 0.6013191240464668),
 ('New Pokémon Snap', 0.5952942243874303),
 ('Pokémon Mystery Dungeon: Rescue Team DX', 0.5849086906989363),
 ('The Legend of Zelda: Skyward Sword HD', 0.5844527815601698),
 ('Pokkén Tournament DX', 0.5822689497440127),
 ('Super Smash Bros. Ultimate', 0.574374527721978),
 ("The Legend of Zelda: Link's Awakening", 0.573782760090913),
 ('Super Mario Odyssey', 0.5553230042750218),
 ('Paper Mario: The Origami King', 0.5484806036124424),
 ('Splatoon 2', 0.545879691949679),
 ('Xenoblade Chronicles 2', 0.5426143401726631),
 ('Fire Emblem: Three Houses', 0.5345435679421285),
 ('Octopath Traveler', 0.5340231839470976),
 ('Xenoblade Chronicles 3', 0.5