In [1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

import string
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import stats
from sklearn.metrics.pairwise import linear_kernel # for cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package punkt to /home/beltran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/beltran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/beltran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data
data = pd.read_csv('all_games_2.csv')

data

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9
...,...,...,...,...,...,...
18795,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",Fast & Furious: Showdown takes some of the fra...,22,1.3
18796,Drake of the 99 Dragons,Xbox,"November 3, 2003",Drake is out for revenge in a supernatural Hon...,22,1.7
18797,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015","Head out on a journey of redemption, driven by...",21,2.9
18798,Infestation: Survivor Stories (The War Z),PC,"October 15, 2012","(Formerly known as ""The War Z"") It has been 5 ...",20,1.7


In [3]:
# drop duplicates

data.drop_duplicates('name',keep='first', inplace=True)

data.reset_index(drop=True, inplace=True)

data


Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1
...,...,...,...,...,...,...
12249,Charlie's Angels,GameCube,"July 9, 2003","Join Natalie, Dylan, and Alex for an intense a...",23,4.3
12250,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",Fast & Furious: Showdown takes some of the fra...,22,1.3
12251,Drake of the 99 Dragons,Xbox,"November 3, 2003",Drake is out for revenge in a supernatural Hon...,22,1.7
12252,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015","Head out on a journey of redemption, driven by...",21,2.9


In [4]:
# summary stats

data.describe()

Unnamed: 0,meta_score
count,12254.0
mean,70.743104
std,12.418369
min,20.0
25%,64.0
50%,73.0
75%,80.0
max,99.0


In [5]:
# check for nulls

data.isnull().sum()

# 101 null summary entries


name              0
platform          0
release_date      0
summary         101
meta_score        0
user_review       0
dtype: int64

In [6]:
# drop nulls

null_rows = data[data['summary'].isnull()].index

data.drop(index=null_rows, inplace=True)

data

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1
...,...,...,...,...,...,...
12249,Charlie's Angels,GameCube,"July 9, 2003","Join Natalie, Dylan, and Alex for an intense a...",23,4.3
12250,Fast & Furious: Showdown,Xbox 360,"May 21, 2013",Fast & Furious: Showdown takes some of the fra...,22,1.3
12251,Drake of the 99 Dragons,Xbox,"November 3, 2003",Drake is out for revenge in a supernatural Hon...,22,1.7
12252,Afro Samurai 2: Revenge of Kuma Volume One,PlayStation 4,"September 22, 2015","Head out on a journey of redemption, driven by...",21,2.9


In [7]:
# get info about dataframe

data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 12153 entries, 0 to 12253
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          12153 non-null  object
 1   platform      12153 non-null  object
 2   release_date  12153 non-null  object
 3   summary       12153 non-null  object
 4   meta_score    12153 non-null  int64 
 5   user_review   12153 non-null  object
dtypes: int64(1), object(5)
memory usage: 664.6+ KB


In [8]:
# clean/process corpus

# Function to streamline NLP Process

def nlp(df, text):
    # Load string
    # raw_data = pd.read_csv(file + '.csv')
    df['dummy'] = df[text].astype(str)
    # Convert to lowercase
    dummy = (post.lower() for post in df['dummy'])
    df['dummy'] = [i for i in dummy]
    # Word & Sentence Tokenization
    token_post = (word_tokenize(post) for post in df['dummy'])
    token_post = [i for i in token_post]
    #sent_token = [sent_tokenize(post) for post in df['text']]

    # Remove Punctuation
    reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')
    no_punc = []
    for filt in token_post:
        review = []
        for token in filt:
            new_token = reg.sub(u'', token)
            if not new_token == u'':
                review.append(new_token)
        no_punc.append(review)
        
    # Remove Stopwords
    no_stop = []

    for post in no_punc:
        new_term_vector = []
        for word in post:
            if not word in stopwords.words('english'):
                new_term_vector.append(word)

        no_stop.append(new_term_vector)
        
    # Stemming & Lemmatization
    pstem = PorterStemmer()
    wlem = WordNetLemmatizer()

    preproc_text = []

    for text in no_stop:
        final_text = []
        for word in text:
            pstem.stem(word)
            final_text.append(wlem.lemmatize(word))

        preproc_text.append(final_text)
        
    # create final data set
    #data = df.copy()

    new_col = pd.Series(preproc_text)
    df['proc_summary'] = new_col
    df.drop('dummy', axis=1, inplace=True)
    return df

proc_data = nlp(data, 'summary')


In [9]:
proc_data.head(10)

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review,proc_summary
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1,"[young, boy, link, tricked, ganondorf, king, g..."
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4,"[major, publisher, development, effort, shift,..."
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7,"[metacritic, 2008, ps3, game, year, also, know..."
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4,"[tale, soul, sword, transcending, world, histo..."
4,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1,"[metacritic, 2007, wii, game, year, ultimate, ..."
5,Super Mario Galaxy 2,Wii,"May 23, 2010","Super Mario Galaxy 2, the sequel to the galaxy...",97,9.1,"[super, mario, galaxy, 2, sequel, galaxyhoppin..."
6,Red Dead Redemption 2,Xbox One,"October 26, 2018",Developed by the creators of Grand Theft Auto ...,97,8.0,"[developed, creator, grand, theft, auto, v, re..."
7,Grand Theft Auto V,Xbox One,"November 18, 2014",Grand Theft Auto 5 melds storytelling and game...,97,7.9,"[grand, theft, auto, 5, meld, storytelling, ga..."
8,Disco Elysium: The Final Cut,PC,"March 30, 2021",Disco Elysium - The Final Cut is the definitiv...,97,8.3,"[disco, elysium, final, cut, definitive, editi..."
9,The Legend of Zelda: Breath of the Wild,Switch,"March 3, 2017",Forget everything you know about The Legend of...,97,8.7,"[forget, everything, know, legend, zelda, game..."


In [10]:
# reset proc_data index

proc_data.reset_index(drop=True, inplace=True)

proc_data.index


RangeIndex(start=0, stop=12153, step=1)

In [11]:
#Define TF-IDF Vectorizer Object
tfidf = TfidfVectorizer()

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform([str(i) for i in proc_data['proc_summary']])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(12153, 38142)

In [12]:
# list of feature integer indices to feature name

tfidf.get_feature_names_out()[3000:3050]


array(['astromonkey', 'astronaut', 'astroneer', 'astroneers', 'astrong',
       'astronomical', 'astronomist', 'astrophysical', 'astrophysicist',
       'astropop', 'astrosmash', 'asuka', 'asuna', 'asunder', 'asura',
       'asw', 'asylum', 'asymmetric', 'asymmetrical', 'asynchronous',
       'at60framespersecond', 'atalanta', 'ataldazar', 'atamipek',
       'atari', 'atat', 'atats', 'atb', 'atbat', 'ateam', 'atelier',
       'atemporal', 'aterra', 'atgs', 'atheletes', 'athena', 'athens',
       'atherton', 'athlete', 'athletic', 'athleticism', 'athletics',
       'athrun', 'atkinson', 'atlanta', 'atlantean', 'atlanteans',
       'atlantian', 'atlantic', 'atlantis'], dtype=object)

In [13]:
# compute cosine similarity matrix

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim

array([[1.        , 0.        , 0.01963851, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.00569489, ..., 0.        , 0.        ,
        0.        ],
       [0.01963851, 0.00569489, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ]])

In [14]:
# reverse mapping of indices and video game titles

indices = pd.Series(proc_data.index, index=proc_data['name'])#.drop_duplicates()

indices

name
The Legend of Zelda: Ocarina of Time              0
Tony Hawk's Pro Skater 2                          1
Grand Theft Auto IV                               2
SoulCalibur                                       3
Super Mario Galaxy                                4
                                              ...  
Charlie's Angels                              12148
Fast & Furious: Showdown                      12149
Drake of the 99 Dragons                       12150
Afro Samurai 2: Revenge of Kuma Volume One    12151
Infestation: Survivor Stories (The War Z)     12152
Length: 12153, dtype: int64

In [15]:
# Recommendation function that takes video game title 
# as input and outputs most similar video games

def recommender_system(title, cosine_sim=cosine_sim):
    # get index of video game that matches title    
    idx = indices[title]
    # get pairwise similarity scores of all video games with the given title
    sim_scores = list(enumerate(cosine_sim[idx]))
    # sort games based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # get scores of 10 most similar video games
    sim_scores = sim_scores[1:11]
    # get game indices
    game_indices = [i[0] for i in sim_scores]
    # return top 10 most similar video games
    recs = proc_data['name'].iloc[game_indices]
    return recs


In [16]:
# test run
# open-world action game
recommender_system('Journey')

288                            flower
6865    Ar tonelico: Melody of Elemia
5944                    Shadow Hearts
1298                      Chime Sharp
9724                          Godfall
446              Phantasy Star Online
37                    LittleBigPlanet
3766               Sega Ages: Shinobi
8219                   Joy Ride Turbo
6301                         Narcosis
Name: name, dtype: object

In [17]:
recommender_system('Super Mario Galaxy')

2169              NyxQuest: Kindred Spirits
761                            Pokemon Moon
2095      Bloodstained: Curse of the Moon 2
103                    Super Mario 3D World
10355                     PGA European Tour
9805     The Witch and the Hundred Knight 2
7501                                   Lume
14                      Super Mario Odyssey
9381                            Teen Titans
11702       The Uncertain: Light at the End
Name: name, dtype: object