In [14]:
import pandas as pd
import numpy as np
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [16]:
df = pd.read_csv('../input/top-video-games-19952021-metacritic/all_games.csv')
df.head()

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9


In [17]:
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [11]:
df = df[:100]

In [12]:
df.describe()

Unnamed: 0,meta_score
count,100.0
mean,95.42
std,1.288488
min,94.0
25%,94.0
50%,95.0
75%,96.0
max,99.0


In [18]:
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def nlp(df, text):
    #dummy column
    df['dummy'] = df[text].astype(str).str.lower()
    
    token_summ = (word_tokenize(post) for post in df['dummy'])
#     df['token_summ'] = [i for i in token_summ]
    
    reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')
    
    no_punc = []
    
    for filt in token_summ:
        review = []
        for token in filt:
            new_token = reg.sub('', token)
            if new_token != '':
                review.append(new_token)
        no_punc.append(review)
    
    no_stop = []
    
    for post in no_punc:
        new_term_vector = []
        for word in post:
            if not word in stopwords.words('english'):
                new_term_vector.append(word)
        no_stop.append(new_term_vector)
    
    pstem = PorterStemmer()
    wlem = WordNetLemmatizer()
    
    preproc_text = []
    
    for text in no_stop:
        final_text = []
        for word in text:
            pstem.stem(word)
            final_text.append(wlem.lemmatize(word))
        preproc_text.append(final_text)
        

    df['proc_summary'] = pd.Series(preproc_text)
    df.drop('dummy', axis=1, inplace=True)
    return df
    
proc_data = nlp(df, 'summary')
    

In [20]:
proc_data.reset_index(drop=True, inplace=True)
proc_data.head()

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review,proc_summary
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1,"[young, boy, link, tricked, ganondorf, king, g..."
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4,"[major, publisher, development, effort, shift,..."
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7,"[metacritic, 2008, ps3, game, year, also, know..."
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4,"[tale, soul, sword, transcending, world, histo..."
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9,"[metacritic, 2008, xbox, 360, game, year, also..."


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([str(i) for i in proc_data['proc_summary']])
tfidf_matrix.shape

(18686, 40650)

In [24]:
tfidf.get_feature_names()[5000:5010]

['body',
 'bodybuilder',
 'bodycount',
 'bodyguard',
 'bodyhopping',
 'bodykit',
 'bodyramming',
 'bodyregionspecific',
 'bodysnatch',
 'bodysnatching']

In [25]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.02002085, ..., 0.00641481, 0.01352205,
        0.00309929],
       [0.        , 1.        , 0.0058262 , ..., 0.01093398, 0.        ,
        0.00651923],
       [0.02002085, 0.0058262 , 1.        , ..., 0.        , 0.04124733,
        0.00698127],
       ...,
       [0.00641481, 0.01093398, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01352205, 0.        , 0.04124733, ..., 0.        , 1.        ,
        0.00135484],
       [0.00309929, 0.00651923, 0.00698127, ..., 0.        , 0.00135484,
        1.        ]])

In [26]:
indices = pd.Series(proc_data.index, index=proc_data['name']).drop_duplicates()
indices

name
The Legend of Zelda: Ocarina of Time              0
Tony Hawk's Pro Skater 2                          1
Grand Theft Auto IV                               2
SoulCalibur                                       3
Grand Theft Auto IV                               4
                                              ...  
Fast & Furious: Showdown                      18681
Drake of the 99 Dragons                       18682
Afro Samurai 2: Revenge of Kuma Volume One    18683
Infestation: Survivor Stories (The War Z)     18684
Leisure Suit Larry: Box Office Bust           18685
Length: 18686, dtype: int64

In [27]:
def recommend(title, cosine_sim=cosine_sim):
    idx = indices[title]
    
    similarity = list(enumerate(cosine_sim[idx]))
    
    similarity = sorted(similarity, key=lambda x: x[1], reverse=True)
    
    similarity = similarity[1:11]
    
    game_indices = [i[0] for i in similarity]
    
    recs = proc_data['name'].iloc[game_indices]
    
    return recs

In [28]:
recommend('Pokemon HeartGold Version')

1126                     Pokemon HeartGold Version
1001                         Pokemon White Version
1147                         Pokemon Black Version
2914                          Pokemon Ruby Version
4198              Pokemon Sword / Shield Dual Pack
8704                             Pokemon Colosseum
15800                               Pokemon Rumble
14591    Pokemon Mystery Dungeon: Blue Rescue Team
10996                                Pokemon UNITE
4866                              New Pokemon Snap
Name: name, dtype: object