In [1]:
import pandas as pd
import numpy as np
import nltk
# nltk.download('omw-1.4')
# nltk.download('all')

In [5]:
df = pd.read_csv('../input/top-video-games-19952021-metacritic/all_games.csv')
df.head()

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9


In [6]:
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [7]:
df.describe()

Unnamed: 0,meta_score
count,18686.0
mean,70.656481
std,12.233243
min,20.0
25%,64.0
50%,72.0
75%,80.0
max,99.0


In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def nlp(df, text):
    #dummy column
    df['dummy'] = df[text].astype(str).str.lower()
    
    token_summ = (word_tokenize(post) for post in df['dummy'])
#     df['token_summ'] = [i for i in token_summ]
    
    reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')
    
    no_punc = []
    
    for filt in token_summ:
        review = []
        for token in filt:
            new_token = reg.sub('', token)
            if new_token != '':
                review.append(new_token)
        no_punc.append(review)
    
    no_stop = []
    
    for post in no_punc:
        new_term_vector = []
        for word in post:
            if not word in stopwords.words('english'):
                new_term_vector.append(word)
        no_stop.append(new_term_vector)
    
    pstem = PorterStemmer()
    wlem = WordNetLemmatizer()
    
    preproc_text = []
    
    for text in no_stop:
        final_text = []
        for word in text:
            pstem.stem(word)
            final_text.append(wlem.lemmatize(word))
        preproc_text.append(final_text)
        

    df['proc_summary'] = pd.Series(preproc_text)
    df.drop('dummy', axis=1, inplace=True)
    return df
    
proc_data = nlp(df, 'summary')
    

In [31]:
proc_data.head()

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review,proc_summary
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1,"[young, boy, link, tricked, ganondorf, king, g..."
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4,"[major, publisher, development, effort, shift,..."
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7,"[metacritic, 2008, ps3, game, year, also, know..."
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4,"[tale, soul, sword, transcending, world, histo..."
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9,"[metacritic, 2008, xbox, 360, game, year, also..."
5,Super Mario Galaxy,Wii,"November 12, 2007",[Metacritic's 2007 Wii Game of the Year] The u...,97,9.1,"[metacritic, 2007, wii, game, year, ultimate, ..."
6,Super Mario Galaxy 2,Wii,"May 23, 2010","Super Mario Galaxy 2, the sequel to the galaxy...",97,9.1,"[super, mario, galaxy, 2, sequel, galaxyhoppin..."
7,Red Dead Redemption 2,Xbox One,"October 26, 2018",Developed by the creators of Grand Theft Auto ...,97,8.0,"[developed, creator, grand, theft, auto, v, re..."
8,Grand Theft Auto V,Xbox One,"November 18, 2014",Grand Theft Auto 5 melds storytelling and game...,97,7.9,"[grand, theft, auto, 5, meld, storytelling, ga..."
9,Grand Theft Auto V,PlayStation 3,"September 17, 2013","Los Santos is a vast, sun-soaked metropolis fu...",97,8.3,"[los, santos, vast, sunsoaked, metropolis, ful..."


In [32]:
proc_data.reset_index(drop=True, inplace=True)
proc_data.head()

Unnamed: 0,name,platform,release_date,summary,meta_score,user_review,proc_summary
0,The Legend of Zelda: Ocarina of Time,Nintendo 64,"November 23, 1998","As a young boy, Link is tricked by Ganondorf, ...",99,9.1,"[young, boy, link, tricked, ganondorf, king, g..."
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",As most major publishers' development efforts ...,98,7.4,"[major, publisher, development, effort, shift,..."
2,Grand Theft Auto IV,PlayStation 3,"April 29, 2008",[Metacritic's 2008 PS3 Game of the Year; Also ...,98,7.7,"[metacritic, 2008, ps3, game, year, also, know..."
3,SoulCalibur,Dreamcast,"September 8, 1999","This is a tale of souls and swords, transcendi...",98,8.4,"[tale, soul, sword, transcending, world, histo..."
4,Grand Theft Auto IV,Xbox 360,"April 29, 2008",[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,7.9,"[metacritic, 2008, xbox, 360, game, year, also..."


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([str(i) for i in proc_data['proc_summary']])
tfidf_matrix.shape

(10, 368)

In [34]:
tfidf_matrix

<10x368 sparse matrix of type '<class 'numpy.float64'>'
	with 471 stored elements in Compressed Sparse Row format>

In [3]:
# tfidf.get_feature_names()

In [4]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

NameError: name 'tfidf_matrix' is not defined

In [37]:
indices = pd.Series(proc_data.index, index=proc_data['name']).drop_duplicates()
indices

name
The Legend of Zelda: Ocarina of Time    0
Tony Hawk's Pro Skater 2                1
Grand Theft Auto IV                     2
SoulCalibur                             3
Grand Theft Auto IV                     4
Super Mario Galaxy                      5
Super Mario Galaxy 2                    6
Red Dead Redemption 2                   7
Grand Theft Auto V                      8
Grand Theft Auto V                      9
dtype: int64

In [38]:
def recommend(title, cosine_sim=cosine_sim):
    idx = indices[title]
    
    similarity = list(enumerate(cosine_sim[idx]))
    
    similarity = sorted(similarity, key=lambda x: x[1], reverse=True)
    
    similarity = similarity[1:11]
    
    game_indices = [i[0] for i in similarity]
    
    recs = proc_data['name'].iloc[game_indices]
    
    return recs

In [39]:
recommend('Pokemon HeartGold Version')

KeyError: 'Pokemon HeartGold Version'