In [96]:
import pandas as pd

game_reviews = pd.read_csv('data\\Processed Data\\game_reviews.csv')
games = pd.read_csv('data\\Processed Data\\games.csv')
genres = pd.read_csv('data\\Processed Data\\genres.csv')
game_genre_tags = pd.read_csv('data\\Processed Data\\game_genre_tags.csv')
platforms = pd.read_csv('data\\Processed Data\\platforms.csv')

To Do:
 - What data needs to be added to review table for the engine?
 - Preliminary Neural Network model
 - How to allow a user to access the model, and make further preferential selections?

#### Prep Data

In [17]:
titles = games[['title', 'game_ID']]
reviews = pd.merge(game_reviews, titles, how="left", on="game_ID")
reviews = reviews[pd.notnull(reviews['review_text'])]

In [18]:
reviews.isna().sum()

game_ID             0
user_ID             0
score            3170
review_text         0
date           193598
title               0
dtype: int64

#### Build Model

In [19]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [20]:
#assert gensim.models.doc2vec.FAST_VERSION > -1

In [127]:
reviews

Unnamed: 0,game_ID,user_ID,score,review_text,date,title
0,10000,1,100.0,"To say that this game is ""fast"" is an understa...",,Burnout 3: Takedown
1,10001,1,70.0,The gothic-anime art direction and voice actin...,,Eternal Poison
2,10012,1,65.0,"Series fans'll eat it up , but others may want...",,Digimon World: Dusk
3,10013,1,70.0,"The result, though a little awkward, is a hybr...",,Sigma Star Saga
4,10042,1,10.0,While it is a nice throwback to the retro arca...,,NBA Unrivaled
...,...,...,...,...,...,...
1080619,26948,323194,10.0,The best dlc ever for the best saga in gaming...,"Jun 30, 2017",Dark Souls III: The Ringed City
1080620,26949,323195,6.0,"Well, here we are. Pinball FX3 is probably the...","Jan 14, 2018",Pinball FX3
1080621,26949,323196,9.0,Surprisingly fun and addicting game. I have ...,"Mar 4, 2018",Pinball FX3
1080622,26949,323197,4.0,I have always loved pinball. I've found it cap...,"Dec 31, 2017",Pinball FX3


In [21]:
# MyDocs reading from a data frame
class MyDocs(object):
    def __iter__(self):
        for i in range(reviews.shape[0]):
            yield TaggedDocument(words=simple_preprocess(reviews.iloc[i,3]), tags=['%s' % reviews.iloc[i,0]])

In [22]:
%%time
import multiprocessing
import os
cores = multiprocessing.cpu_count()

try:
    doc2vec_model = Doc2Vec.load('models/doc2vec.model')

except:
    print("start traing doc2vec model...")
    documents = MyDocs()
    doc2vec_model = Doc2Vec(dm=1, dbow_words=1, vector_size=200, window=8, min_count=20, workers=cores)
    doc2vec_model.build_vocab(documents)
    doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    if not os.path.exists('models'):
        os.makedirs('models')
        doc2vec_model.save('models/doc2vec.model')
    else:
        doc2vec_model.save('models/doc2vec.model')

Wall time: 2.15 s


In [23]:
def search(keyword):
    result = []
    for name in reviews.title:
        if keyword in name.lower():
            result.append(name)
    return set(result)

In [24]:
search('overwatch')

{'Overwatch'}

In [64]:
vec = doc2vec_model['cars']
doc2vec_model.docvecs.most_similar([vec], topn=5)

[('RIDE 3', 0.5268259644508362),
 ('Project Gotham Racing 3', 0.5253642797470093),
 ('Driveclub Bikes', 0.5123881101608276),
 ('Midnight Club: Los Angeles', 0.5112804770469666),
 ('RIDE', 0.5072597861289978)]

In [82]:
doc2vec_model.docvecs.most_similar('Terraria', topn=20)

[('Minecraft', 0.8085135221481323),
 ('Starbound', 0.7476636171340942),
 ('Minecraft: Xbox 360 Edition', 0.731022834777832),
 ("Don't Starve", 0.6798550486564636),
 ('Minecraft: PlayStation 4 Edition', 0.6790744662284851),
 ('Minecraft: Xbox One Edition', 0.6783989071846008),
 ('Portal Knights', 0.6619856357574463),
 ('Minecraft: PlayStation 3 Edition', 0.647018551826477),
 ('Minecraft Dungeons', 0.6276648044586182),
 ('SteamWorld Dig', 0.6103273630142212)]

In [124]:
class Recommender(object):
    def __init__(self, model, games_df, platform_df, genre_df, game_genre_tags_df):
        self.model = model
        self.games_df = games_df
        self.platform_df = platform_df
        self.genre_df = genre_df
        self.game_genre_tags_df = game_genre_tags_df
        
        self.genre_dict = {}
        for i in self.game_genre_tags_df['game_ID'].unique():
            self.genre_dict[i] = [self.game_genre_tags_df['genre_ID'][j] for j in self.game_genre_tags_df[self.game_genre_tags_df['game_ID']==i].index]

    def create_lookup(self):
        self.lookup_df = self.games_df[['game_ID', 'title', 'platform_ID', 'summary', 'url']]
        self.lookup_df['game_ID'] = self.lookup_df['game_ID'].astype(str)
        self.lookup_dict = self.lookup_df.set_index('game_ID').to_dict(orient='index')
        return self.lookup_dict
        
    def get_recommendations(self, keyword):
        return self.model.docvecs.most_similar(keyword, topn=100)

    def get_filtered_recommendations(self, keyword, platform_ID, genre_IDs, n):
        lookup_dict = self.create_lookup()
        ranked_results = self.get_recommendations(keyword)
        
        filtered_results = {}

        for game in ranked_results:
            if len(filtered_results) < n:
                dict_id = game[0]
                dict_game_name = lookup_dict[game[0]]['title']
                dict_platform_ID = lookup_dict[game[0]]['platform_ID']
                dict_summary = lookup_dict[game[0]]['summary']
                dict_url = lookup_dict[game[0]]['url']
                
                if (platform_ID is None) and (len(genre_IDs) == 0):
                    filtered_results[dict_id] = (dict_game_name, dict_summary, dict_url)

                elif len(genre_IDs) == 0:
                    if (dict_platform_ID == platform_ID):
                        if dict_id in filtered_results:
                            continue
                        else:
                            filtered_results[dict_id] = (dict_game_name, dict_summary, dict_url)

                elif platform_ID is None:
                    if set(genre_IDs) & set(self.genre_dict[dict_id]) > 0:
                        if dict_id in filtered_results:
                            continue
                        else:
                            filtered_results[dict_id] = (dict_game_name, dict_summary, dict_url)

                elif (dict_platform_ID == platform_ID) and (set(genre_IDs) & set(self.genre_dict[dict_id]) > 0):
                        if dict_id in filtered_results:
                            continue
                        else:  
                            filtered_results[dict_id] = (dict_game_name, dict_summary, dict_url)

        if len(filtered_results) > 0:
            return filtered_results
        else:
            return 
        
    def lookup_id(self, id_number, id_type):
        if id_type == 'game':
            return self.games_df[self.games_df['game_ID']==id_number]
            
        elif id_type == 'platform':
            return self.platform_df[self.platform_df['platform_ID']==id_number]['platform'][0]
        
        elif id_type == 'genre':
            return self.genre_df[self.genre_df['genre_ID']==id_number]['genre_name'][0]
        
        else:
            raise ValueError("id_type must equal 'game', 'platform', or 'genre'")
        

In [125]:
test = Recommender(doc2vec_model, games, platforms, genres, game_genre_tags)

In [126]:
test.get_filtered_recommendations('Terraria', 3, [], 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


KeyError: 'Minecraft'