In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
data = pd.read_csv('cleaned_vgchartz.csv')
print(data.head())
print(data.shape)
print(data['console'].nunique(), 'unique console')
print(data['title'].nunique(), 'unique title')

                                             img                       title  \
0  /games/boxart/full_5563178AmericaFrontccc.jpg          Grand Theft Auto V   
1  /games/boxart/full_4990510AmericaFrontccc.jpg   Call of Duty: Black Ops 3   
2  /games/boxart/full_4653215AmericaFrontccc.jpg       Red Dead Redemption 2   
3  /games/boxart/full_1977964AmericaFrontccc.jpg  Call of Duty: Black Ops II   
4  /games/boxart/full_4649679AmericaFrontccc.png  Call of Duty: Black Ops II   

  console             genre       publisher       developer  rating  \
0     PS4            Action  Rockstar Games  Rockstar North     9.7   
1     PS4           Shooter      Activision        Treyarch     8.1   
2     PS4  Action-Adventure  Rockstar Games  Rockstar Games     9.8   
3    X360           Shooter      Activision        Treyarch     8.4   
4     PS3           Shooter      Activision        Treyarch     8.0   

   total_sales  na_sales  jp_sales  pal_sales  other_sales release_date  \
0        19.39   

In [3]:
#Drop columns with sales since total_sales is available
df = data.drop(['last_update', 'developer', 'na_sales', 'jp_sales', 'pal_sales', 'other_sales'], axis=1)
df.head()

Unnamed: 0,img,title,console,genre,publisher,rating,total_sales,release_date
0,/games/boxart/full_5563178AmericaFrontccc.jpg,Grand Theft Auto V,PS4,Action,Rockstar Games,9.7,19.39,2014-11-18
1,/games/boxart/full_4990510AmericaFrontccc.jpg,Call of Duty: Black Ops 3,PS4,Shooter,Activision,8.1,15.09,2015-11-06
2,/games/boxart/full_4653215AmericaFrontccc.jpg,Red Dead Redemption 2,PS4,Action-Adventure,Rockstar Games,9.8,13.94,2018-10-26
3,/games/boxart/full_1977964AmericaFrontccc.jpg,Call of Duty: Black Ops II,X360,Shooter,Activision,8.4,13.86,2012-11-13
4,/games/boxart/full_4649679AmericaFrontccc.png,Call of Duty: Black Ops II,PS3,Shooter,Activision,8.0,13.8,2012-11-13


In [4]:
# Create a tag to combine genre and publisher
video_games = df[['console', 'title', 'genre', 'publisher']]
video_games = video_games.assign(tags=video_games['genre'] + ' ' + video_games['publisher'])
video_games

Unnamed: 0,console,title,genre,publisher,tags
0,PS4,Grand Theft Auto V,Action,Rockstar Games,Action Rockstar Games
1,PS4,Call of Duty: Black Ops 3,Shooter,Activision,Shooter Activision
2,PS4,Red Dead Redemption 2,Action-Adventure,Rockstar Games,Action-Adventure Rockstar Games
3,X360,Call of Duty: Black Ops II,Shooter,Activision,Shooter Activision
4,PS3,Call of Duty: Black Ops II,Shooter,Activision,Shooter Activision
...,...,...,...,...,...
277,3DS,RPG Maker: Fes,Role-Playing,NIS America,Role-Playing NIS America
278,NS,BlazBlue: Cross Tag Battle,Fighting,Arc System Works,Fighting Arc System Works
279,NS,Disgaea 1 Complete,Role-Playing,NIS America,Role-Playing NIS America
280,PS2,GrimGrimoire,Strategy,NIS America,Strategy NIS America


In [5]:
# Drop genre and publisher
new_df = video_games.drop(columns=['genre', 'publisher'])
new_df

Unnamed: 0,console,title,tags
0,PS4,Grand Theft Auto V,Action Rockstar Games
1,PS4,Call of Duty: Black Ops 3,Shooter Activision
2,PS4,Red Dead Redemption 2,Action-Adventure Rockstar Games
3,X360,Call of Duty: Black Ops II,Shooter Activision
4,PS3,Call of Duty: Black Ops II,Shooter Activision
...,...,...,...
277,3DS,RPG Maker: Fes,Role-Playing NIS America
278,NS,BlazBlue: Cross Tag Battle,Fighting Arc System Works
279,NS,Disgaea 1 Complete,Role-Playing NIS America
280,PS2,GrimGrimoire,Strategy NIS America


In [6]:
# Apply CountVectorizer to create matrix for recommender
cv = CountVectorizer(max_features=285, stop_words='english')
cv

In [7]:
# Convert tags column 
vector = cv.fit_transform(new_df['tags'].values.astype('U')).toarray()
vector.shape

(282, 70)

In [8]:
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.        , 0.8660254 , ..., 0.        , 0.        ,
        0.57735027],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.8660254 , 0.        , 1.        , ..., 0.        , 0.        ,
        0.5       ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.57735027,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.57735027, 1.        ,
        0.        ],
       [0.57735027, 0.        , 0.5       , ..., 0.        , 0.        ,
        1.        ]])

In [9]:
def recommend(game_title):
    if game_title not in new_df['title'].values:
        print('Game title not found.')
        
    # Calculate similarity score
    index = new_df[new_df['title']==game_title].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    
    print(f"Recommendations for '{game_title}':")
    for i in distance[1:6]:
        print(new_df.iloc[i[0]].title)

In [10]:
# try out recommender code
recommend('Grand Theft Auto V')

Recommendations for 'Grand Theft Auto V':
Grand Theft Auto V
Red Dead Redemption 2
Fate/Extella: The Umbral Star
Mafia III
No Man's Sky


In [11]:
pickle.dump(new_df, open('video_games_list.pkl', 'wb'))

In [12]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [13]:
loaded_df = pickle.load(open('video_games_list.pkl', 'rb'))
loaded_similarity = pickle.load(open('similarity.pkl', 'rb'))

In [14]:
recommend('Disgaea 1 Complete')

Recommendations for 'Disgaea 1 Complete':
RPG Maker: Fes
Disgaea 1 Complete
Dragon Quest VIII: Journey of the Cursed King
Xenoblade Chronicles X
Monster Hunter 3 Ultimate


In [15]:
# Try using an incomplete game title
recommend('Black Ops')

Game title not found.


IndexError: index 0 is out of bounds for axis 0 with size 0

**One con with the recommender is that the user must input the exact title.**

The model used to create the video game recommender system is Content-Based Filtering.