In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from fuzzywuzzy import fuzz
import ipywidgets as widgets
from IPython.display import display

# read csv for games
game_data = pd.read_csv(r"C:\Users\firen\OneDrive\Documents\School-LAPTOP-JU6N3L8Q\Capstone\Steam Games Dataset\steam.csv")

pd.options.display.show_dimensions = False

# prints full frame
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')


# test for loaded csv
# print(game_data.shape)
# print(game_data.head())

# check for null values
# print(game_data.columns[game_data.isna().any()].tolist())
# how many null values
# game_data.isnull().sum()


# convert full date to just the year
def convert_date_to_year(date):
    year = date[:4]
    if year.isnumeric():
        return int(year)
    else:
        return np.nan


game_data['year'] = game_data['release_date'].apply(convert_date_to_year)


# calc pos rating percentage using pos and neg ratings to create new column 'rating_score'
def calc_pos_rating_percentage(row):
    pos_ratings = row['positive_ratings']
    neg_ratings = row['negative_ratings']
    total_count = pos_ratings + neg_ratings
    average = pos_ratings / total_count
    return round(average, 2)


# calc rating sum using pos and neg ratings to create new column 'rating_sum'
def calc_rating_sum(row):
    pos_ratings = row['positive_ratings']
    neg_ratings = row['negative_ratings']
    rating_sum = pos_ratings + neg_ratings
    return rating_sum


game_data['pos_rating_percentage'] = game_data.apply(calc_pos_rating_percentage, axis=1)
game_data['rating_sum'] = game_data.apply(calc_rating_sum, axis=1)

# calc the mean of the pos rating percentage
C = game_data['pos_rating_percentage'].mean()
# TEST
# print(C)

# calc min number of ratings required to be considered for recommendation; only the top 10% considered
m = game_data['rating_sum'].quantile(0.85)

# remove games below this percentile so that jupyter notebooks can handle the data size
game_data.drop(game_data[game_data.rating_sum < m].index, inplace=True)


# reset index to fix errors
game_data.reset_index(inplace  = True)
# TEST
# print(m)


# calc and create a weighted rating for each game using the above variables
def calc_weighted_rating(x, m=m, C=C):
    v = x['rating_sum']
    R = x['pos_rating_percentage']
    # below formula based on IMDB weighted rating formula to be cited
    return round((v / (v + m) * R) + (m / (m + v) * C), 2) * 100


# create new column 'weighted_rating' from the above formula
game_data['weighted_rating'] = game_data.apply(calc_weighted_rating, axis=1)

# TEST print head to ensure above worked
# print(game_data[['name', 'rating_sum', 'pos_rating_percentage', 'weighted_rating']].head(20))

# change multi word tags into hyphenated tags for easier interpretation
game_data['steamspy_tags'] = game_data['steamspy_tags'].str.replace(' ', '-')

# replace ';' in tag words with a space so that the vectorizer can interpret words easier
game_data['steamspy_tags'] = game_data['steamspy_tags'].str.replace(';', ' ')

# now each column is easier to read with vectorization below. we technically can use only the tag column since it
# is more detailed than the genre column and includes what is in the genre column already. for ease of use, we will
# simplify this by copying the tags column into the genre column so that they can be interpreted the exact same
# this can be changed in later developments which is why the above code has been left in but commented out
game_data['genres'] = game_data['steamspy_tags'].str.replace(';', ' ')

# count the number of times each specific genre is used
count = dict()
for i in game_data.index:
    # split each element from the genre column per row by a space so it can be counted and interpreted
    for g in game_data.loc[i, 'genres'].split(' '):
        # if the element encountered is not already in counts, count it
        if g not in count:
            # start genre dict count at 1
            count[g] = 1
        else:
            # increase genre dict count by 1
            count[g] = count[g] + 1

# TEST count number of genres
# count.keys()
# print(count['Strategy'])
# print(game_data[['genres', 'steamspy_tags']].head(20))

# create Tfigame_dataVectorizer object
tfigame_data_vector = TfidfVectorizer(stop_words='english')

# generate matrix of genres/tags
tfigame_data_matrix = tfigame_data_vector.fit_transform(game_data['genres'])

# TEST find size of above generated matrix and see which words it is enumerated and check against csv
# print(tfigame_data_matrix.shape)
# print(list(enumerate(tfigame_data_vector.get_feature_names_out())))

# generate cosine similarity matrix to begin foundation for recommendation of games
cosine_similarity_matrix = linear_kernel(tfigame_data_matrix, tfigame_data_matrix)

# TEST
# print(cosine_similarity_matrix)

In [None]:
# THIS IS WHERE DATA VISUALIZATION GOES!!!!!

In [None]:
# MORE SPACE IF NEEDED

In [24]:
# getters for info when making recommendations
def get_game_year_from_index(index):
    return game_data[game_data.index == index]['year'].values[0]


def get_game_name_from_index(index):
    return game_data[game_data.index == index]['name'].values[0]


def get_game_index_from_name(name):
    return game_data[game_data.name == name].index.values[0]


def get_game_developer_from_index(index):
    return game_data[game_data.index == index]['developer'].values[0]


def get_game_genre_from_index(index):
    return game_data[game_data.index == index]['genres'].values[0]


def get_game_pos_rating_percentage_from_index(index):
    return game_data[game_data.index == index]['pos_rating_percentage'].values[0]


def get_weighted_rating_from_index(index):
    return game_data[game_data.index == index]['weighted_rating'].values[0]


def get_rating_sum_from_index(index):
    return game_data[game_data.index == index]['rating_sum'].values[0]


def get_platforms_from_index(index):
    return game_data[game_data.index == index]['platforms'].values[0]


# levenshtein distance between phrases a and b to determine similarity and find the closest matching phrase
# used for finding phrases closest to user input
def phrase_match_score(a, b):
    return fuzz.ratio(a, b)
    # if a and b are the same the score will be 100


# find closest game name is used for when user inputs a text that doesn't match perfectly with a specific game title
# this way a user can still use the recommender without knowing all games within the dataset
# this is the most typical way for a user to use the recommender
def find_closest_game_name(name):
    # create a list of match differences by index using input
    levenshtein_scores = list(enumerate(game_data['name'].apply(phrase_match_score, b=name)))
    # sort the above list by distance
    sorted_levenshtein_scores = sorted(levenshtein_scores, key=lambda x: x[1], reverse=True)
    # find closest name with the shortest distance
    closest_name = get_game_name_from_index(sorted_levenshtein_scores[0][0])
    # get the score
    distance_score = sorted_levenshtein_scores[0][1]
    return closest_name, distance_score


# TEST for finding closest phrase
# print(find_closest_game_name("Team Fort"))


# this function returns the top 15 closest names to give user options when doing input
def closest_game_names(name):
    levenshtein_scores = list(enumerate(game_data['name'].apply(phrase_match_score, b=name)))
    sorted_levenshtein_scores = sorted(levenshtein_scores, key=lambda x: x[1], reverse=True)
    top_15_closest_names = [get_game_name_from_index(i[0]) for i in sorted_levenshtein_scores[:15]]
    return top_15_closest_names


# TEST for finding closest names of a game
# print(closest_names('Peach'))

def content_based_recommender(user_input_game, number_of_recommendations): #removed platform param at end
    # find closest game name match
    closest_game_name, distance_score = find_closest_game_name(user_input_game)

    # generate a dataframe to store the recommendations in
    recommended_games = pd.DataFrame(
        columns=['Game Title', 'Year', 'Developer', 'Genre', 'Positive Rating Percentage', 'Total Ratings',
                 'Weighted Rating'])

    # find the corresponding index of the game title
    game_index = get_game_index_from_name(closest_game_name)

    # return a list of the most similar game indexes as a list
    games_list = list(enumerate(cosine_similarity_matrix[int(game_index)]))

    # Sort list of similar games from top to bottom
    similar_games = list(
        filter(lambda x: x[0] != int(game_index), sorted(games_list, key=lambda x: x[1], reverse=True)))

    # Print the game title the similarity matrix is based on
    print('Here\'s the list of games similar to ' + '\033[1m' + str(closest_game_name) + '\033[0m' + ':\n')

    # Only return the games that are on selected platform
    # n_games = []
    # for i, s in similar_games:
      #  if platform in get_platforms_from_index(i):
            # append method deprecated
       #     print(s)
        #    n_games.append((i, s))
            # n_games = pd.concat([tmp, (i, s)])

    # Return the game tuple (game index, game distance score) and store in a dataframe
    for i, s in similar_games[int():int(number_of_recommendations)]:
        row = {'Game Title': get_game_name_from_index(i),
               'Year': get_game_year_from_index(i),
               'Developer': get_game_developer_from_index(i),
               'Genre': get_game_genre_from_index(i),
               'Positive Rating Percentage': get_game_pos_rating_percentage_from_index(i),
               'Total Ratings': get_rating_sum_from_index(i),
               'Weighted Rating': get_weighted_rating_from_index(i), }
        tmp = pd.DataFrame.from_dict([row])
        # print(tmp)
        # tmp = []
        # tmp.append(row)
        # append each row to this dataframe
        # append deprecated
        recommended_games = pd.DataFrame(recommended_games, columns = ['Game Title', 'Year', 'Developer',
                                                                       'Genre', 'Positive Rating Percentage',
                                                                      'Total Ratings', 'Weighted Rating'])
        recommended_games = pd.concat([recommended_games, tmp], ignore_index=True)
        # tmp = []
        # recommended_games = pd.concat(tmp, (row)) # add to after row for append , ignore_index=True)
    # if 'Weighted Ratings' in recommended_games.columns:
    #   del game_data['Weighted Ratings']

    return recommended_games# .to_string()

In [25]:
# begin use input
print("What game would you like to find games similar to? Here are some suggestions:")
display(game_data.head(10))

game_name_text = widgets.Text(
    placeholder='Type a game',
    description='Game Title:',
    disabled=False
)

game_name_btn = widgets.Button(description='Submit')
display(game_name_text, game_name_btn)

def game_name_btn_eventhandler(obj):
    names = closest_game_names(game_name_text.value)
    
    # clarify game in order to ensure recommender works properly may or may not need this
    print("Did you mean one of these games?")
    # display(names)
    game_name_dropdown = widgets.Dropdown(
        options=names,
        placeholder = 'Confirm game choice',
        description='Game Title:',
        disabled=False,
    )
    game_name_dropdown_btn = widgets.Button(description = 'Submit')
    display(game_name_dropdown, game_name_dropdown_btn)
    
    def game_name_dropdown_eventhandler(obj):
        user_input_game = game_name_dropdown.value
        
        print("How many games would you like me to recommend for you? Please type numbers only.")
        num_rec_slider = widgets.IntSlider(
            min=0,
            max=20,
            step=1,
            description='Number of Recommendations:',
            value=10
        )
        num_rec_btn = widgets.Button(description='Submit')
        display(num_rec_slider, num_rec_btn)

        def num_rec_btn_eventhandler(obj):
            number_of_recommendations = num_rec_slider.value
            print("Awesome! I'll start getting you game recommendations.\n")

            recommendations_for_user = content_based_recommender(user_input_game, number_of_recommendations)
# recommendations_for_user.drop('Weighted Ratings', axis=1, inplace=True)
            display(recommendations_for_user)
            print("\nTo start over, run cell again.")
            
    
        num_rec_btn.on_click(num_rec_btn_eventhandler)

    game_name_dropdown_btn.on_click(game_name_dropdown_eventhandler)

game_name_btn.on_click(game_name_btn_eventhandler)    


What game would you like to find games similar to? Here are some suggestions:


Unnamed: 0,index,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,...,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price,year,pos_rating_percentage,rating_sum,weighted_rating
0,0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,124534,3339,17612,317,10000000-20000000,7.19,2000,0.97,127873,97.0
1,1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,3318,633,277,62,5000000-10000000,3.99,1999,0.84,3951,83.0
2,2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,...,3416,398,187,34,5000000-10000000,3.99,2003,0.9,3814,88.0
3,3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,...,1273,267,258,184,5000000-10000000,3.99,2001,0.83,1540,80.0
4,4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,...,5250,288,624,415,5000000-10000000,3.99,1999,0.95,5538,93.0
5,5,60,Ricochet,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Valve Anti-Ch...,...,2758,684,175,10,5000000-10000000,3.99,2000,0.8,3442,79.0
6,6,70,Half-Life,1998-11-08,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Online Multi-Player...,...,27755,1100,1300,83,5000000-10000000,7.19,1998,0.96,28855,96.0
7,7,80,Counter-Strike: Condition Zero,2004-03-01,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,...,12120,1439,427,43,10000000-20000000,7.19,2004,0.89,13559,88.0
8,8,130,Half-Life: Blue Shift,2001-06-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player,...,3822,420,361,205,5000000-10000000,3.99,2001,0.9,4242,88.0
9,9,220,Half-Life 2,2004-11-16,1,Valve,Valve,windows;mac;linux,0,Single-player;Steam Achievements;Steam Trading...,...,67902,2419,691,402,10000000-20000000,7.19,2004,0.97,70321,97.0


Text(value='', description='Game Title:', placeholder='Type a game')

Button(description='Submit', style=ButtonStyle())

Did you mean one of these games?


Dropdown(description='Game Title:', options=('Ziggurat', 'Fingered', 'Negligee', 'Dungreed', 'Digger Online', …

Button(description='Submit', style=ButtonStyle())

How many games would you like me to recommend for you? Please type numbers only.


IntSlider(value=10, description='Number of Recommendations:', max=20)

Button(description='Submit', style=ButtonStyle())

Awesome! I'll start getting you game recommendations.

Here's the list of games similar to [1mZiggurat[0m:



Unnamed: 0,Game Title,Year,Developer,Genre,Positive Rating Percentage,Total Ratings,Weighted Rating
0,Siralim 3,2018,Thylacine Studios,RPG Procedural-Generation Turn-Based,0.96,510,84.0
1,Immortal Redneck,2017,Crema,Rogue-lite Action FPS,0.91,1383,86.0
2,Sublevel Zero Redux,2015,Sigtrap,Indie Action Rogue-lite,0.89,461,80.0
3,In Death,2018,Sólfar Studios,Action Indie Rogue-lite,0.92,504,82.0
4,Heroes of Hammerwatch,2018,Crackshell,RPG Indie Rogue-lite,0.8,1757,78.0
5,Vagante,2018,Nuke Nine,Rogue-like Pixel-Graphics Rogue-lite,0.89,1857,86.0
6,Monolith,2017,Team D-13,Rogue-lite Indie Shoot-'Em-Up,0.99,529,86.0
7,Paranautical Activity: Deluxe Atonement Edition,2014,Digerati Distribution,Rogue-like FPS Indie,0.69,1391,70.0
8,Tower of Guns,2014,Terrible Posture Games,FPS Rogue-like Indie,0.82,974,79.0
9,Nuclear Throne,2015,Vlambeer,Rogue-like Indie Action,0.96,10099,95.0



To start over, run cell again.
