# Steam video game recommendation
The objective of this project is to classify each game in the dataset (after splitting it into training and test sets) into one of several audience-approval categories. I define these categories during the project, based on each game's approval percentage.

In [None]:
import pandas as pd
import re
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import statistics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


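Removing some columns that I considered unnecessary.
I initially decided to remove the genres, because one-hot encoding them would pollute the dataset with many extra columns and slow the analysis down even more. (Note: in the current version of the notebook the genres are kept and turned into indicator columns in the "Working with genres" section.)
I also thought that the studio that developed a game would be more relevant than its publisher, so I removed the publisher. (Note: the cell below currently drops the `developer` column as well.)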

In [None]:
# Load the raw Steam catalogue.
steam_data = pd.read_csv('steam.csv')

# Drop the columns that are not used as features or as the target.
# 'genres' and 'categories' are deliberately kept: later cells turn 'genres' into
# indicator columns and condense 'categories' into Singleplayer / Multiplayer / Hybrid,
# so dropping 'categories' here would make those cells fail with a KeyError.
steam_data = steam_data.drop(columns=[
    'appid', 'name', 'price', 'english', 'achievements', 'required_age',
    'release_date', 'developer', 'publisher', 'platforms', 'steamspy_tags',
])

# Show a small preview instead of dumping the whole frame.
steam_data.head()

In [None]:
# Compare the number of distinct raw genre strings with the number of rows.
print("Seems we have a lot of unique values for the genres, we need to do some feature engineering")
steam_data["genres"].nunique(dropna=False), len(steam_data)

## Working with genres

In [None]:
# Build the genre vocabulary: every distinct genre name, in first-seen order.
# Each raw value is a ';'-separated string; dict.fromkeys de-duplicates while
# preserving the order in which genres are first encountered.
genres = list(dict.fromkeys(
    genre_name
    for genre_string in steam_data.genres.unique()
    for genre_name in genre_string.split(";")
))

genres

In [None]:
# Append one (initially empty) column per genre after the existing columns.
columns_with_genres = [*steam_data.columns, *genres]
steam_data = steam_data.reindex(columns=columns_with_genres)
steam_data.head()

In [None]:
ids = steam_data.index  # not needed below; kept in case another cell refers to it

# Split every game's genre string once. The separator must be ';' -- the same one
# used when the genre vocabulary was built; splitting on any other character
# leaves multi-genre strings unsplit, so those games would match no genre at all.
genre_sets = steam_data["genres"].str.split(";").map(set)

# Fill each genre column with a 0/1 indicator: 1 when the game lists that genre.
for genre in genres:
    steam_data[genre] = genre_sets.map(lambda game_genres, g=genre: int(g in game_genres))

# The raw string column is fully represented by the indicator columns now.
steam_data = steam_data.drop(columns="genres")

In [None]:
# Store every genre indicator in the smallest integer dtype that can hold it.
steam_data = steam_data.assign(
    **{column: pd.to_numeric(steam_data[column], downcast="integer") for column in genres}
)

steam_data.tail()

## Finding the median of each game's estimated owner range and replacing the 'owners' column with that median

In [None]:

# Each 'owners' value is a range written as "<lower bound>-<upper bound>".
owners = steam_data['owners']

# Split every range into its numeric bounds in one vectorised step
# (one column per bound) instead of looping over the rows in Python.
owner_bounds = owners.str.split('-', expand=True).astype(int)

# Median of the bounds of each range (for two bounds this is their midpoint).
# The result keeps the index of `steam_data`, so the assignment below is aligned
# by row label rather than relying on the frame having a default 0..n-1 index.
owners_median = owner_bounds.median(axis=1).astype(float)

# Replace the textual range with the numeric summary column.
steam_data = steam_data.drop(columns=['owners'])
steam_data['median_of_owners'] = owners_median

In [None]:
# Turning categories into binary numbers.
#
# The "categories" column describes what type of game each title is, so the first
# step is to find out which categories are possible at all.
# Afterwards all of them are reduced to just single-player and multi-player,
# which simplifies the analysis.
categories = steam_data['categories']

# A game can carry several categories at once; they are separated by ';',
# so every entry is broken into a list of its individual category names.
strings = [entry.split(';') for entry in categories]

# `strings` is a list of lists of strings; `temp_list` flattens it into one
# plain list of strings with no nesting.
temp = list(strings)
temp_list = [label for labels in temp for label in labels]

# A set removes the duplicates and leaves the distinct category names.
categories_set = set(temp_list)

In [None]:
# Show every distinct category name found in the dataset (a set has no defined order).
print(categories_set)


The categories were organized like this:

Multiplayer (here will also fit the categories: Co-op, Online Co-op, Online Multi-Player, Cross-Platform Multiplayer, Local Multi-Player, Local Co-op, Shared/Split Screen, and MMO);

Singleplayer;

Hybrid (if the game has the option to be played both multiplayer and singleplayer).

In [None]:
# Classify every game as 'Hybrid', 'Multiplayer' or 'Singleplayer' and collect the
# labels, one per game and in row order, in `condensed_category`.
multiplayer = ['Multi-player', 'Co-op', 'Online Co-op', 'Online Multi-Player', 'Cross-Platform Multiplayer', 'Local Multi-Player', 'Local Co-op', 'Shared/Split Screen', 'MMO']
multiplayer_labels = set(multiplayer)

condensed_category = []

# Iterate over `strings` (one list of category names per game), not over the
# flattened `temp_list`: the flattened list holds one entry per category token, so
# it yields more labels than there are games and can never see a game that is
# both single- and multi-player.
for game_categories in strings:
    has_single = 'Single-player' in game_categories
    # Check *all* multiplayer labels, not only the first one in the list.
    has_multi = any(label in multiplayer_labels for label in game_categories)

    if has_single and has_multi:
        condensed_category.append('Hybrid')
    elif has_multi:
        condensed_category.append('Multiplayer')
    else:
        # No multiplayer label at all: treated as single-player.
        condensed_category.append('Singleplayer')

In [None]:
# Swap the raw 'categories' column for the condensed labels.
# The labels are wrapped in a one-column frame and joined back on the row index.
new_category = pd.DataFrame({'categories': condensed_category})

steam_data = steam_data.drop(columns=['categories']).join(new_category['categories'])
print(steam_data)

In [None]:
# One-hot encode the condensed 'categories' label.
enc = OneHotEncoder(handle_unknown='ignore')
encoded = enc.fit_transform(steam_data[['categories']]).toarray()

# Give the indicator columns descriptive *string* names taken from the fitted
# encoder. Leaving the default integer labels (0, 1, 2) would mix int and str
# column names in the frame, which scikit-learn estimators reject when the frame
# is later used as a feature matrix. The index is passed explicitly so the join
# below is aligned row by row with `steam_data`.
enc_df = pd.DataFrame(
    encoded,
    columns=['category_' + str(label) for label in enc.categories_[0]],
    index=steam_data.index,
)

steam_data = steam_data.drop(columns=['categories']).join(enc_df)

# Preview only; printing the whole frame floods the output.
steam_data.head()

Joining the positive_ratings and negative_ratings columns into a single column, expressed as an approval percentage. Also, I will only keep the games that have at least 100 ratings in total (this threshold can be changed).


Example:
If the game has 70 upvotes and 30 downvotes, the new column will have "70" as its value.

In [None]:
# Minimum number of ratings (positive + negative) a game needs to be kept.
# Change this value to shrink or grow the dataset.
x = 100

pr = steam_data['positive_ratings']
nr = steam_data['negative_ratings']
total_ratings = pr + nr

# Keep only the games with at least `x` ratings, then replace the two vote counts
# with a single approval percentage: positive * 100 / (positive + negative).
# Everything is vectorised and aligned on the row index, so it does not depend on
# the frame having a default 0..n-1 index the way a positional Python loop would.
steam_data = (
    steam_data.loc[total_ratings >= x]
    .assign(
        ratings=lambda df: df['positive_ratings'] * 100
        / (df['positive_ratings'] + df['negative_ratings'])
    )
    .drop(columns=['positive_ratings', 'negative_ratings'])
    .dropna()  # discard any row that still has a missing value
)


I will mark the following categories for the evaluations, based on the previous process:

Horrible: ratings < 40%

Bad: 40% <= ratings < 55%

Median: 55% <= ratings < 70%

Good: 70% <= ratings < 90%

Great: ratings >= 90%

In [None]:
# Turn the approval percentage into the class label used as the models' target.
steam_data = steam_data.reset_index(drop=True)
rt = steam_data['ratings']

# Plain Python floats, one per game, in row order.
float_ratings = [float(score) for score in rt]

# Lower bounds of classes 1..4; a score's class is the number of bounds it reaches:
#   < 40 -> 0, [40, 55) -> 1, [55, 70) -> 2, [70, 90) -> 3, >= 90 -> 4
class_lower_bounds = (40, 55, 70, 90)
rt_temp = [
    sum(score >= bound for bound in class_lower_bounds)
    for score in float_ratings
]

# Replace the percentage column with the class labels.
steam_data = steam_data.drop(columns=['ratings'])
steam_data['ratings'] = rt_temp

In [None]:
# Display the prepared frame; 'ratings' now holds the class label (0-4) used as the target.
steam_data

## splitting the base into testing and training

In [None]:
# Features: everything except the target and the playtime / owner columns.
X = steam_data.drop(columns=['ratings', 'average_playtime', 'median_playtime', 'median_of_owners'])
# scikit-learn rejects feature frames whose column labels mix str and non-str
# types, so make every label a string before fitting anything.
X.columns = X.columns.astype(str)

# Target: the approval class (0-4).
y = steam_data['ratings']

# A fixed seed makes the split -- and therefore every score below -- reproducible
# across kernel restarts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Preview the feature matrix instead of printing all of it.
X.head()

## KNN

In [None]:
# Distance-weighted k-nearest-neighbours classifier (15 neighbours, Euclidean metric).
KNNModel = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance',
    algorithm='auto',
    p=2,
)
KNNModel.fit(X_train, y_train)

# Predictions on both partitions.
predict_train = KNNModel.predict(X_train)
predict_test = KNNModel.predict(X_test)

# Accuracy on both partitions.
accuracy_train = accuracy_score(y_train, predict_train)
nbscore = accuracy_score(y_test, predict_test)

# Report: train predictions and score first, then test predictions and score.
print('Target on train data', predict_train)
print('accuracy_score on train dataset : ', accuracy_train)
print('Target on test data', predict_test)
print('accuracy_score on test dataset : ', nbscore)




In [None]:
# Seaborn theme: colour codes on, fonts scaled up for the large figure.
sb.set(color_codes=True, font_scale=1.4)

plt.figure(1, figsize=(30, 12))
plt.title("Confusion Matrix")

# Confusion matrix of the KNN predictions on the training partition.
knn_train_matrix = confusion_matrix(y_train, predict_train)
train = sb.heatmap(
    knn_train_matrix,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    cbar_kws={'label': 'Scale'},
)
train.set(ylabel="True Label", xlabel="Predicted Label")



## Decision Tree

In [None]:

# Decision tree trained on the training partition.
# random_state pins the tie-breaking between equally good splits, so the fitted
# tree is the same on every run.
dectree = DecisionTreeClassifier(max_depth=7, random_state=42)
dectree.fit(X_train, y_train)

# Predictions of the decision tree on both partitions.
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Score the decision tree's *own* predictions. Scoring `predict_train` /
# `predict_test` here would just repeat the KNN accuracies, because those arrays
# hold the KNN model's predictions.
dectree_accuracy_train = accuracy_score(y_train, y_train_pred)
dectree_accuracy_test = accuracy_score(y_test, y_test_pred)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree_accuracy_train)
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree_accuracy_test)
print()

In [None]:
# Seaborn theme: colour codes on, fonts scaled up for the large figure.
sb.set(color_codes=True, font_scale=1.4)

plt.figure(1, figsize=(30, 12))
plt.title("Confusion Matrix")

# Confusion matrix of the decision tree on the training partition.
# `y_train_pred` holds the decision tree's predictions; `predict_train` belongs to
# the KNN model and would simply redraw the KNN matrix in this section.
dectree_train_matrix = confusion_matrix(y_train, y_train_pred)
train = sb.heatmap(
    dectree_train_matrix,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    cbar_kws={'label': 'Scale'},
)
train.set(ylabel="True Label", xlabel="Predicted Label")
