In [318]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()

from ast import literal_eval
from math import sqrt

In [319]:
steam_data = pd.read_csv("dataset/steam_app_data.csv")
steamspy_data = pd.read_csv("dataset/steamspy_data.csv")

In [320]:
# get relevant columns from datasets
data = steam_data[["type", "name", "steam_appid", "controller_support", "dlc", "short_description", "demos", "platforms", "movies", "achievements", "release_date", "is_free", "genres"]]
spy_data = steamspy_data[["appid", "developer", "publisher", "positive", "negative", "owners", "average_forever", "median_forever", "initialprice", "languages", "tags"]]

spy_data = spy_data.rename(columns={"appid":"steam_appid"})
# merge datasets based on steam appid
data = data.merge(spy_data, how = "inner", on = "steam_appid")

# drop rows which are not games or with no name, duplicates (repeated steam appid)
data = data.dropna(subset = ["type", "name"]).drop_duplicates(subset = ["steam_appid"])
data = data.drop(["type"], axis = 1)
# drop rows of games that have been deleted
data = data[data["name"] != "none"]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29074 entries, 0 to 29232
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                29074 non-null  object 
 1   steam_appid         29074 non-null  int64  
 2   controller_support  5998 non-null   object 
 3   dlc                 4975 non-null   object 
 4   short_description   29050 non-null  object 
 5   demos               2138 non-null   object 
 6   platforms           29074 non-null  object 
 7   movies              27157 non-null  object 
 8   achievements        26842 non-null  object 
 9   release_date        29074 non-null  object 
 10  is_free             29074 non-null  object 
 11  genres              29028 non-null  object 
 12  developer           28960 non-null  object 
 13  publisher           28878 non-null  object 
 14  positive            29074 non-null  int64  
 15  negative            29074 non-null  int64  
 16  owne

In [321]:
# drop rows without english language
data = data.dropna(subset = ["languages"])
data = data[data["languages"].str.contains("English")]

In [322]:
# filling null values
variables = ["controller_support", "dlc", "demos", "movies"]

# 1 for exists, 0 for does not exist
for var in variables:
    data[var] = np.where(data[var].isnull(), 0, 1)

In [323]:
# cleaning price data

print("Number of free games:", data[data["is_free"]].shape[0])
print("Number of games without price:", data[data["initialprice"] == 0].shape[0])

# drop games which are not free but still have no price data
data = data.drop(data[(data["is_free"] == 0) & (data["initialprice"] == 0)].index)
data = data.drop(["is_free"], axis = 1)
data["initialprice"] = data["initialprice"] / 100

Number of free games: 2715
Number of games without price: 3483


In [324]:
# cleaning platforms data

def get_platforms(cell):
    # evaluate data stored as string as dictionary using ast.literal_eval
    platforms = literal_eval(cell)
    # generate comma separated list of supported platforms if platform is True
    return ', '.join(platform for platform in platforms.keys() if platforms[platform])

data["platforms"] = data["platforms"].copy().apply(get_platforms)

In [325]:
# cleaning achievement data

def get_achievements(cell):
    if cell is not np.nan:
        achievements = literal_eval(cell)
        return achievements["total"]
    else:
        return 0

data["achievements"] = data["achievements"].copy().apply(get_achievements)

In [326]:
# cleaning release date data

def get_release_date(cell):
    release_date = literal_eval(cell)
    if release_date["coming_soon"]:
        return np.nan
    elif release_date["date"] == '':
        return np.nan
    else: 
        return release_date["date"]

data["release_date"] = data["release_date"].copy().apply(get_release_date)

# drop unreleased games or games with no date
data = data.dropna(subset = ["release_date"])

In [327]:
import re

# convert release date to date time format
def format_date(cell):
    if re.search(r'[\d]{1,2} [A-Za-z]{3}, [\d]{4}', cell):
        # day mon, year => day mon year
        return cell.replace(',', '')
    elif re.search(r'[A-Za-z]{3} [\d]{4}', cell):
        # add 1st day of month if only month, year is given
        return '1 ' + cell

data["release_date"] = pd.to_datetime(data["release_date"].apply(format_date), format = "%d %b %Y", errors = "raise")

In [328]:
# cleaning tags data

def get_tags(cell):
    if cell == '[]':
        return np.nan
    else:
        tags_dict = literal_eval(cell)
        for tag in tags_dict.keys():
            tags = ', '.join(tags_dict)
        return tags

data["tags"] = data["tags"].copy().apply(get_tags)

In [329]:
# clean genres data

def get_genres(cell):
    if cell is not np.nan:
        genres_dictlist = literal_eval(cell)
        genres_list = []
        for genres_dict in genres_dictlist:
            genres_list.append(genres_dict["description"])
        for genre in genres_list:
            genres = ', '.join(genres_list)
        return genres
    else:
        return np.nan
    
data["genres"] = data["genres"].copy().apply(get_genres)

In [330]:
# drop games with no developer data
data = data.dropna(subset = ["developer"])

# fill in missing publisher with developer name (self-published games?)
for i in data[data["publisher"].isnull()].index:
    data.loc[i, "publisher"] = data["developer"][i]

In [331]:
# add column for total number of positive + negative recommendations
data.insert(loc = 16, column = "recommendations", value = data["positive"] + data["negative"])

In [332]:
# TO OPTIMIZE: iterating through dataframe :')
# get unique official steam genres
data = data.reset_index(drop = True)
steam_genres = set()
for i in range(data.shape[0]):
    if data["genres"][i] is np.nan:
        continue
    all_genres = data["genres"][i].split(", ")
    for steam_genre in all_genres:
        steam_genres.add(steam_genre)
steam_genres

{'Accounting',
 'Action',
 'Adventure',
 'Animation & Modeling',
 'Audio Production',
 'Casual',
 'Design & Illustration',
 'Documentary',
 'Early Access',
 'Education',
 'Free to Play',
 'Game Development',
 'Gore',
 'Indie',
 'Massively Multiplayer',
 'Nudity',
 'Photo Editing',
 'RPG',
 'Racing',
 'Sexual Content',
 'Simulation',
 'Software Training',
 'Sports',
 'Strategy',
 'Tutorial',
 'Utilities',
 'Video Production',
 'Violent',
 'Web Publishing'}

In [333]:
# remove very very dirty data
data = data[(data["genres"].str.contains("Nudity") == False) & (data["genres"].str.contains("Sexual Content") == False) & (data["tags"].str.contains("Nudity") == False) & (data["tags"].str.contains("Sexual Content") == False)]

In [334]:
# drop games without genres data 
data = data.dropna(subset = ["genres"])
data = data.reset_index(drop = True)

In [335]:
# TO OPTIMIZE

# remove steam genres from tags, tags=>additional tags
for i in range(data.shape[0]):
    if data["tags"][i] is not np.nan:
        non_steam_tags = list(set(data["tags"][i].split(sep = ', ')) - set(steam_genres))
        if non_steam_tags:
            data.loc[i, "tags"] = ', '.join(non_steam_tags)
        else:
            data.loc[i, "tags"] = np.nan
data.rename(columns = {"tags" : "additional_tags"}, inplace = True)

In [336]:
# use wilson score as an a better estimate of positive:negative rating
# Wilson Confidence Interval considers binomial distribution for score calculation
def wilson_score(positive, negative, z):
    n = positive + negative

    if n == 0:
        return 0

    p = float(positive) / n

    left = p + 1/(2*n)*z*z
    right = z*sqrt(p*(1-p)/n + z*z/(4*n*n))
    under = 1+1/n*z*z

    return (left - right) / under

z = 1.96 # 95% confidence interval => z-score = 1.96
data["rating"] = data.apply(lambda row: round(wilson_score(row.positive, row.negative, 1.96)*100, 1), axis = 1)

In [337]:
# drop rows with no ratings
data = data[data["recommendations"] != 0]
data = data.drop(["positive", "negative", "recommendations"], axis = 1)
data = data.reset_index(drop = True)

In [342]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26040 entries, 0 to 26039
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   name                26040 non-null  object        
 1   steam_appid         26040 non-null  int64         
 2   controller_support  26040 non-null  int32         
 3   dlc                 26040 non-null  int32         
 4   short_description   26026 non-null  object        
 5   demos               26040 non-null  int32         
 6   platforms           26040 non-null  object        
 7   movies              26040 non-null  int32         
 8   achievements        26040 non-null  int64         
 9   release_date        26039 non-null  datetime64[ns]
 10  genres              26040 non-null  object        
 11  developer           26040 non-null  object        
 12  publisher           26040 non-null  object        
 13  owners              26040 non-null  object    

In [343]:
# export clean dataset
data.to_csv("steamdata_clean.csv", index = False)