In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
sb.set()

from ast import literal_eval
from math import sqrt

In [2]:
# raw Steam store export and SteamSpy export, read from the local dataset folder
steam_data = pd.read_csv(filepath_or_buffer = "dataset/steam_app_data.csv")
steamspy_data = pd.read_csv(filepath_or_buffer = "dataset/steamspy_data.csv")

In [3]:
# columns kept from the Steam store export
steam_columns = [
    "type", "name", "steam_appid", "controller_support", "dlc",
    "short_description", "demos", "platforms", "movies", "achievements",
    "release_date", "is_free", "genres",
]
# columns kept from the SteamSpy export
spy_columns = [
    "appid", "developer", "publisher", "positive", "negative", "owners",
    "average_forever", "median_forever", "initialprice", "languages", "tags",
]

# align the key name so both frames can be joined on the Steam app id
spy_data = steamspy_data[spy_columns].rename(columns = {"appid": "steam_appid"})

data = (
    steam_data[steam_columns]
    # keep only apps present in both datasets
    .merge(spy_data, how = "inner", on = "steam_appid")
    # drop rows with a missing type or name, then repeated steam app ids
    .dropna(subset = ["type", "name"])
    .drop_duplicates(subset = ["steam_appid"])
    # the type column is not needed after the filter above
    .drop(columns = ["type"])
)
# drop rows of games that have been deleted (their name is the literal "none")
data = data[data["name"].ne("none")]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29074 entries, 0 to 29232
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                29074 non-null  object 
 1   steam_appid         29074 non-null  int64  
 2   controller_support  5998 non-null   object 
 3   dlc                 4975 non-null   object 
 4   short_description   29050 non-null  object 
 5   demos               2138 non-null   object 
 6   platforms           29074 non-null  object 
 7   movies              27157 non-null  object 
 8   achievements        26842 non-null  object 
 9   release_date        29074 non-null  object 
 10  is_free             29074 non-null  object 
 11  genres              29028 non-null  object 
 12  developer           28960 non-null  object 
 13  publisher           28878 non-null  object 
 14  positive            29074 non-null  int64  
 15  negative            29074 non-null  int64  
 16  owne

In [4]:
# keep only games that list English; rows with no language data count as "not English"
supports_english = data["languages"].str.contains("English", na = False)
data = data[supports_english]

In [5]:
# turn sparse columns into presence flags
variables = ["controller_support", "dlc", "demos", "movies"]

# 1 when the original cell held a value, 0 when it was null
for var in variables:
    data[var] = data[var].notnull().astype(int)

In [6]:
# cleaning price data

print("Number of free games:", len(data[data["is_free"]]))
print("Number of games without price:", len(data[data["initialprice"] == 0]))

# rows flagged as not free that nevertheless carry a zero initial price
unpriced_paid = (data["is_free"] == 0) & (data["initialprice"] == 0)
# remove those rows, then the is_free column itself
data = data.drop(index = data.index[unpriced_paid]).drop(columns = ["is_free"])
# rescale the price by 1/100 (presumably cents -> currency units; confirm against the SteamSpy docs)
data["initialprice"] = data["initialprice"].div(100)

Number of free games: 2715
Number of games without price: 3483


In [7]:
# cleaning platforms data

def get_platforms(cell):
    """Return a ', '-separated string of the platforms flagged truthy in `cell`.

    `cell` is the string form of a dict mapping platform name -> flag; it is
    parsed with ast.literal_eval. Platforms appear in the dict's own order.
    """
    flags = literal_eval(cell)
    supported = [name for name, is_supported in flags.items() if is_supported]
    return ', '.join(supported)

# replace each stringified platform dict with its comma-separated platform list
data["platforms"] = data["platforms"].apply(get_platforms)

In [8]:
# cleaning achievement data

def get_achievements(cell):
    """Return the achievement total stored in `cell`, or 0 when the cell is missing.

    Parameters
    ----------
    cell : str or missing value
        String form of a dict containing a "total" key (parsed with
        ast.literal_eval), or a missing value (NaN / None).

    Returns
    -------
    The value stored under "total", or 0 for a missing cell.
    """
    # pd.isna rather than `is not np.nan`: the identity test only matches the
    # np.nan singleton, so None or any other NaN object would reach
    # literal_eval and raise.
    if pd.isna(cell):
        return 0
    achievements = literal_eval(cell)
    return achievements["total"]

# replace each stringified achievements dict with its total (0 when missing)
data["achievements"] = data["achievements"].apply(get_achievements)

In [9]:
# cleaning release date data

def get_release_date(cell):
    """Return the "date" entry of `cell`, or NaN when "coming_soon" is truthy.

    `cell` is the string form of a dict with "coming_soon" and "date" keys,
    parsed with ast.literal_eval.
    """
    info = literal_eval(cell)
    # unreleased games get NaN so they can be dropped afterwards
    return np.nan if info["coming_soon"] else info["date"]

# parse the release date; get_release_date yields NaN for games flagged as coming soon
data["release_date"] = data["release_date"].apply(get_release_date)

# drop unreleased games (the rows that were just set to NaN)
data = data.dropna(axis = "index", subset = ["release_date"])

In [10]:
# cleaning tags data

def get_tags(cell):
    """Return the tag names in `cell` as a ', '-separated string.

    Parameters
    ----------
    cell : str or missing value
        String form of a container of tags (parsed with ast.literal_eval):
        a dict keyed by tag name, or an empty container such as '[]'.

    Returns
    -------
    str or float
        Tag names joined with ', ' in the container's own order, or NaN when
        the cell is missing or holds no tags.
    """
    # a missing cell cannot be parsed; report it as "no tags"
    if pd.isna(cell):
        return np.nan
    tags_dict = literal_eval(cell)
    # '[]' and '{}' both mean "no tags"; the previous version only recognised
    # the literal '[]' and failed with UnboundLocalError on an empty dict
    if not tags_dict:
        return np.nan
    # iterating a dict yields its keys, so a single join is enough
    return ', '.join(tags_dict)

# replace each stringified tag container with its comma-separated tag names
data["tags"] = data["tags"].apply(get_tags)

In [11]:
# clean genres data

def get_genres(cell):
    """Return the genre descriptions in `cell` as a ', '-separated string.

    Parameters
    ----------
    cell : str or missing value
        String form of a list of dicts, each with a "description" key
        (parsed with ast.literal_eval), or a missing value (NaN / None).

    Returns
    -------
    str or float
        Descriptions joined with ', ' in list order, or NaN when the cell is
        missing or the list is empty.
    """
    # pd.isna rather than `is not np.nan`: the identity test only matches the
    # np.nan singleton and lets None / other NaN objects reach literal_eval
    if pd.isna(cell):
        return np.nan
    genres_list = [entry["description"] for entry in literal_eval(cell)]
    # an empty list used to leave the result unbound (UnboundLocalError);
    # treat it as missing genre data instead
    if not genres_list:
        return np.nan
    return ', '.join(genres_list)
    
# replace each stringified genre list with its comma-separated descriptions
data["genres"] = data["genres"].apply(get_genres)

In [12]:
# games without developer data are removed
data = data.dropna(subset = ["developer"])

# a missing publisher is replaced by the developer of the same row (self-published games?)
data["publisher"] = data["publisher"].fillna(data["developer"])

In [13]:
# total number of recommendations = positive + negative, placed at column position 16
data.insert(16, "recommendations", data["positive"].add(data["negative"]))

In [14]:
# get unique official steam genres
data = data.reset_index(drop = True)
# each genres cell is a ", "-separated string; dropna() skips rows without genre
# data, which avoids both the row-by-row indexing and the `is np.nan` identity test
steam_genres = {
    steam_genre
    for genre_cell in data["genres"].dropna()
    for steam_genre in genre_cell.split(", ")
}
steam_genres

{'Accounting',
 'Action',
 'Adventure',
 'Animation & Modeling',
 'Audio Production',
 'Casual',
 'Design & Illustration',
 'Documentary',
 'Early Access',
 'Education',
 'Free to Play',
 'Game Development',
 'Gore',
 'Indie',
 'Massively Multiplayer',
 'Nudity',
 'Photo Editing',
 'RPG',
 'Racing',
 'Sexual Content',
 'Simulation',
 'Software Training',
 'Sports',
 'Strategy',
 'Tutorial',
 'Utilities',
 'Video Production',
 'Violent',
 'Web Publishing'}

In [15]:
# getting missing genre data

# drop games without both tags and genres data
data = data.dropna(how = 'all', subset = ["tags", "genres"])

# fill in missing genre of game using its tags which are official steam genres
# (rows reaching this loop have tags, because rows missing both were dropped above)
for idx in data.index[data["genres"].isnull()]:
    tag_list = data.at[idx, "tags"].split(', ')
    # keep the tags' own order: joining a set intersection would follow set
    # iteration order, which changes between kernel runs (str hash randomization)
    # and made the exported genres column non-reproducible
    matched = [tag for tag in tag_list if tag in steam_genres]
    if matched:
        data.at[idx, "genres"] = ', '.join(matched)

# drop games where genres cannot be filled
data = data.dropna(subset = ["genres"])

In [16]:
# TODO: maybe remove the intersection of tags/genres instead, so tags become "additional tags"
# where tags are missing, fall back to the genres of the same row
has_tags = data["tags"].notnull()
data["tags"] = data["tags"].where(has_tags, data["genres"])

In [17]:
# exclude games whose genres mention "Nudity" or "Sexual Content"
# (na = True keeps the original outcome for a missing genres cell: the row is excluded)
flagged = data["genres"].str.contains("Nudity", na = True) | data["genres"].str.contains("Sexual Content", na = True)
data = data[~flagged]

In [18]:
# use the Wilson score as a better estimate of the positive:negative rating
# the Wilson confidence interval treats the votes as a binomial sample
def wilson_score(positive, negative, z):
    """Return the lower bound of the Wilson score interval for a vote tally.

    positive, negative -- counts of positive and negative votes
    z                  -- z-score of the desired confidence level

    Returns 0 when there are no votes at all.
    """
    n = positive + negative
    if n != 0:
        # observed share of positive votes
        p = float(positive) / n

        centre = p + 1/(2*n)*z*z
        spread = z*sqrt(p*(1-p)/n + z*z/(4*n*n))
        scale = 1+1/n*z*z

        # centre minus spread gives the lower end of the interval
        return (centre - spread) / scale

    return 0

z = 1.96 # 95% confidence interval => z-score = 1.96
# rating = Wilson score scaled to 0-100 and rounded to one decimal place;
# pass z itself so the confidence level is defined in exactly one place
data["rating"] = data.apply(lambda row: round(wilson_score(row.positive, row.negative, z)*100, 1), axis = 1)

In [19]:
# keep only games with at least one recommendation, then discard the raw
# vote columns and renumber the rows from 0
data = (
    data.loc[data["recommendations"].ne(0)]
    .drop(columns = ["positive", "negative", "recommendations"])
    .reset_index(drop = True)
)

In [20]:
# display the cleaned frame (the notebook renders a truncated view of the first and last rows)
data

Unnamed: 0,name,steam_appid,controller_support,dlc,short_description,demos,platforms,movies,achievements,release_date,genres,developer,publisher,owners,average_forever,median_forever,initialprice,languages,tags,rating
0,Counter-Strike,10,0,0,Play the world's number 1 online action game. ...,0,"windows, mac, linux",0,0,"1 Nov, 2000",Action,Valve,Valve,"10,000,000 .. 20,000,000",17612,317,9.99,"English, French, German, Italian, Spanish - Sp...","Action, FPS, Multiplayer, Shooter, Classic, Te...",97.3
1,Team Fortress Classic,20,0,0,One of the most popular online action games of...,0,"windows, mac, linux",0,0,"1 Apr, 1999",Action,Valve,Valve,"5,000,000 .. 10,000,000",277,62,4.99,"English, French, German, Italian, Spanish - Sp...","Action, FPS, Multiplayer, Classic, Shooter, Cl...",82.8
2,Day of Defeat,30,0,0,Enlist in an intense brand of Axis vs. Allied ...,0,"windows, mac, linux",0,0,"1 May, 2003",Action,Valve,Valve,"5,000,000 .. 10,000,000",187,34,4.99,"English, French, German, Italian, Spanish - Spain","FPS, World War II, Multiplayer, Action, Shoote...",88.6
3,Deathmatch Classic,40,0,0,Enjoy fast-paced multiplayer gaming with Death...,0,"windows, mac, linux",0,0,"1 Jun, 2001",Action,Valve,Valve,"5,000,000 .. 10,000,000",258,184,4.99,"English, French, German, Italian, Spanish - Sp...","Action, FPS, Multiplayer, Classic, Shooter, Fi...",80.7
4,Half-Life: Opposing Force,50,0,0,Return to the Black Mesa Research Facility as ...,0,"windows, mac, linux",0,0,"1 Nov, 1999",Action,Gearbox Software,Valve,"5,000,000 .. 10,000,000",624,415,4.99,"English, French, German, Korean","FPS, Action, Sci-fi, Singleplayer, Classic, Sh...",94.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27188,Room of Pandora,1065230,0,0,The Room of Pandora is a third-person interact...,0,windows,1,7,"24 Apr, 2019","Adventure, Casual, Indie",SHEN JIAWEI,SHEN JIAWEI,"0 .. 20,000",0,0,2.99,"English, Japanese, Simplified Chinese, Traditi...","Adventure, Indie, Casual, Puzzle",43.8
27189,Cyber Gun,1065570,0,0,Cyber Gun is a hardcore first-person shooter w...,0,windows,1,0,"23 Apr, 2019","Action, Adventure, Indie",Semyon Maximov,BekkerDev Studio,"0 .. 20,000",0,0,1.99,English,"Action, Indie, Adventure, FPS, 3D Platformer, ...",56.5
27190,Super Star Blast,1065650,1,0,Super Star Blast is a space based game with ch...,0,windows,1,24,"24 Apr, 2019","Action, Casual, Indie",EntwicklerX,EntwicklerX,"0 .. 20,000",0,0,4.99,English,"Action, Indie, Casual",0.0
27191,New Yankee 7: Deer Hunters,1066700,0,0,Pursue a snow-white deer through an enchanted ...,0,"windows, mac",1,0,"17 Apr, 2019","Adventure, Casual, Indie",Yustas Game Studio,Alawar Entertainment,"0 .. 20,000",0,0,6.99,"English, German, Korean, Russian","Indie, Casual, Adventure",34.2


In [21]:
# write the cleaned dataset next to the notebook, without the row index
data.to_csv(path_or_buf = "steamdata_clean.csv", index = False)