# Purpose of this file
Construct a new, clean csv file based on games_march2025_cleaned.csv

Might use web scrapping to fill more info if needed (hopefully not, I'm lazy)

### Needed labels
AppID, Title, Release date, Price, Supported languages, Categories / Genres / Tags, Steam reviews (ratio), Metacritic score, Developer, Publisher, Description (later), Peak CCU

In [22]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from datetime import datetime

BASE_DIR = Path().resolve().parent
RAW_DATA_PATH = os.path.join(BASE_DIR, 'data', 'raw')
PROCESSED_DATA_PATH = os.path.join(BASE_DIR, 'data', 'processed')

df = pd.read_csv(os.path.join(RAW_DATA_PATH, "games_march2025_cleaned.csv"), parse_dates=['release_date'])
print(f'Shape: {df.shape}')

Shape: (89618, 47)


In [23]:
print(f'First 5 entries:\n')
df.head(5)

First 5 entries:



Unnamed: 0,appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,...,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
0,730,Counter-Strike 2,2012-08-21,0,0.0,1,"For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...",,...,879,5174,350,0,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,82,96473
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0,0.0,0,"LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...","LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...",Play PUBG: BATTLEGROUNDS for free. Land on str...,,...,0,0,0,0,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,68,16720
2,570,Dota 2,2013-07-09,0,0.0,2,"The most-played game on Steam. Every day, mill...","The most-played game on Steam. Every day, mill...","Every day, millions of players worldwide enter...",“A modern multiplayer masterpiece.” 9.5/10 – D...,...,1536,898,892,0,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,80,29366
3,271590,Grand Theft Auto V Legacy,2015-04-13,17,0.0,0,"When a young street hustler, a retired bank ro...","When a young street hustler, a retired bank ro...",Grand Theft Auto V for PC offers players the o...,,...,771,7101,74,0,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,92,17517
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,17,3.99,9,Edition Comparison Ultimate Edition The Tom Cl...,“One of the best first-person shooters ever ma...,"Tom Clancy's Rainbow Six® Siege is an elite, t...",,...,682,2434,306,80,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,76,12608


In [24]:
df.columns

Index(['appid', 'name', 'release_date', 'required_age', 'price', 'dlc_count',
       'detailed_description', 'about_the_game', 'short_description',
       'reviews', 'header_image', 'website', 'support_url', 'support_email',
       'windows', 'mac', 'linux', 'metacritic_score', 'metacritic_url',
       'achievements', 'recommendations', 'notes', 'supported_languages',
       'full_audio_languages', 'packages', 'developers', 'publishers',
       'categories', 'genres', 'screenshots', 'movies', 'user_score',
       'score_rank', 'positive', 'negative', 'estimated_owners',
       'average_playtime_forever', 'average_playtime_2weeks',
       'median_playtime_forever', 'median_playtime_2weeks', 'discount',
       'peak_ccu', 'tags', 'pct_pos_total', 'num_reviews_total',
       'pct_pos_recent', 'num_reviews_recent'],
      dtype='object')

In [25]:
df.head(5)['metacritic_score']
print(df[df['metacritic_score'] == 0].shape[0])

86071


A lot of entries is missing metacritic scores

In [26]:
df = df.drop_duplicates(subset=['name', 'release_date'])
print(f'Shape: {df.shape}')

Shape: (89586, 47)


In [27]:
df.columns

Index(['appid', 'name', 'release_date', 'required_age', 'price', 'dlc_count',
       'detailed_description', 'about_the_game', 'short_description',
       'reviews', 'header_image', 'website', 'support_url', 'support_email',
       'windows', 'mac', 'linux', 'metacritic_score', 'metacritic_url',
       'achievements', 'recommendations', 'notes', 'supported_languages',
       'full_audio_languages', 'packages', 'developers', 'publishers',
       'categories', 'genres', 'screenshots', 'movies', 'user_score',
       'score_rank', 'positive', 'negative', 'estimated_owners',
       'average_playtime_forever', 'average_playtime_2weeks',
       'median_playtime_forever', 'median_playtime_2weeks', 'discount',
       'peak_ccu', 'tags', 'pct_pos_total', 'num_reviews_total',
       'pct_pos_recent', 'num_reviews_recent'],
      dtype='object')

## Why drop these columns?

### Kinda pointless:
required_age, dlc_count, header_image, website, support_url, support_email, metacritic_url, achievements, notes, packages, screenshots, movies
### We don't handle text descriptions at this stage (maybe consider them once we completed the base structure):
detailed_description, about_the_game, short_description, reviews
### Too many empty values:
metacritic_score, user_score, score_rank, average_playtime_forever, median_playtime_forever
### We don't really need recent/current data bc how long the game lasts isn't in our consideration:
average_playtime_2weeks, median_playtime_2weeks, 'discount', 'pct_pos_recent', 'num_reviews_recent'

In [28]:
columns_to_drop = [
    'required_age', 'dlc_count', 'detailed_description', 'about_the_game', 'short_description', 'reviews',
    'header_image', 'website', 'support_url', 'support_email', 'metacritic_score', 'metacritic_url',
    'achievements', 'notes', 'packages', 'screenshots', 'movies', 'user_score', 'score_rank',
    'average_playtime_2weeks', 'median_playtime_2weeks', 'discount',
    'pct_pos_recent', 'num_reviews_recent', 'average_playtime_forever', 'median_playtime_forever'
]
df_reduced = df.drop(columns=columns_to_drop)
df_reduced

Unnamed: 0,appid,name,release_date,price,windows,mac,linux,recommendations,supported_languages,full_audio_languages,...,publishers,categories,genres,positive,negative,estimated_owners,peak_ccu,tags,pct_pos_total,num_reviews_total
0,730,Counter-Strike 2,2012-08-21,0.00,True,False,True,4401572,"['Czech', 'Danish', 'Dutch', 'English', 'Finni...","['English', 'Indonesian']",...,['Valve'],"['Multi-player', 'Cross-Platform Multiplayer',...","['Action', 'Free To Play']",7480813,1135108,100000000 - 200000000,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0.00,True,False,False,1732007,"['English', 'Korean', 'Simplified Chinese', 'F...",[],...,"['KRAFTON, Inc.']","['Multi-player', 'PvP', 'Online PvP', 'Stats',...","['Action', 'Adventure', 'Massively Multiplayer...",1487960,1024436,50000000 - 100000000,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842
2,570,Dota 2,2013-07-09,0.00,True,True,True,14337,"['Bulgarian', 'Czech', 'Danish', 'Dutch', 'Eng...","['English', 'Korean', 'Simplified Chinese', 'V...",...,['Valve'],"['Multi-player', 'Co-op', 'Steam Trading Cards...","['Action', 'Strategy', 'Free To Play']",1998462,451338,200000000 - 500000000,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595
3,271590,Grand Theft Auto V Legacy,2015-04-13,0.00,True,False,False,1803063,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'Spanish - Latin America']",...,['Rockstar Games'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Action', 'Adventure']",1719950,250012,50000000 - 100000000,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,3.99,True,False,False,1165929,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'French', 'Italian', 'German', 'Sp...",...,['Ubisoft'],"['Single-player', 'Multi-player', 'PvP', 'Onli...",['Action'],1152763,218446,20000000 - 50000000,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89613,2115110,Outrun Them,2024-02-29,3.99,True,False,False,0,['English'],[],...,['TheBean'],"['Single-player', 'Steam Achievements', 'Stats...","['Indie', 'Early Access']",2,0,0 - 20000,0,"{'2D Platformer': 91, 'Runner': 85, 'Platforme...",-1,-1
89614,1174200,Lands of Pharaoh: Episode 1,2020-01-07,10.00,True,False,False,0,"['English', 'Turkish']","['English', 'Turkish']",...,['Orion Asistan Hizmetleri A.S.'],"['Single-player', 'Family Sharing']","['Action', 'Adventure', 'Indie', 'Strategy']",2,0,0 - 20000,0,"{'Action': 43, 'Adventure': 41, 'Indie': 41, '...",-1,-1
89615,1160190,Player One,2019-10-10,1.99,True,False,False,0,['English'],[],...,['Prime2Gold'],"['Single-player', 'Family Sharing']","['Indie', 'Early Access']",2,1,0 - 20000,0,"{'Indie': 31, 'Early Access': 21}",-1,-1
89616,3380340,DragonRoad,2025-01-25,29.99,True,False,False,0,"['English', 'Simplified Chinese', 'French', 'G...","['English', 'Italian']",...,['XuJie'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Adventure', 'RPG', 'Simulation', 'Strategy']",2,0,0 - 20000,0,"{'RPG': 117, 'Action-Adventure': 111, 'Strateg...",-1,-1


In [29]:
df_reduced.iloc[30000]

appid                                                              870800
name                                                             Kilcount
release_date                                          2018-06-15 00:00:00
price                                                                 0.0
windows                                                              True
mac                                                                 False
linux                                                               False
recommendations                                                         0
supported_languages                                           ['English']
full_audio_languages                                                   []
developers                                                   ['Beatrice']
publishers                                                   ['Beatrice']
categories                        ['Single-player', 'Steam Achievements']
genres                                

We can see that some data is incomplete, so we have to drop them.

In [30]:
drop_condition = (df_reduced['peak_ccu'] == 0) | ((df_reduced['positive']==0) & (df_reduced['negative']==0))
df_retained = df_reduced[~drop_condition]
df_retained.shape

(17662, 21)

Besides the amount of positive & negative reviews, we also need to calculate user rating percentage.

In [31]:
df_retained['user_rating'] = df_retained['positive'] / (df_retained['positive'] + df_retained['negative']) * 100
df_retained.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_retained['user_rating'] = df_retained['positive'] / (df_retained['positive'] + df_retained['negative']) * 100


Unnamed: 0,appid,name,release_date,price,windows,mac,linux,recommendations,supported_languages,full_audio_languages,...,categories,genres,positive,negative,estimated_owners,peak_ccu,tags,pct_pos_total,num_reviews_total,user_rating
0,730,Counter-Strike 2,2012-08-21,0.0,True,False,True,4401572,"['Czech', 'Danish', 'Dutch', 'English', 'Finni...","['English', 'Indonesian']",...,"['Multi-player', 'Cross-Platform Multiplayer',...","['Action', 'Free To Play']",7480813,1135108,100000000 - 200000000,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,86.82546
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0.0,True,False,False,1732007,"['English', 'Korean', 'Simplified Chinese', 'F...",[],...,"['Multi-player', 'PvP', 'Online PvP', 'Stats',...","['Action', 'Adventure', 'Massively Multiplayer...",1487960,1024436,50000000 - 100000000,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,59.22474
2,570,Dota 2,2013-07-09,0.0,True,True,True,14337,"['Bulgarian', 'Czech', 'Danish', 'Dutch', 'Eng...","['English', 'Korean', 'Simplified Chinese', 'V...",...,"['Multi-player', 'Co-op', 'Steam Trading Cards...","['Action', 'Strategy', 'Free To Play']",1998462,451338,200000000 - 500000000,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,81.576537
3,271590,Grand Theft Auto V Legacy,2015-04-13,0.0,True,False,False,1803063,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'Spanish - Latin America']",...,"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Action', 'Adventure']",1719950,250012,50000000 - 100000000,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,87.308791
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,3.99,True,False,False,1165929,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'French', 'Italian', 'German', 'Sp...",...,"['Single-player', 'Multi-player', 'PvP', 'Onli...",['Action'],1152763,218446,20000000 - 50000000,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,84.069095


# Defining the games' popularity level

I use 'estimated_owners' and 'peak_ccu' to generate a score in order to rate the game's popularity.

(There's definitely a more clever way to do this but my primary goal is to make it work first)

In [32]:
# Check how many potential values for 'estimated_owners'
df_retained['estimated_owners'].unique()

array(['100000000 - 200000000', '50000000 - 100000000',
       '200000000 - 500000000', '20000000 - 50000000',
       '10000000 - 20000000', '5000000 - 10000000', '2000000 - 5000000',
       '0 - 20000', '100000 - 200000', '1000000 - 2000000',
       '500000 - 1000000', '200000 - 500000', '20000 - 50000',
       '50000 - 100000'], dtype=object)

In [33]:
owners_score_map = {
    '0 - 20000': -3,
    '20000 - 50000': -1,
    '50000 - 100000': 1,
    '100000 - 200000': 3,
    '200000 - 500000': 5,
    '500000 - 1000000': 7,
    '1000000 - 2000000': 9,
    '2000000 - 5000000': 11,
    '5000000 - 10000000': 13,
    '10000000 - 20000000': 17,
    '20000000 - 50000000': 19,
    '50000000 - 100000000': 21,
}

df_retained['owners_score'] = df_retained['estimated_owners'].map(owners_score_map).fillna(23)

ccu_bins = [
    df_retained['peak_ccu'] < 1000,
    (df_retained['peak_ccu'] >= 1000) & (df_retained['peak_ccu'] < 2500),
    (df_retained['peak_ccu'] >= 2500) & (df_retained['peak_ccu'] < 5000),
    (df_retained['peak_ccu'] >= 5000) & (df_retained['peak_ccu'] < 10000),
    (df_retained['peak_ccu'] >= 10000) & (df_retained['peak_ccu'] < 50000),
    (df_retained['peak_ccu'] >= 50000) & (df_retained['peak_ccu'] < 100000),
    (df_retained['peak_ccu'] >= 100000) & (df_retained['peak_ccu'] < 200000),
    (df_retained['peak_ccu'] >= 200000) & (df_retained['peak_ccu'] < 500000),
]

ccu_scores = [-3, -1, 1, 3, 5, 7, 11, 13]
df_retained['ccu_score'] = np.select(ccu_bins, ccu_scores, default=15)

df_retained['total_score'] = df_retained['owners_score'] + df_retained['ccu_score']

rating_bins = [
    df_retained['total_score'] <= 0,
    (df_retained['total_score'] > 0) & (df_retained['total_score'] <= 5),
    (df_retained['total_score'] > 5) & (df_retained['total_score'] <= 17),
    (df_retained['total_score'] > 17) & (df_retained['total_score'] <= 27)
]

ratings = ["FAILED", "NICHE", "AVERAGE", "POPULAR"]

df_retained['popularity'] = np.select(rating_bins, ratings, default="PHENOMENAL")

columns_to_drop = [
    'estimated_owners', 'peak_ccu', 'owners_score', 'ccu_score', 'total_score'
]
df_rated = df_retained.drop(columns=columns_to_drop)
df_rated.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_retained['owners_score'] = df_retained['estimated_owners'].map(owners_score_map).fillna(23)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_retained['ccu_score'] = np.select(ccu_bins, ccu_scores, default=15)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_retained['total_score'] = df_retaine

Unnamed: 0,appid,name,release_date,price,windows,mac,linux,recommendations,supported_languages,full_audio_languages,...,publishers,categories,genres,positive,negative,tags,pct_pos_total,num_reviews_total,user_rating,popularity
0,730,Counter-Strike 2,2012-08-21,0.0,True,False,True,4401572,"['Czech', 'Danish', 'Dutch', 'English', 'Finni...","['English', 'Indonesian']",...,['Valve'],"['Multi-player', 'Cross-Platform Multiplayer',...","['Action', 'Free To Play']",7480813,1135108,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,86.82546,PHENOMENAL
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0.0,True,False,False,1732007,"['English', 'Korean', 'Simplified Chinese', 'F...",[],...,"['KRAFTON, Inc.']","['Multi-player', 'PvP', 'Online PvP', 'Stats',...","['Action', 'Adventure', 'Massively Multiplayer...",1487960,1024436,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,59.22474,PHENOMENAL
2,570,Dota 2,2013-07-09,0.0,True,True,True,14337,"['Bulgarian', 'Czech', 'Danish', 'Dutch', 'Eng...","['English', 'Korean', 'Simplified Chinese', 'V...",...,['Valve'],"['Multi-player', 'Co-op', 'Steam Trading Cards...","['Action', 'Strategy', 'Free To Play']",1998462,451338,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,81.576537,PHENOMENAL
3,271590,Grand Theft Auto V Legacy,2015-04-13,0.0,True,False,False,1803063,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'Spanish - Latin America']",...,['Rockstar Games'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Action', 'Adventure']",1719950,250012,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,87.308791,PHENOMENAL
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,3.99,True,False,False,1165929,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'French', 'Italian', 'German', 'Sp...",...,['Ubisoft'],"['Single-player', 'Multi-player', 'PvP', 'Onli...",['Action'],1152763,218446,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,84.069095,POPULAR
5,440,Team Fortress 2,2007-10-10,0.0,True,False,True,41587,"['English', 'Danish', 'Dutch', 'Finnish', 'Fre...","['English', 'Spanish - Latin America']",...,['Valve'],"['Multi-player', 'Cross-Platform Multiplayer',...","['Action', 'Free To Play']",1025633,120619,"{'Free to Play': 62868, 'Hero Shooter': 61020,...",89,1146642,89.477096,POPULAR
6,105600,Terraria,2011-05-16,9.99,True,True,True,1098792,"['English', 'French', 'Italian', 'German', 'Sp...",[],...,['Re-Logic'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Action', 'Adventure', 'Indie', 'RPG']",1344773,34460,"{'Open World Survival Craft': 16365, 'Sandbox'...",97,1102434,97.50151,POPULAR
7,252490,Rust,2018-02-08,39.99,True,True,False,992825,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'French', 'Italian', 'German', 'Sp...",...,['Facepunch Studios'],"['Multi-player', 'MMO', 'PvP', 'Online PvP', '...","['Action', 'Adventure', 'Indie', 'Massively Mu...",1043708,152272,"{'Survival': 18592, 'Crafting': 11822, 'Multip...",87,993856,87.268015,PHENOMENAL
8,4000,Garry's Mod,2006-11-29,5.99,True,True,True,984713,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'Ukrainian']",...,['Valve'],"['Single-player', 'Multi-player', 'PvP', 'Onli...","['Casual', 'Indie', 'Simulation']",1106689,36727,"{'Sandbox': 18706, 'Moddable': 14479, 'Multipl...",96,985010,96.787958,POPULAR
9,1172470,Apex Legends™,2020-11-04,0.0,True,False,False,1548,"['English', 'French', 'Italian', 'German', 'Sp...","['English', 'French', 'Italian', 'German', 'Sp...",...,['Electronic Arts'],"['Multi-player', 'PvP', 'Online PvP', 'Co-op',...","['Action', 'Adventure', 'Free To Play']",660150,322363,"{'Free to Play': 2170, 'Battle Royale': 1483, ...",67,983230,67.189951,PHENOMENAL


In [34]:
# The code above achieves similar effect as the following function (scoring has been adjusted):
'''
def rate_popularity(owners, peak_ccu):
    score = 0

    if owners = '0 - 20000': score -= 3
    elif owners = '20000 - 50000': score -= 1
    elif owners = '50000 - 100000': score += 1
    elif owners = '100000 - 200000': score += 3
    elif owners = '200000 - 500000': score += 5
    elif owners = '500000 - 1000000': score += 7
    elif owners = '1000000 - 2000000': score += 9
    elif owners = '2000000 - 5000000': score += 11
    elif owners = '5000000 - 10000000': score += 13
    elif owners = '10000000 - 20000000': score += 15
    elif owners = '20000000 - 50000000': score += 17
    elif owners = '50000000 - 100000000': score += 19
    else: score += 20

    if peak_ccu < 1000: score -= 3
    elif peak_ccu < 2500: score -= 1
    elif peak_ccu < 5000: score += 2
    elif peak_ccu < 10000: score += 4
    elif peak_ccu < 50000: score += 7
    elif peak_ccu < 100000: score += 10
    elif peak_ccu < 200000: score += 13
    elif peak_ccu < 500000: score += 17
    else: score += 20

    if score <= 0: return "FAILURE"     # 拉完了
    elif score <= 5: return "NICHE"     # NPC
    elif score <= 15: return "AVERAGE"  # 人上人
    elif score <= 30: return "POPULAR"  # 頂級
    else: return "PHENOMENAL"           # 夯
'''

'\ndef rate_popularity(owners, peak_ccu):\n    score = 0\n\n    if owners = \'0 - 20000\': score -= 3\n    elif owners = \'20000 - 50000\': score -= 1\n    elif owners = \'50000 - 100000\': score += 1\n    elif owners = \'100000 - 200000\': score += 3\n    elif owners = \'200000 - 500000\': score += 5\n    elif owners = \'500000 - 1000000\': score += 7\n    elif owners = \'1000000 - 2000000\': score += 9\n    elif owners = \'2000000 - 5000000\': score += 11\n    elif owners = \'5000000 - 10000000\': score += 13\n    elif owners = \'10000000 - 20000000\': score += 15\n    elif owners = \'20000000 - 50000000\': score += 17\n    elif owners = \'50000000 - 100000000\': score += 19\n    else: score += 20\n\n    if peak_ccu < 1000: score -= 3\n    elif peak_ccu < 2500: score -= 1\n    elif peak_ccu < 5000: score += 2\n    elif peak_ccu < 10000: score += 4\n    elif peak_ccu < 50000: score += 7\n    elif peak_ccu < 100000: score += 10\n    elif peak_ccu < 200000: score += 13\n    elif peak_cc

In [35]:
df_rated.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17662 entries, 0 to 89600
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   appid                 17662 non-null  int64         
 1   name                  17662 non-null  object        
 2   release_date          17662 non-null  datetime64[ns]
 3   price                 17662 non-null  float64       
 4   windows               17662 non-null  bool          
 5   mac                   17662 non-null  bool          
 6   linux                 17662 non-null  bool          
 7   recommendations       17662 non-null  int64         
 8   supported_languages   17662 non-null  object        
 9   full_audio_languages  17662 non-null  object        
 10  developers            17662 non-null  object        
 11  publishers            17662 non-null  object        
 12  categories            17662 non-null  object        
 13  genres               

Currently, I have not yet consider data from the supplementary dataset to be a feature of the main dataset.

In [36]:
save_path = os.path.join(PROCESSED_DATA_PATH, 'training_data.csv')
df_rated.to_csv(save_path, index=False)