Goal: Clean the three datasets scraped from [this website](https://howlongtobeat.com). The scraping script collected playtime information for about 40,000 games across different platforms, along with information about how many people submitted data for each game.

In [63]:
import json
import pandas as pd
import numpy as np

In [64]:
# Load the scraped playtime records into a DataFrame. The path is relative, so
# 'time.json' must be in the notebook's working directory.
game_times = pd.read_json('time.json')

View the columns and entries of each dataset.

In [65]:
# Preview the raw frame (pandas abbreviates the display of large frames).
game_times

Unnamed: 0,name,url,Main Story time,Main Story tag info,Main + Extra time,Main + Extra tag info,Completionist time,Completionist tag info,Solo time,Solo tag info,Co-Op time,Co-Op tag info,Vs. time,Vs. tag info
0,Ratchet & Clank: Rift Apart,game?id=79776,11 Hours,search_list_tidbit center time_100,14½ Hours,search_list_tidbit center time_100,18 Hours,search_list_tidbit center time_100,,,,,,
1,Control,game?id=57507,11½ Hours,search_list_tidbit center time_100,18½ Hours,search_list_tidbit center time_100,26 Hours,search_list_tidbit center time_100,,,,,,
2,Resident Evil Village,game?id=80038,9½ Hours,search_list_tidbit center time_100,11 Hours,search_list_tidbit center time_100,35 Hours,search_list_tidbit center time_100,,,,,,
3,The Legend of Zelda: Breath of the Wild,game?id=38019,50 Hours,search_list_tidbit center time_100,97 Hours,search_list_tidbit center time_100,188 Hours,search_list_tidbit center time_100,,,,,,
4,The Witcher 3: Wild Hunt,game?id=10270,51 Hours,search_list_tidbit center time_100,102 Hours,search_list_tidbit center time_100,172 Hours,search_list_tidbit center time_100,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47979,Code of Honor 3: Desperate Measures,game?id=1779,2 Hours,search_list_tidbit center time_40,3 Hours,search_list_tidbit center time_40,4 Hours,search_list_tidbit center time_40,,,,,,
47980,Code of Honor: The French Foreign Legion,game?id=1780,2 Hours,search_list_tidbit center time_50,2 Hours,search_list_tidbit center time_40,2 Hours,search_list_tidbit center time_40,,,,,,
47981,Coderunner,game?id=1786,2 Hours,search_list_tidbit center time_40,--,search_list_tidbit center time_00,--,search_list_tidbit center time_00,,,,,,
47982,Coin Dozer,game?id=1793,--,search_list_tidbit center time_00,--,search_list_tidbit center time_00,--,search_list_tidbit center time_00,,,,,,


In [66]:
# Summarise column dtypes and non-null counts to see where data is missing.
game_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47984 entries, 0 to 47983
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    47984 non-null  object
 1   url                     47984 non-null  object
 2   Main Story time         44993 non-null  object
 3   Main Story tag info     44993 non-null  object
 4   Main + Extra time       44993 non-null  object
 5   Main + Extra tag info   44993 non-null  object
 6   Completionist time      44993 non-null  object
 7   Completionist tag info  44993 non-null  object
 8   Solo time               1859 non-null   object
 9   Solo tag info           1859 non-null   object
 10  Co-Op time              1745 non-null   object
 11  Co-Op tag info          1745 non-null   object
 12  Vs. time                2405 non-null   object
 13  Vs. tag info            2405 non-null   object
dtypes: object(14)
memory usage: 5.1+ MB


On the website, a completion time that isn't available is shown as the placeholder **--**. We check every entry and replace this placeholder with NaN.

In [67]:
def replace_with_null(x):
    """Turn the site's "--" placeholder into NaN and pass everything else through.

    Parameters
    ----------
    x : any
        A single cell value. Only plain ``str`` values are inspected.

    Returns
    -------
    any
        ``np.nan`` when ``x`` is a string that equals ``'--'`` once spaces and
        surrounding whitespace are removed; otherwise ``x`` unchanged.
    """
    # Spaces are dropped everywhere, other whitespace only at the ends.
    is_placeholder = type(x) is str and x.replace(' ', '').strip() == '--'
    return np.nan if is_placeholder else x

In [68]:
# Run replace_with_null on every cell of every column (element-wise).
# NOTE(review): DataFrame.applymap is deprecated in newer pandas releases in
# favour of DataFrame.map; keep applymap unless the pandas version is known.
game_times = game_times.applymap(replace_with_null)

In [69]:
# Re-inspect the frame: cells that held the '--' placeholder should now be NaN.
game_times

Unnamed: 0,name,url,Main Story time,Main Story tag info,Main + Extra time,Main + Extra tag info,Completionist time,Completionist tag info,Solo time,Solo tag info,Co-Op time,Co-Op tag info,Vs. time,Vs. tag info
0,Ratchet & Clank: Rift Apart,game?id=79776,11 Hours,search_list_tidbit center time_100,14½ Hours,search_list_tidbit center time_100,18 Hours,search_list_tidbit center time_100,,,,,,
1,Control,game?id=57507,11½ Hours,search_list_tidbit center time_100,18½ Hours,search_list_tidbit center time_100,26 Hours,search_list_tidbit center time_100,,,,,,
2,Resident Evil Village,game?id=80038,9½ Hours,search_list_tidbit center time_100,11 Hours,search_list_tidbit center time_100,35 Hours,search_list_tidbit center time_100,,,,,,
3,The Legend of Zelda: Breath of the Wild,game?id=38019,50 Hours,search_list_tidbit center time_100,97 Hours,search_list_tidbit center time_100,188 Hours,search_list_tidbit center time_100,,,,,,
4,The Witcher 3: Wild Hunt,game?id=10270,51 Hours,search_list_tidbit center time_100,102 Hours,search_list_tidbit center time_100,172 Hours,search_list_tidbit center time_100,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47979,Code of Honor 3: Desperate Measures,game?id=1779,2 Hours,search_list_tidbit center time_40,3 Hours,search_list_tidbit center time_40,4 Hours,search_list_tidbit center time_40,,,,,,
47980,Code of Honor: The French Foreign Legion,game?id=1780,2 Hours,search_list_tidbit center time_50,2 Hours,search_list_tidbit center time_40,2 Hours,search_list_tidbit center time_40,,,,,,
47981,Coderunner,game?id=1786,2 Hours,search_list_tidbit center time_40,,search_list_tidbit center time_00,,search_list_tidbit center time_00,,,,,,
47982,Coin Dozer,game?id=1793,,search_list_tidbit center time_00,,search_list_tidbit center time_00,,search_list_tidbit center time_00,,,,,,


For the columns that represent time, strip the text (e.g. "Hours") and keep only the number. If the value contains the ½ sign, remove it, convert the remaining digits to an int, and then add 0.5.

In [70]:
def exrtact_num_from_txt(txt):
    """Return the integer formed by all ASCII digits found in ``txt``.

    Non-digit characters are skipped and the remaining digits are joined in
    their original order, so ``'a1b2'`` yields ``12``.

    Parameters
    ----------
    txt : str
        Text containing at least one of the characters ``0``-``9``.

    Returns
    -------
    int
        The concatenated digits parsed as a base-10 integer.

    Raises
    ------
    ValueError
        If ``txt`` contains no ASCII digit (``int('')`` is invalid).
    """
    ascii_digits = set('0123456789')
    kept = ''.join(ch for ch in txt.strip() if ch in ascii_digits)
    return int(kept)

In [71]:
def time_to_int(x):
    """Convert a scraped playtime string into a number of hours.

    Despite the name, the result is a float whenever the value is not a whole
    number of hours (e.g. ``'14½ Hours'`` gives ``14.5``).

    Parameters
    ----------
    x : str, float or None
        A cell such as ``'11 Hours'`` or ``'14½ Hours'``. Missing values and
        values that are already numeric are accepted as well.

    Returns
    -------
    int or float
        The playtime in hours. ``None`` becomes NaN; NaN and numeric inputs
        are returned unchanged.

    Raises
    ------
    ValueError
        If ``x`` is a string that contains no digits.
    """
    if x is None:
        return np.nan
    if not isinstance(x, str):
        # Covers every NaN object (not only the np.nan singleton) and numbers
        # produced by an earlier run, so re-running the cell does not crash.
        return x

    num = exrtact_num_from_txt(x)
    if '½' in x:
        num += 0.5
    # NOTE(review): assumes sub-hour entries are labelled like '45 Mins';
    # confirm against the scraped data. Without this, minutes would be stored
    # as if they were hours.
    if 'min' in x.lower():
        num = num / 60
    return num

In [72]:
# Columns whose name mentions "time" hold the playtime text; this leaves out
# name, url and the "tag info" columns.
time_cols = [label for label in game_times.columns if 'time' in label]

# Parse the playtime text in those columns into numbers.
game_times[time_cols] = game_times[time_cols].applymap(time_to_int)

In [73]:
# Re-inspect the frame: the "... time" columns should now be numeric.
game_times

Unnamed: 0,name,url,Main Story time,Main Story tag info,Main + Extra time,Main + Extra tag info,Completionist time,Completionist tag info,Solo time,Solo tag info,Co-Op time,Co-Op tag info,Vs. time,Vs. tag info
0,Ratchet & Clank: Rift Apart,game?id=79776,11.0,search_list_tidbit center time_100,14.5,search_list_tidbit center time_100,18.0,search_list_tidbit center time_100,,,,,,
1,Control,game?id=57507,11.5,search_list_tidbit center time_100,18.5,search_list_tidbit center time_100,26.0,search_list_tidbit center time_100,,,,,,
2,Resident Evil Village,game?id=80038,9.5,search_list_tidbit center time_100,11.0,search_list_tidbit center time_100,35.0,search_list_tidbit center time_100,,,,,,
3,The Legend of Zelda: Breath of the Wild,game?id=38019,50.0,search_list_tidbit center time_100,97.0,search_list_tidbit center time_100,188.0,search_list_tidbit center time_100,,,,,,
4,The Witcher 3: Wild Hunt,game?id=10270,51.0,search_list_tidbit center time_100,102.0,search_list_tidbit center time_100,172.0,search_list_tidbit center time_100,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47979,Code of Honor 3: Desperate Measures,game?id=1779,2.0,search_list_tidbit center time_40,3.0,search_list_tidbit center time_40,4.0,search_list_tidbit center time_40,,,,,,
47980,Code of Honor: The French Foreign Legion,game?id=1780,2.0,search_list_tidbit center time_50,2.0,search_list_tidbit center time_40,2.0,search_list_tidbit center time_40,,,,,,
47981,Coderunner,game?id=1786,2.0,search_list_tidbit center time_40,,search_list_tidbit center time_00,,search_list_tidbit center time_00,,,,,,
47982,Coin Dozer,game?id=1793,,search_list_tidbit center time_00,,search_list_tidbit center time_00,,search_list_tidbit center time_00,,,,,,


In [74]:
# Columns whose name mentions "tag" hold the scraped tag text.
tag_info_cols = [label for label in game_times.columns if 'tag' in label]


# TODO (original author's note): this could be replaced with a lambda.
def temp(entry):
    """Reduce a "tag info" cell to the number embedded in its text.

    Parameters
    ----------
    entry : str, float or None
        A cell such as ``'search_list_tidbit center time_100'``. Missing
        values and values that are already numeric are accepted as well.

    Returns
    -------
    int or float
        The digits of ``entry`` as an integer. ``None`` becomes NaN; NaN and
        numeric inputs are returned unchanged.

    Raises
    ------
    ValueError
        If ``entry`` is a string that contains no digits.
    """
    if isinstance(entry, str):
        return exrtact_num_from_txt(entry)
    if entry is None:
        return np.nan
    # Any NaN object (not only the np.nan singleton) or a number produced by
    # an earlier run passes through, so re-running the cell does not crash.
    return entry
    
# Reduce every "tag info" cell to the number embedded in its text; missing
# cells stay NaN.
game_times[tag_info_cols] = game_times[tag_info_cols].applymap(temp)

In [75]:
# Re-inspect the frame: the "... tag info" columns should now hold only the
# number taken from the tag text.
game_times

Unnamed: 0,name,url,Main Story time,Main Story tag info,Main + Extra time,Main + Extra tag info,Completionist time,Completionist tag info,Solo time,Solo tag info,Co-Op time,Co-Op tag info,Vs. time,Vs. tag info
0,Ratchet & Clank: Rift Apart,game?id=79776,11.0,100.0,14.5,100.0,18.0,100.0,,,,,,
1,Control,game?id=57507,11.5,100.0,18.5,100.0,26.0,100.0,,,,,,
2,Resident Evil Village,game?id=80038,9.5,100.0,11.0,100.0,35.0,100.0,,,,,,
3,The Legend of Zelda: Breath of the Wild,game?id=38019,50.0,100.0,97.0,100.0,188.0,100.0,,,,,,
4,The Witcher 3: Wild Hunt,game?id=10270,51.0,100.0,102.0,100.0,172.0,100.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47979,Code of Honor 3: Desperate Measures,game?id=1779,2.0,40.0,3.0,40.0,4.0,40.0,,,,,,
47980,Code of Honor: The French Foreign Legion,game?id=1780,2.0,50.0,2.0,40.0,2.0,40.0,,,,,,
47981,Coderunner,game?id=1786,2.0,40.0,,0.0,,0.0,,,,,,
47982,Coin Dozer,game?id=1793,,0.0,,0.0,,0.0,,,,,,


In [81]:
# Write the cleaned frame to disk. to_csv writes the row index by default, so
# the file starts with an unnamed index column.
game_times.to_csv('timeclean.csv')