In [3]:
from pathlib import Path
from PIL import Image
import pandas as pd
import ast, os
import i2v

root_path   = Path('../steam_data/')
## List of users. For each user, all items purchased in the past.
#### For each item: [item_id, item_name, playtime_forever, playtime_2weeks]
items_path  = Path(root_path, 'australian_users_items.json')

## List of games and their attributes.
#### For each game: [publisher, genres, app_name, title, url
####                 release_date, tags, discount_price, reviews_url,
####                 specs, price, early_access, id, developer]
games_path  = Path(root_path, 'steam_games.json')

# illust2vec = i2v.make_i2v_with_chainer(
#     "i2v/illust2vec_tag_ver200.caffemodel", "i2v/tag_list.json")

### content.csv

In [4]:
items = {'id': [], 'genres': [], 'tags': [], 'i2v': [], 'i2v_conf': []}

with open(games_path, encoding='utf-8') as fp:
    lines = fp.read().split('\n')
    n = len(lines)
    
    for k, line in enumerate(lines[:-1]):
        
#         if k < 28626: continue
        
        print(f'\r{k}/{n}', flush=True, end='')
        
        data = ast.literal_eval(line)
        num_words = 0.
        
        if 'id' not in data.keys(): continue
        
        genres, tags, all_tags, all_conf = [], [], [], []
        
        if 'genres' in data.keys():
            genres = data['genres']
            num_words+=1
        
        if 'tags' in data.keys():
            tags = data['tags']
            num_words+=1
        
        imgspath = Path(root_path, 'media', data['id'])
        if os.path.isdir(imgspath):
        
            imgs = os.listdir(imgspath)

            for img in imgs:
                if 'header' in img or 'screenshot' in img: 

                    imgpath = Path(imgspath, img)
                    try:
                        img  = Image.open(imgpath)
                    except:
                        continue
                    tuples = illust2vec.estimate_plausible_tags([img], threshold=0.1)
                    i2v  = [tag[0] for tag in tuples[0]['general']]
                    conf = [tag[1] for tag in tuples[0]['general']]

                    all_tags.extend(i2v)
                    all_conf.extend(conf)

                    num_words += 1
        
        if num_words > 0:
            items['id'].append(data['id'])
            items['genres'].append(genres)
            items['tags'].append(tags)
            items['i2v'].append( all_tags )
            items['i2v_conf'].append( all_conf )
        

32134/32136

In [21]:
df = pd.DataFrame(items).to_csv('content.csv')
df.head(10)

Unnamed: 0,id,genres,tags,i2v,i2v_conf
0,761140,"[Action, Casual, Indie, Simulation, Strategy]","[Strategy, Action, Indie, Casual, Simulation]","[no humans, photo, solo, parody, 1girl, orange...","[0.41829943656921387, 0.24202537536621094, 0.1..."
1,643980,"[Free to Play, Indie, RPG, Strategy]","[Free to Play, Strategy, Indie, RPG, Card Game...","[no humans, photo, solo, 1girl, solo, hat, pho...","[0.38092318177223206, 0.2749617099761963, 0.13..."
2,670290,"[Casual, Free to Play, Indie, Simulation, Sports]","[Free to Play, Simulation, Sports, Casual, Ind...","[solo, 1girl, photo, no humans, chair, solo, 1...","[0.4940672516822815, 0.31001073122024536, 0.26..."
3,767400,"[Action, Adventure, Casual]","[Action, Adventure, Casual]","[no humans, solo, photo, parody, 1boy, english...","[0.32657280564308167, 0.32152217626571655, 0.2..."
4,773570,[],"[Action, Indie, Casual, Sports]","[solo, 1girl, no humans, english, letterboxed,...","[0.48371535539627075, 0.41923975944519043, 0.2..."
5,772540,"[Action, Adventure, Simulation]","[Action, Adventure, Simulation, FPS, Shooter, ...","[tree, nature, water, 1girl, no humans, grass,...","[0.786588191986084, 0.6902362704277039, 0.3383..."
6,774276,"[Free to Play, Indie, Simulation, Sports]","[Free to Play, Indie, Simulation, Sports]","[comic, no humans, parody, screencap, no human...","[0.3170931339263916, 0.2462535798549652, 0.152..."
7,774277,"[Free to Play, Indie, Simulation, Sports]","[Free to Play, Indie, Simulation, Sports]","[no humans, comic, parody, no humans, english,...","[0.3281368315219879, 0.27011388540267944, 0.12..."
8,774278,"[Free to Play, Indie, Simulation, Sports]","[Free to Play, Indie, Simulation, Sports]","[no humans, comic, screencap, parody, no human...","[0.21559679508209229, 0.17124179005622864, 0.1..."
9,768800,"[Casual, Indie, Racing, Simulation]","[Indie, Casual, Simulation, Racing]","[solo, 1girl, no humans, 1boy, skirt, short ha...","[0.5948513746261597, 0.43106314539909363, 0.27..."


### ratings.csv

In [None]:
ratings = {'user': [], 'item': [], 'playtime': []}

with open(items_path, encoding='utf-8') as fp:
    lines = fp.read().split('\n')
    n = len(lines)
    
    for k, line in enumerate(lines[:-1]):
        print(f'\r{k}/{n}', flush=True, end='')
        
        data = ast.literal_eval(line)
        
        items = data['items']
        for item in items:
        
            ratings['user'].append(data['user_id'])
            ratings['item'].append(item['item_id'])
            ratings['playtime'].append(item['playtime_forever'])
        
        
df = pd.DataFrame(ratings)
df.to_csv('../steam_data/ratings.csv', index=None)


# playtime = playtime/60
# if playtime > 100:
#     playtime = 100
# playtime = playtime // 10

## Folds

In [4]:
root_path    = Path('../steam_data/')
folds_path   = Path(root_path, 'folds')
score_path   = Path(root_path, 'ratings_score.csv')

ratings_score = pd.read_csv(score_path)
ratings_score.head()

Unnamed: 0,user,item,playtime,score
0,76561197970982479,10,6,4.714286
1,76561197970982479,20,0,5.0
2,76561197970982479,30,7,4.666667
3,76561197970982479,40,0,5.0
4,76561197970982479,50,0,5.0


In [21]:
dic = {}
for k, idx in enumerate(ratings_score.index):
    print(f'\r{k}/{len(ratings_score)}', end='', flush=True)
    key= str(ratings_score.loc[idx]['user']) + ':' + str(ratings_score.loc[idx]['item'])
    value = ratings_score.loc[idx]['score'] 
    dic[key]=value 

5075979/5075980

In [24]:
list(dic.keys())[0:10]

['76561197970982479:10',
 '76561197970982479:20',
 '76561197970982479:30',
 '76561197970982479:40',
 '76561197970982479:50',
 '76561197970982479:60',
 '76561197970982479:70',
 '76561197970982479:130',
 '76561197970982479:300',
 '76561197970982479:240']

In [25]:
for fold in range(5):
    print('\nFold:', fold)
    path = Path(folds_path, ' ' + str(fold))
    
    rtrain = pd.read_csv(Path(path, 'train.csv'))
    rtest  = pd.read_csv(Path(path, 'test.csv'))
    
    for ratings, mode in zip([rtrain, rtest], ['train', 'test']):
        print()
        score = []
        drop= []
        for k, sample in enumerate(ratings.index):
            print(f'\r{k}/{len(ratings)}', end='', flush=True)
            
            user = rtrain.loc[sample]['user']
            item = rtrain.loc[sample]['item']
            
            key = str(user) + ':' + str(item)
            if dic.get(key) is None:
                drop.append(k)
                continue
                
            score.append(dic[key])

        print(len(drop))
        ratings = ratings.drop(drop)
        ratings['score'] = score
        display(ratings.head())
        ratings.to_csv(Path(path, f'{mode}_score.csv'))
        


Fold: 0

4122566/412256761651


Unnamed: 0,user,item,playtime,score
0,76561197970982479,10,0.0,4.714286
1,76561197970982479,20,0.0,5.0
2,76561197970982479,30,0.0,4.666667
3,76561197970982479,40,0.0,5.0
4,76561197970982479,50,0.0,5.0



1030641/10306426117


Unnamed: 0,user,item,playtime,score
0,76561197970982479,3920,0.0,4.714286
1,76561197970982479,320,0.0,5.0
2,76561197970982479,7940,1.0,4.666667
3,76561197970982479,17340,0.0,5.0
4,76561197970982479,500,0.0,5.0



Fold: 1

3657430/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3675817/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3686579/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3696954/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3707750/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3718576/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3728972/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3739843/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3747882/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3759340/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3796470/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3807062/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3817937/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3826703/4122567

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



4122566/412256761800


Unnamed: 0,user,item,playtime,score
0,76561197970982479,10,0.0,4.714286
1,76561197970982479,20,0.0,5.0
2,76561197970982479,30,0.0,4.666667
3,76561197970982479,40,0.0,5.0
4,76561197970982479,50,0.0,5.0



1030641/10306426096


Unnamed: 0,user,item,playtime,score
0,76561197970982479,60,0.0,4.714286
1,76561197970982479,130,0.0,5.0
2,76561197970982479,2630,0.0,4.666667
3,76561197970982479,220,1.0,5.0
4,76561197970982479,12900,0.0,5.0



Fold: 2

4122566/412256761709


Unnamed: 0,user,item,playtime,score
0,76561197970982479,10,0.0,4.714286
1,76561197970982479,20,0.0,5.0
2,76561197970982479,30,0.0,4.666667
3,76561197970982479,40,0.0,5.0
4,76561197970982479,50,0.0,5.0



1030641/10306426072


Unnamed: 0,user,item,playtime,score
0,76561197970982479,300,7.0,4.714286
1,76561197970982479,3830,0.0,5.0
2,76561197970982479,3900,0.0,4.666667
3,76561197970982479,6910,4.0,5.0
4,76561197970982479,7670,1.0,5.0



Fold: 3

4122566/412256761823


Unnamed: 0,user,item,playtime,score
0,76561197970982479,10,0.0,4.714286
1,76561197970982479,50,0.0,5.0
2,76561197970982479,60,0.0,5.0
3,76561197970982479,70,0.0,5.0
4,76561197970982479,130,0.0,5.0



1030641/10306426151


Unnamed: 0,user,item,playtime,score
0,76561197970982479,20,0.0,4.714286
1,76561197970982479,30,0.0,5.0
2,76561197970982479,40,0.0,5.0
3,76561197970982479,240,3.0,5.0
4,76561197970982479,400,0.0,5.0



Fold: 4

4122567/412256861933


Unnamed: 0,user,item,playtime,score
0,76561197970982479,20,0.0,5.0
1,76561197970982479,30,0.0,4.666667
2,76561197970982479,40,0.0,5.0
3,76561197970982479,60,0.0,5.0
4,76561197970982479,130,0.0,5.0



1030640/10306416138


Unnamed: 0,user,item,playtime,score
0,76561197970982479,10,0.0,5.0
1,76561197970982479,50,0.0,4.666667
2,76561197970982479,70,0.0,5.0
3,76561197970982479,34440,0.0,5.0
4,76561197970982479,6400,0.0,5.0


In [26]:
for fold in range(5):
    print('---------------------------')
    print('Fold:', fold)
    print('---------------------------')
    path = Path(folds_path, ' ' + str(fold))
    
    rtrain = pd.read_csv(Path(path, 'train.csv'))
    rtest  = pd.read_csv(Path(path, 'test.csv'))
    
    print(len(rtrain), len(rtest))
    
    rtrain = pd.read_csv(Path(path, 'train_score.csv'))
    rtest  = pd.read_csv(Path(path, 'test_score.csv'))
    
    print(len(rtrain), len(rtest))
    

---------------------------
Fold: 0
---------------------------
4122567 1030642
4060916 1024525
---------------------------
Fold: 1
---------------------------
4122567 1030642
4060767 1024546
---------------------------
Fold: 2
---------------------------
4122567 1030642
4060858 1024570
---------------------------
Fold: 3
---------------------------
4122567 1030642
4060744 1024491
---------------------------
Fold: 4
---------------------------
4122568 1030641
4060635 1024503


In [37]:
from shutil import copyfile
new_folds_path   = Path(root_path, 'folds_score')

if not os.path.isdir(new_folds_path):
    os.mkdir(new_folds_path)

for fold in range(5):
    print('---------------------------')
    print('Fold:', fold)
    print('---------------------------')
    path = Path(folds_path, ' ' + str(fold))

#     new_path = Path(new_folds_path, ' ' + str(fold))
#     if not os.path.isdir(new_path):
#         os.mkdir(new_path)
    
    
    os.remove(Path(path, 'train_score.csv'))
    os.remove(Path(path, 'test_score.csv'))
#     copyfile(Path(path, 'train_score.csv'), Path(new_path, 'train_score.csv'))
#     copyfile(Path(path, 'test_score.csv'), Path(new_path, 'test_score.csv'))
    
#     rtrain = pd.read_csv(Path(path, 'train_score.csv'))
#     rtest  = pd.read_csv(Path(path, 'test_score.csv'))

---------------------------
Fold: 0
---------------------------
---------------------------
Fold: 1
---------------------------
---------------------------
Fold: 2
---------------------------
---------------------------
Fold: 3
---------------------------
---------------------------
Fold: 4
---------------------------


In [39]:
! zip -r folds.zip /mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds

# ! zip -r folds_score.zip /mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds_score

  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ (stored 0%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 0/ (stored 0%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 0/train.csv (deflated 83%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 0/test.csv (deflated 81%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 1/ (stored 0%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 1/train.csv (deflated 83%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 1/test.csv (deflated 81%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 4/ (stored 0%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 4/train.csv (deflated 83%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 4/test.csv (deflated 81%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_data/folds/ 2/ (stored 0%)
  adding: mnt/DADOS_PARIS1/laranjeira/recsteam/steam_d