In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from os import path, makedirs

In [None]:
#
## compatibility with SMA implementations
def _store_csr(split_df, fo_name, buf_size=10**5):
    """Write one text line per user in the sparse row format used by the SMA code.

    The rows of ``split_df`` are grouped by ``cuid`` and every group becomes
    one line of the form::

        0 0 <n> <n> <cuid>:0 ... <ciid>:<rating> ...

    where ``n`` is the number of ratings in the group, the ``<cuid>:0`` token
    is repeated ``n`` times and each rating is printed with one decimal place.
    Groups are written in ascending ``cuid`` order (pandas' default
    ``sort=True``); inside a group the original row order is kept. Every line,
    including the last one, is terminated by ``\\n``.

    Parameters
    ----------
    split_df : pandas.DataFrame
        Must provide the columns ``cuid``, ``ciid`` and ``rating``.
    fo_name : str
        Path of the output file; it is opened in ``'w'`` mode, so an existing
        file is overwritten.
    buf_size : int, optional
        Number of formatted lines kept in memory before they are written out.
    """
    # Group on the bare column name rather than a one-element list: pandas
    # then uses scalar group keys on every version, and iterating the groupby
    # directly avoids one get_group() lookup per user.
    grouped = split_df.groupby('cuid')

    with open(fo_name, 'w') as fo:
        pending = []
        for _, user_rows in tqdm(grouped, total=grouped.ngroups):
            user_ids = user_rows['cuid'].values
            item_ids = user_rows['ciid'].values
            ratings = user_rows['rating'].values

            user_feats = ['%d:%d' % (uid, 0) for uid in user_ids]
            item_feats = ['%d:%.1f' % (iid, val)
                          for iid, val in zip(item_ids, ratings)]
            header = '0 0 %d %d' % (user_ids.size, item_ids.size)
            pending.append(' '.join([header] + user_feats + item_feats))

            # ">=" instead of "% buf_size == 0": same flush points for any
            # positive buf_size, but no ZeroDivisionError when buf_size is 0.
            if len(pending) >= buf_size:
                fo.write('\n'.join(pending) + '\n')
                pending = []

        # Write whatever is left over from the last, partially filled buffer.
        if pending:
            fo.write('\n'.join(pending) + '\n')
    # The ``with`` block closes the file; no explicit close() is needed.

        
def _read_ratings(fi_name):
    """Load a headerless ``cuid:ciid:rating`` file whose fields are separated by one or more colons."""
    return pd.read_csv(fi_name, header=None,
                       names=['cuid', 'ciid', 'rating'],
                       sep=':+', engine='python')


def sma_format(fi_dir, fo_pattern, **kwargs):
    """Convert a training/testing split into the files read by the SMA code.

    Reads ``<fi_dir>trainingset`` and ``<fi_dir>testingset``, writes them via
    ``_store_csr`` to ``fo_pattern % 'train.data'`` and
    ``fo_pattern % 'test.data'``, and writes ``<fi_dir>dConfig.properties``
    with the dataset dimensions, the rating range and the row/entry counts
    of both splits.

    Parameters
    ----------
    fi_dir : str
        Prefix of the input files. It is concatenated as-is with the file
        names, so a directory must include its trailing path separator.
    fo_pattern : str
        ``%``-format string with a single ``%s`` placeholder that receives
        the output file name.
    **kwargs
        Accepted for call compatibility; not used.
    """
    print('make CSR for training')
    tr_df = _read_ratings('%strainingset' % fi_dir)
    _store_csr(tr_df, fo_pattern % 'train.data')

    print('make CSR for testing')
    ev_df = _read_ratings('%stestingset' % fi_dir)
    _store_csr(ev_df, fo_pattern % 'test.data')

    print('draw configs for entire data')
    # Dimensions and rating range are taken over BOTH splits: an ID that only
    # occurs in the testing set must still fit inside the declared user/item
    # counts. When the training set already covers everything the values are
    # the same as computing them from the training set alone.
    user_count = pd.concat([tr_df['cuid'], ev_df['cuid']]).max() + 1
    item_count = pd.concat([tr_df['ciid'], ev_df['ciid']]).max() + 1
    all_ratings = pd.concat([tr_df['rating'], ev_df['rating']])

    config_lines = [
        '$USER_COUNT_VALUE=%d' % user_count,
        '$ITEM_COUNT_VALUE=%d' % item_count,
        '$MAX_RATING_VALUE=%.1f' % all_ratings.max(),
        '$MIN_RATING_VALUE=%.1f' % all_ratings.min(),
        # One output row per distinct user. nunique() ignores NaN, matching
        # the groupby in _store_csr, which drops NaN keys.
        '$TRAIN_ROW_NUM_VALUE=%d' % tr_df['cuid'].nunique(),
        # Every rating contributes two tokens (user feature + item feature).
        '$TRAIN_VAL_NUM_VALUE=%d' % (2 * tr_df.shape[0]),
        '$TEST_ROW_NUM_VALUE=%d' % ev_df['cuid'].nunique(),
        '$TEST_VAL_NUM_VALUE=%d' % (2 * ev_df.shape[0]),
    ]

    # NOTE(review): the config is written next to the inputs (fi_dir), not via
    # fo_pattern like the data files - kept as-is for existing callers.
    with open('%sdConfig.properties' % fi_dir, 'w') as fo:
        fo.write('\n'.join(config_lines))
    print('done')

In [None]:
# Inputs and outputs share one directory: the split files are read from it and
# the generated files are written back into it. The trailing slash is required
# because the path is used as a plain string prefix.
fio_dir = '/home/crl_it/data/amazon/2/'
sma_format(fi_dir=fio_dir, fo_pattern=fio_dir + '%s')