# File transfer

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import scipy

In [2]:
def convert_to_parquet(filename, out_filename):
    """Load a CSV file and re-save its contents in Parquet format.

    Parameters
    ----------
    filename
        Path of the CSV file to read with ``pandas.read_csv``.
    out_filename
        Output path *without* extension; ``".parquet"`` is appended to it.
    """
    table = pd.read_csv(filename)
    parquet_path = out_filename + ".parquet"
    table.to_parquet(parquet_path)

In [3]:
def convert_h5_to_sparse_csr(filename, out_filename, chunksize=2500):
    """Convert an HDF5-stored DataFrame into a sparse CSR matrix on disk.

    The frame is read ``chunksize`` rows at a time so that the dense data
    never has to be held in memory all at once. Each chunk is converted to a
    ``scipy.sparse.csr_matrix`` and the chunks are stacked vertically.

    Two files are written:

    * ``out_filename + "_values.sparse"`` (passed to ``scipy.sparse.save_npz``,
      which appends ``.npz`` when the name does not already end with it) --
      the stacked sparse values.
    * ``out_filename + "_idxcol.npz"`` -- a NumPy archive with the arrays
      ``index`` (row labels of all chunks, concatenated) and ``columns``
      (column labels).

    Parameters
    ----------
    filename
        Path of the HDF5 file to read with ``pandas.read_hdf``.
    out_filename
        Prefix used to build the two output file names.
    chunksize
        Number of rows read per iteration. Must be positive.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive, if the input contains no rows, or
        if a chunk's column labels differ from those of the first chunk.
    """
    if chunksize <= 0:
        raise ValueError(
            f"chunksize must be a positive integer, got {chunksize!r}"
        )

    sparse_chunks = []
    index_chunks = []
    columns_name = None
    start = 0
    while True:
        df_chunk = pd.read_hdf(filename, start=start, stop=start + chunksize)
        n_rows = len(df_chunk)
        if n_rows == 0:
            # Reached the end (row count was an exact multiple of chunksize)
            # or the input holds no rows at all.
            break

        chunk_columns = df_chunk.columns.to_numpy()
        if columns_name is None:
            columns_name = chunk_columns
        elif not np.array_equal(columns_name, chunk_columns):
            # An explicit raise (not assert) so the check survives `python -O`;
            # array_equal also copes with arrays of different lengths.
            raise ValueError(
                f"column labels of rows {start}-{start + n_rows - 1} in "
                f"{filename!r} differ from those of the first chunk"
            )

        sparse_chunks.append(scipy.sparse.csr_matrix(df_chunk.to_numpy()))
        index_chunks.append(df_chunk.index.to_numpy())

        # Drop the dense chunk before the next read to keep peak memory low.
        del df_chunk

        if n_rows < chunksize:
            # A short chunk means the end of the data was reached.
            break
        start += chunksize

    if not sparse_chunks:
        raise ValueError(f"no rows found in {filename!r}; nothing to convert")

    all_data_sparse = scipy.sparse.vstack(sparse_chunks)
    del sparse_chunks  # the per-chunk matrices are no longer needed

    all_indices = np.hstack(index_chunks)

    scipy.sparse.save_npz(out_filename + "_values.sparse", all_data_sparse)
    np.savez(
        out_filename + "_idxcol.npz",
        index=all_indices,
        columns=columns_name,
    )

### Training set

In [None]:
# Convert ../multi/train_multi_targets.h5 to sparse form; the output files
# are written to the working directory with the prefix "train_multi_targets".
convert_h5_to_sparse_csr("../multi/train_multi_targets.h5", "train_multi_targets")

In [None]:
# Convert ../multi/train_multi_inputs.h5 to sparse form; the output files
# are written to the working directory with the prefix "train_multi_inputs".
convert_h5_to_sparse_csr("../multi/train_multi_inputs.h5", "train_multi_inputs")

In [7]:
# Convert ../citeseq/train_cite_targets.h5 to sparse form; the output files
# are written to the working directory with the prefix "train_cite_targets".
convert_h5_to_sparse_csr("../citeseq/train_cite_targets.h5", "train_cite_targets")

In [8]:
# Convert ../citeseq/train_cite_inputs.h5 to sparse form; the output files
# are written to the working directory with the prefix "train_cite_inputs".
convert_h5_to_sparse_csr("../citeseq/train_cite_inputs.h5", "train_cite_inputs")

### Testing set

In [9]:
# Convert ../multi/test_multi_inputs.h5 to sparse form; the output files
# are written to the working directory with the prefix "test_multi_inputs".
convert_h5_to_sparse_csr("../multi/test_multi_inputs.h5", "test_multi_inputs")

In [10]:
# Convert ../citeseq/test_cite_inputs.h5 to sparse form; the output files
# are written to the working directory with the prefix "test_cite_inputs".
convert_h5_to_sparse_csr("../citeseq/test_cite_inputs.h5", "test_cite_inputs")

### Eval

In [3]:
# Re-save ../submission/evaluation_ids.csv as evaluation.parquet in the
# working directory.
convert_to_parquet("../submission/evaluation_ids.csv", "evaluation")

In [4]:
# Re-save ../submission/metadata.csv as metadata.parquet in the working
# directory.
convert_to_parquet("../submission/metadata.csv", "metadata")