<a href="https://colab.research.google.com/github/byronknoll/tensorflow-compress/blob/master/nncp-splitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NNCP Splitter

### Description

This notebook can be used to split files that have been preprocessed by NNCP. This is for compression using [tensorflow-compress](https://github.com/byronknoll/tensorflow-compress). The primary use-case is to get around Colab's session time limit by processing large files in smaller parts.

This file splitting does not use the naive method of dividing the file into consecutive parts. Instead, it takes into account the batch size used in tensorflow-compress so that the same sequence of symbols will be used for compressing the split parts as for the original file.

### Instructions

1.   In tensorflow-compress, using "preprocess_only" mode, choose "nncp" preprocessor and download the result.
2.   Upload the preprocessed file (named "preprocessed.dat") to this notebook, and download the split parts.
3.   In tensorflow-compress, compress each split part sequentially, enabling the checkpoint option. Choose "nncp-done" as the preprocessor.
4.   In tensorflow-compress, decompress each split part sequentially, enabling the checkpoint option. Choose "nncp-done" as the preprocessor.
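5.   Set the mode to "join" (keeping the same batch_size and num_parts that were used for splitting) and upload the decompressed parts to this notebook to reproduce the original file. The files should be named: part.0, part.1, ..., part.(num_parts - 1). Also upload the original NNCP dictionary file (named "dictionary.words").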



## Parameters

In [None]:
# Notebook-wide settings read by the cells below (Colab form annotations follow each value).
batch_size = 96 #@param {type:"integer"}
#@markdown >_Set this to the same value that will be used in tensorflow-compress._
mode = 'split' #@param ["split", "join"]
num_parts = 4 #@param {type:"integer"}
#@markdown >_This is the number of parts the file should be split into (in "join" mode, the number of parts to read back)._
http_path = '' #@param {type:"string"}
#@markdown >_The file from this URL will be downloaded. It is recommended to use Google Drive URLs to get fast transfer speed. Use this format for Google Drive files: https://drive.google.com/uc?id= and paste the file ID at the end of the URL. You can find the file ID from the "Get Link" URL in Google Drive. You can enter multiple URLs here, space separated._
local_upload = False #@param {type:"boolean"}
#@markdown >_If enabled, you will be prompted in the "Setup Files" section to select files to upload from your local computer. You can upload multiple files. Note: the upload speed can be quite slow (use "http_path" for better transfer speeds)._
download_option = "no_download" #@param ["no_download", "local", "google_drive"]
#@markdown >_If this is set to "local", the output files will be downloaded to your computer. If set to "google_drive", they will be copied to your Google Drive account (which is significantly faster than downloading locally)._

## Setup

In [None]:
#@title Imports

from google.colab import files
from google.colab import drive
import math

In [None]:
#@title Mount Google Drive
# Drive only has to be mounted when the results will be copied to it.
use_drive = (download_option == "google_drive")
if use_drive:
  drive.mount('/content/gdrive')

In [None]:
#@title Setup Files

# All input files are gathered in ./data; the later cells read them from there.
!mkdir -p "data"

# Optional browser upload; the working directory is switched to data/ so the
# uploaded files are saved there, then switched back.
if local_upload:
  %cd data
  files.upload()
  %cd ..

# Optional download of one or more whitespace-separated URLs into data/.
if http_path:
  %cd data
  paths = http_path.split()
  for path in paths:
    !gdown $path
  %cd ..

# Join mode runs ./nncp-2019-11-16/preprocess, so fetch the NNCP sources and
# build that target. The archive extracted below is presumably the file that
# the gdown call just downloaded (TODO confirm the Drive id still resolves).
if mode == "join":
  !gdown --id 1EzVPbRkBIIbgOzvEMeM0YpibDi2R4SHD
  !tar -xf nncp-2019-11-16.tar.gz
  %cd nncp-2019-11-16/
  !make preprocess
  %cd ..

## Run

In [None]:
#@title Split/Join

if mode == "split":
  input_path = "data/preprocessed.dat"
  orig = open(input_path, 'rb').read()
  int_list = []
  for i in range(0, len(orig), 2):
    int_list.append(orig[i] * 256 + orig[i+1])
  file_len = len(int_list)
  split = math.ceil(file_len / batch_size)
  part_split = math.ceil(file_len / (num_parts * batch_size))
  pos = 0
  for i in range(num_parts):
    output = []
    for j in range(batch_size):
      for k in range(part_split):
        if pos + k >= split:
          break
        index = pos + (j*split) + k
        if index >= file_len:
          break
        output.append(int_list[index])
    pos += part_split
    with open(("data/part." + str(i)), "wb") as out:
      for j in range(len(output)):
        out.write(bytes(((output[j] // 256),)))
        out.write(bytes(((output[j] % 256),)))

if mode == "join":
  file_len = 0
  for i in range(num_parts):
    part = open("data/part." + str(i), 'rb').read()
    file_len += len(part) / 2
  split = math.ceil(file_len / batch_size)
  part_split = math.ceil(file_len / (num_parts * batch_size))
  int_list = [0] * math.floor(file_len)
  pos = 0
  for i in range(num_parts):
    part = open("data/part." + str(i), 'rb').read()
    part_list = []
    for j in range(0, len(part), 2):
      part_list.append(part[j] * 256 + part[j+1])
    index2 = 0
    for j in range(batch_size):
      for k in range(part_split):
        if pos + k >= split:
          break
        index = pos + (j*split) + k
        if index >= file_len:
          break
        int_list[index] = part_list[index2]
        index2 += 1
    pos += part_split
  with open("data/output.dat", "wb") as out:
    for i in range(len(int_list)):
      out.write(bytes(((int_list[i] // 256),)))
      out.write(bytes(((int_list[i] % 256),)))
  !./nncp-2019-11-16/preprocess d data/dictionary.words ./data/output.dat ./data/final.dat

In [None]:
#@title File Sizes
# Long listing of data/, showing the size of every input and output file.
!ls -l data

In [None]:
#@title MD5
# MD5 checksum of every file in data/, useful for comparing files by hash.
!md5sum data/*

In [None]:
#@title Download Result
def download(path):
  """Downloads the file at the specified path."""
  if download_option == 'local':
    files.download(path)
  elif download_option == 'google_drive':
    !cp -f $path /content/gdrive/My\ Drive

# Collect the output files produced by the selected mode, then deliver each one.
if mode == "split":
  outputs = ["data/part." + str(n) for n in range(num_parts)]
elif mode == "join":
  outputs = ["data/final.dat"]
else:
  outputs = []
for output_path in outputs:
  download(output_path)