# Import libraries

In [6]:
import numpy as np
import string
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
%matplotlib inline

# Simulating the data

## Binary data

In [7]:
# Desired share (in percent) of 0-bits in each simulated file: 50, 60, ..., 100.
percentages = list(range(50, 101, 10))

# Number of random bits per file; packed eight to a byte this gives 100 MiB.
n_bits = 8 * 1024 * 1024 * 100

for p in percentages:
    # Draw n_bits independent bits where 0 has probability p/100 and 1 has
    # probability 1 - p/100, then pack every eight bits into one byte.
    # The draw stays nested inside packbits so the large unpacked array is
    # released as soon as it has been packed.
    binary_data = np.packbits(
        np.random.choice([0, 1], size=n_bits, replace=True, p=[p/100, 1 - p/100])
    )
    # Use a context manager so the file is flushed and closed before the next
    # iteration instead of relying on garbage collection.
    with open('Data/zeros_%sp' % p, 'wb') as out_file:
        out_file.write(binary_data)

## DNA data

In [10]:
# Generate a DNA sequence 100 million letters long, each base drawn with equal
# probability, and save it into a file.
# Only the bare letters are written: the file has no header line and no line breaks.

dna = np.random.choice(['A', 'T', 'G', 'C'], size=100000000, replace=True)

# The context manager guarantees the data is flushed and the handle closed
# before any later cell reads or compresses the file.
with open('Data/dna.fa', 'w') as dna_file:
    dna_file.write(''.join(dna))

## Protein data

In [9]:
# Generate a "protein" sequence 100 million letters long, each letter drawn with
# equal probability, and save it into a file.
# NOTE(review): the alphabet is all 26 uppercase ASCII letters, not the 20
# standard amino-acid codes; it is kept as-is so the summary table still
# applies. Confirm whether this is intended.

protein = np.random.choice(list(string.ascii_uppercase), size=100000000, replace=True)

# The context manager guarantees the data is flushed and the handle closed
# before any later cell reads or compresses the file.
with open('Data/protein.fa', 'w') as protein_file:
    protein_file.write(''.join(protein))

# Compressing the data

In [11]:
# Sanity check: list the names of the regular files directly under Data/
# (sub-directories are skipped) and print them.

onlyfiles = []
for entry in listdir('Data'):
    if isfile(join('Data', entry)):
        onlyfiles.append(entry)
print(str(onlyfiles))

['zeros_90p', 'zeros_80p', 'dna.fa', 'protein.fa', 'zeros_100p', 'zeros_50p', 'zeros_70p', 'zeros_60p']


In [12]:
from pathlib import Path

onlyfiles = [f for f in listdir('Data') if isfile(join('Data', f))]        # a list containing all the files to be compressed

for f in onlyfiles:
    print("---------------------------------------------------------")
    file = 'Data/' + str(f)
    art_file = file + '.art'
    zip_file = file +'.gz'
    duplicate_file = 'Data/ZIP/' + str(f)                   # some file names to be used

    command = !rm $zip_file                             # removed the already generated compressed files
    
    print("\n", "Terminal output for %s using gzip" %file)     # compress each file using four commands
    !time gzip -k $file
    print("\n", "Terminal output for %s using bzip2" %file)
    !time bzip2 -k $file
    print("\n", "Terminal output for %s using pbzip2" %file)
    !time pbzip2 -k $duplicate_file
    print("\n", "Terminal output for %s using ArithmeticCompress" %file)
    !time ArithmeticCompress $file $art_file

---------------------------------------------------------

 Terminal output for Data/zeros_90p using gzip
23.57user 0.13system 0:23.70elapsed 99%CPU (0avgtext+0avgdata 1824maxresident)k
0inputs+114728outputs (0major+136minor)pagefaults 0swaps

 Terminal output for Data/zeros_90p using bzip2
11.14user 0.11system 0:11.26elapsed 99%CPU (0avgtext+0avgdata 7740maxresident)k
0inputs+119464outputs (0major+1690minor)pagefaults 0swaps

 Terminal output for Data/zeros_90p using pbzip2
19.30user 0.78system 0:00.76elapsed 2639%CPU (0avgtext+0avgdata 275020maxresident)k
0inputs+119488outputs (0major+223335minor)pagefaults 0swaps

 Terminal output for Data/zeros_90p using ArithmeticCompress
28.51user 0.29system 0:28.81elapsed 99%CPU (0avgtext+0avgdata 4284maxresident)k
0inputs+96064outputs (0major+232minor)pagefaults 0swaps
---------------------------------------------------------

 Terminal output for Data/zeros_80p using gzip
16.52user 0.19system 0:16.72elapsed 99%CPU (0avgtext+0avgdata 1776maxres

## Table Summary

|original file|command type|input file size|output file size|elapsed time|
|------|------|------|------|------|
|zeros_50p|gzip|105 MB|105 MB|0:04.45|
|zeros_50p|bzip2|105 MB|105 MB|0:16.72|
|zeros_50p|pbzip2|105 MB|105 MB|0:01.50|
|zeros_50p|ArithmeticCompress|105 MB|105 MB|0:40.79|
|zeros_60p|gzip|105 MB|102 MB|0:05.63|
|zeros_60p|bzip2|105 MB|105 MB|0:18.21|
|zeros_60p|pbzip2|105 MB|105 MB|0:01.39|
|zeros_60p|ArithmeticCompress|105 MB|102 MB|0:42.58|
|zeros_70p|gzip|105 MB|93.6 MB|0:06.49|
|zeros_70p|bzip2|105 MB|99.8 MB|0:14.28|
|zeros_70p|pbzip2|105 MB|99.8 MB|0:01.17|
|zeros_70p|ArithmeticCompress|105 MB|92.4 MB|0:48.34|
|zeros_80p|gzip|105 MB|81.2 MB|0:16.72|
|zeros_80p|bzip2|105 MB|86.6 MB|0:12.04|
|zeros_80p|pbzip2|105 MB|86.7 MB|0:00.95|
|zeros_80p|ArithmeticCompress|105 MB|75.7 MB|0:35.39|
|zeros_90p|gzip|105 MB|58.7 MB|0:23.70|
|zeros_90p|bzip2|105 MB|61.2 MB|0:11.26|
|zeros_90p|pbzip2|105 MB|61.2 MB|0:00.76|
|zeros_90p|ArithmeticCompress|105 MB|49.2 MB|0:28.81|
|zeros_100p|gzip|105 MB|102 kB|0:00.87|
|zeros_100p|bzip2|105 MB|113 B|0:01.22|
|zeros_100p|pbzip2|105 MB|5.62 kB|0:00.10|
|zeros_100p|ArithmeticCompress|105 MB|1.03 kB|0:18.93|
|dna.fa|gzip|100 MB|29.2 MB|0:14.56|
|dna.fa|bzip2|100 MB|27.3 MB|0:09.49|
|dna.fa|pbzip2|100 MB|27.3 MB|0:00.69|
|dna.fa|ArithmeticCompress|100 MB|25 MB|0:21.41|
|protein.fa|gzip|100 MB|63.5 MB|0:04.67|
|protein.fa|bzip2|100 MB|59.8 MB|0:12.80|
|protein.fa|pbzip2|100 MB|59.8 MB|0:00.82|
|protein.fa|ArithmeticCompress|100 MB|58.8 MB|0:29.85|

## Compressing real data

In [71]:
from Bio import Entrez
from Bio import SeqIO
import sys

Entrez.email = "zach_lyu@berkeley.edu"
list_seq = []      # a list containing the sequences
list_name = []     # a list containing the names

# Search the nucleotide database and keep the accessions of the 10 most
# relevant hits.
handle = Entrez.esearch(db = 'nucleotide',
                        term = 'gp120 and HIV',
                        sort = 'relevance',
                        idtype = 'acc',
                        retmax = 10)
try:
    search_result = Entrez.read(handle)
finally:
    # Close the search handle; it is no longer needed once parsed.
    handle.close()

# Fetch each hit as a GenBank record and save its name and sequence.
for i in search_result['IdList']:
    handle = Entrez.efetch(db = 'nucleotide', id = i, rettype = 'gb', retmode = 'text')
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Close the fetch handle even if parsing the record fails.
        handle.close()
    list_name.append(str(record.name))
    list_seq.append(str(record.seq))

In [76]:
# Summarize the information into a dictionary mapping record name -> sequence.
dict_gp120 = dict(zip(list_name, list_seq))

# Write one FASTA entry per record: a ">name" header line followed by the
# sequence on a single line. The context manager closes the file even if a
# write fails part-way through.
with open("multi_fasta.fa", "w") as ofile:
    for name, seq in zip(list_name, list_seq):
        ofile.write(">" + name + "\n" + seq + "\n")

In [77]:
!cat multi_fasta.fa           # sanity check: print the full content of multi_fasta.fa to the cell output

>AF236860
GTTCCTGTGTGGAAAGATGCAGAGACCACCTTATTTTGTGCATCAGATGCCAAAGCACATGAGACAGAAGTGCACAATGTCTGGGCCACACATGCCTGTGTACCCACAGACCCCAACCCACAAGAAATACAACTGAAAAATGTAACAGAGAATTTTAACATGTGGAAAAATAACATGGTAGAGCAGATGCAGGAGGATGTAATCAGTTTATGGGATCAAAGTCTAAAGCCATGTGTAAAGTTAACTCCTCTCTGCGTTACTTTAAATTGTACCGATGCTACTTTGACCAATAGCACTTACATAACCAATGTCTCTAAGATAATAGGAGATATAACAGAGGAAGTAAGAAACTGTTCTTTTAATATGACCACAGAACTAAGAGATAAGAAGCAGAAGGTCCATGCACTTTTTTTATAAGCTTGATATAGTAGAAATTGAAAAGAATAGGAATGAGTATAGGTTAATAAATTGTAATACTTCGGTCATTAAGCAGGCTTGTCCAAAGATATCCTTTGATCCAATTCCTATACATTATTGTACTCCAGCTGGTTATGCGATTTTAAAGTGTAATGATAAGAATTTCAATGGGACAGGGCCATGTAAAAATGTCAGCTCAGTACAATGCACACATGGAATTAAGCCAGTGGTATCAACTCAATTGCTGTTAAATGGCAGTCTAGCAGAAGAAGAGATAATAATCAGATCTGAAAATCTCACAAACAATGCCAAAACCATAATAGTGCACCTTAATAAATCTGTAGAAATCAATTGTACCAGACCCTTCAACAATACAAGAACAAGTATAACTATAGGACCAGGACAAATGTTCTATAGAACAGGAGAGATAATAGGAGATATAACAAAAGCATATTGTGAGATTAATGGAACAAAATGGAATGAAACTTTAAAACAGGTAGCTGAAAAACTAAAAGAGCACTTTAATAATAAGACAATAGTCTTTCAACCACCCTCAGGAGGAGATCTAGAAA

In [78]:
from pathlib import Path           # compress the multi_fasta using the same methods and print out the terminal output

file = 'multi_fasta.fa'
art_file = file + '.art'
zip_file = file +'.gz'

command = !rm $zip_file
    
print("\n", "Terminal output for %s using gzip" %file)
!time gzip -k $file
print("\n", "Terminal output for %s using bzip2" %file)
!time bzip2 -k $file
print("\n", "Terminal output for %s using ArithmeticCompress" %file)
!time ArithmeticCompress $file $art_file


 Terminal output for multi_fasta.fa using gzip
0.00user 0.00system 0:00.00elapsed 100%CPU (0avgtext+0avgdata 1720maxresident)k
0inputs+8outputs (0major+97minor)pagefaults 0swaps

 Terminal output for multi_fasta.fa using bzip2
0.00user 0.00system 0:00.00elapsed 75%CPU (0avgtext+0avgdata 1608maxresident)k
0inputs+8outputs (0major+95minor)pagefaults 0swaps

 Terminal output for multi_fasta.fa using ArithmeticCompress
0.00user 0.00system 0:00.00elapsed 100%CPU (0avgtext+0avgdata 4372maxresident)k
0inputs+8outputs (0major+235minor)pagefaults 0swaps


## Real data table

|original file|command type|input file size|output file size|compression ratio|
|------|------|------|------|------|
|multi_fasta.fa|gzip|6.61 kB|1.25 kB|18.91%|
|multi_fasta.fa|bzip2|6.61 kB|1.33 kB|20.12%|
|multi_fasta.fa|ArithmeticCompress|6.61 kB|2.42 kB|36.61%|