In [None]:
import os
import shutil
import shlex
import sys
import subprocess as sp

### Simple Iterators

* `to_upper(I)` transforms each word in a stream `I` to upper-case.
* `take(N, I)` terminates a stream `I` after `N` iterations.

In [None]:
def to_upper(I):
    """Lazily yield the upper-cased form of each item drawn from the stream `I`.

    Items are converted one at a time with their own `.upper()` method, so the
    stream is never loaded into memory as a whole.
    """
    yield from (word.upper() for word in I)
        

def take(N, I):
    """Yield at most the first `N` items of the stream `I`, then stop.

    Parameters:
        N: maximum number of items to yield.
        I: any iterable (typically another generator or an open file).

    Note:
        The limit is only detected after item number `N` (0-based) has been
        pulled from `I`, so one item beyond the last yielded one is consumed
        from the underlying stream and discarded. Upstream generators therefore
        advance one step further than the number of items yielded here.
    """
    for n, i in enumerate(I):
        if n >= N:
            break
        yield i
    # Falling off the end is what finishes a generator. The previous
    # `return StopIteration` did not raise anything; it merely made the
    # exception *class* the generator's return value, which was misleading.
        

In [None]:
# dictionary file
fp = '/usr/share/dict/words'

In [None]:
%timeit words = [x.strip() for x in to_upper(open(fp))][:10]

In [None]:
%timeit words = [x.strip() for x in take(10, to_upper(open(fp)))]

### Stream through dictionary:

The function `write_by_first_letter`:
* collects a list of words for each first-letter,
* writes collections to 'partition' files,
* yields partition key for each new first-letter.

In [None]:
def _write_partition(letter, words):
    """Write `words` to the partition file 'file_<letter>', one word per line."""
    with open('file_%s' % letter, 'w') as fh:
        fh.write('\n'.join(words))


def write_by_first_letter(I):
    """Partition a stream of words into files keyed by upper-cased first letter.

    Consecutive words sharing a first letter are buffered and written to
    'file_<LETTER>' in the current directory. The partition key is yielded as
    soon as the first word of a new letter is seen.

    Parameters:
        I: any iterable of strings (a list, a generator, an open text file, ...).
           Trailing line terminators are removed and blank entries are skipped,
           so lines read straight from a file do not produce double-spaced
           output or a partition keyed by the newline character.

    Yields:
        The upper-cased first letter, once per run of words starting with it.

    Notes:
        * A partition file is written only when the first word of the *next*
          letter arrives, or when the stream ends. A consumer that stops
          iterating early leaves the most recently yielded partition unwritten.
        * Files are opened in 'w' mode, so if a letter shows up again later in
          an unsorted stream its earlier file is overwritten.
        * An empty stream yields nothing and writes nothing.
    """
    # Normalise lazily: strip line endings, then drop entries that are empty.
    words = (w for w in (raw.rstrip('\r\n') for raw in I) if w)

    fixed_letter = None  # key of the partition currently being buffered
    buff = []
    for word in words:
        letter = word[0].upper()
        if letter == fixed_letter:
            buff.append(word)
            continue

        # A new letter starts: flush the finished partition (if any) before
        # announcing the new key, preserving the write-then-yield order.
        if buff:
            _write_partition(fixed_letter, buff)
        fixed_letter = letter
        buff = [word]
        yield fixed_letter

    # Flush whatever is left once the stream is exhausted.
    if buff:
        _write_partition(fixed_letter, buff)
            


In [None]:
# partition dictionary to files by first letter
# (the `with` block closes the dictionary file once the stream is exhausted)
with open(fp) as words_fh:
    for i in write_by_first_letter(words_fh):
        print(i, end=' ')

In [None]:
!ls file_*

In [None]:
!rm file_*

In [None]:
# partition dictionary to files by first letter, stopping after 10 letters
# (the `with` block closes the dictionary file even though the stream is
# abandoned before it is exhausted)
with open(fp) as words_fh:
    for i in take(10, write_by_first_letter(words_fh)):
        print(i, end=' ')

In [None]:
!ls file_*

In [None]:
!rm file_*

### potential issues:
* Issue 1: Each partition varies in size.
    - Can you fix the above 'partition writer', so that the buffer `buff` always has fixed size (e.g. 1000)?
    - This allows you to manage the size of the buffer.
    - Note: the files will still be of different sizes.
* Issue 2: What if the dictionary were not sorted?
    - Why is this impossible to do in a streaming fashion?
    - What approaches might you take to deal with such a situation?

## File and Process Management

A quick introduction to the following libraries:
* The `os` module for file management.
* The `subprocess` module for process management.

For each of the functions below, look up the documentation to better understand the purpose and parameters of the function!

In [None]:
# make data
import numpy as np
import pandas as pd

# Seeded generator: every run of the notebook produces the same test.csv,
# so the outputs of the cells below are reproducible.
rng = np.random.default_rng(0)

# 10000 rows x 100 columns of integers in [0, 1000)
arr = rng.integers(0, 1000, size=(10000, 100))
pd.DataFrame(arr).to_csv('test.csv', header=True, index=False)

### File management in python (`os`, `shutil` module)

In [None]:
# create a test directory, if it doesn't exist
os.makedirs('test', exist_ok=True)

# read the csv 100 rows at a time; from each chunk keep only the rows whose
# column '3' is even and write them to their own file test/chunk_<n>
# (the path is deliberately not stored in `fp`: that name holds the dictionary
# path used by the streaming cells, and overwriting it breaks them on re-run)
for fcnt, df in enumerate(pd.read_csv('test.csv', chunksize=100)):
    chunk_path = os.path.join('test', 'chunk_%d' % fcnt)
    df.loc[df['3'] % 2 == 0].to_csv(chunk_path, header=True, index=False)

In [None]:
# (recursive) copy directory test (and all files contained within)
shutil.copytree('test', 'copytest')

In [None]:
os.path.exists('copytest')

In [None]:
# depth-first traversal of everything contained in the current directory '.':
# each step gives a directory path, the names of its sub-directories and the
# names of its files
for base, dirs, files in os.walk('.'):
    print(base, dirs, files, end='\n\n')

In [None]:
shutil.rmtree('copytest')
shutil.rmtree('test')

In [None]:
os.path.exists('copytest')

### Subprocess management in python (`subprocess` module)

In [None]:
# shlex.split parses a terminal command into a list for subprocess
cmd = shlex.split('cp test.csv cptest.csv')
cmd

In [None]:
# subprocess.call runs the command in a child process. Because `cmd` is a list
# and `shell=True` is not passed, the program is executed directly -- no shell
# is involved.
# The function *waits* for the process to finish, and returns its *return code*.
# By convention a return code of 0 means success and any non-zero value means failure.
sp.call(cmd)

In [None]:
os.path.exists('cptest.csv')

In [None]:
# removing a single file with the os module
os.remove('cptest.csv')

In [None]:
# This shell command with `cut` takes the 9,11,87 columns of the csv file
cmd = shlex.split('cut -d, -f9,11,87 test.csv')
cmd

In [None]:
# The command is successful, but doesn't seem to do anything!
# This is because the command doesn't save to a file; it prints to screen ('standard out')
sp.call(cmd)

In [None]:
# to save to a file, redirect standard out to a file
# (the `with` block closes the file handle once the command has finished;
# the return code is kept so the cell still displays it)
with open('test_out.csv', 'w') as out_fh:
    return_code = sp.call(cmd, stdout=out_fh)
return_code

In [None]:
pd.read_csv('test_out.csv').head()

In [None]:
os.remove('test_out.csv')

In [None]:
# ls also prints to screen.
# Why does this command fail? (the return code is non-zero; the exact value
# depends on the platform's `ls` implementation)
sp.call(['ls', 'blah'])

In [None]:
# send the error message ('standard error') to a log file instead of the screen
# (the `with` block closes the log file once the command has finished)
with open('errlog', 'w') as err_fh:
    return_code = sp.call(['ls', 'blah'], stderr=err_fh)
return_code

In [None]:
with open('errlog') as fh:
    print(fh.read())
os.remove('errlog')

In [None]:
# Popen stands for 'process open'
# Popen is *non-blocking* -- meaning it starts the subprocess and resumes the python
# control-flow, without waiting for the process to finish.
# sp.PIPE is a buffer to read from later
proc = sp.Popen(['ls', 'blah'], stdout=sp.PIPE, stderr=sp.PIPE)

In [None]:
proc.stderr.read()

In [None]:
# This script waits 10 seconds between statements

In [None]:
! cat script.sh

In [None]:
import time

In [None]:
# Start the script without waiting for it, then look at it every 5 seconds
# over a 30 second window.
proc = sp.Popen(['./script.sh'], stdout=sp.PIPE, stderr=sp.PIPE)
for d in range(30):
    if d % 5 == 0:
        print('time = %d' % d)
        # peek() shows buffered output without consuming it, so a later
        # proc.stdout.read() still returns it.
        # NOTE(review): peek() may block while the pipe is still empty --
        # confirm against what script.sh prints and when.
        print(proc.stdout.peek())
        # poll() returns None while the process is still running,
        # otherwise its return code
        print('return code: ' + str(proc.poll()))

    time.sleep(1)

In [None]:
proc.stdout.read()

In [None]:
# Same loop, but using communicate() instead of peek().
proc = sp.Popen(['./script.sh'], stdout=sp.PIPE, stderr=sp.PIPE)
for d in range(30):
    if d % 5 == 0:
        print('time = %d' % d)
        # communicate() reads stdout/stderr until end-of-file and *waits* for
        # the process to terminate, so the first call blocks until the script
        # is done. The (stdout, stderr) tuple it returns is discarded here.
        proc.communicate()
        # the process has finished by now, so poll() reports its return code
        print('return code: ' + str(proc.poll()))

    time.sleep(1)
