In [None]:
# By now you should feel comfortable with automating your own functions
# However, these skills are only so helpful if you cannot apply them to the data
# and files you use on an everyday basis

# Lesson 3 will focus on "Data Wrangling" -- specifically, interacting with 
# files and spreadsheets.

In [None]:
# The first thing we will learn is how to interact with existing files and folders
# on your disk. This will compose Lesson 3A

# To do so we will need to import a few libraries in order to activate some useful
# functions


In [None]:
# Python loads with several "BUILT-IN" functions already activated. However,
# many other functions are available through other libraries, and need only to be 
# loaded. Still others can be easily accessed through simple download and installation. 
# To access existing Python modules and libraries, there is specific syntax one can 
# call upon in order to accomplish this goal

In [None]:
import subprocess
import os,sys
from glob import glob
import shutil

In [None]:
# Now that these libraries are imported, we can see the methods associated with them.
# In Jupyter you can type "os." and press TAB to browse them interactively. A bare
# "os." is not valid Python on its own though, so to keep this cell runnable we use
# dir(), which returns the same names as a list

print('os provides %s names, for example:'%len(dir(os)), dir(os)[-5:])
shutil.copy

In [None]:
# if we don't want to import an entire library, but just a single function from a library
# we can use the FROM syntax

from shutil import copy2

In [None]:
# Now copy2 is in our python namespace, and we do not need to invoke it using its library.
# Following a name with "?" is IPython/Jupyter syntax (not plain Python) that displays
# the documentation of that object

copy2?

In [None]:
# We have imported several libraries that will help you interact with your os. The functions
# within are very helpful because they allow you to perform many high and low level OS
# commands that (more or less) should work across different operating systems.

# That means that code you write with these commands should be generalizable to different
# operating systems.

# In addition, you will surely write a lot of code that will involve moving, copying, and
# renaming files. These functions will allow you to do all of that and incorporate such
# commands into For Loops.

# I'll introduce a few of these functions, but you can learn more about them and other
# functions online

# https://docs.python.org/3/library/shutil.html
# https://docs.python.org/3/library/subprocess.html#module-subprocess
# https://docs.python.org/3/library/os.html
# https://docs.python.org/3/library/os.path.html#module-os.path

In [None]:
# First, let's get the current working directory and save it into a variable. This will
# allow us to perform some operations that will (hopefully) work for each of us despite
# different directory structures across our computers, drives and operating systems.
# NOTE: the cells below keep using this cwd variable, even after the working directory
# has been changed, so run this cell before any of them

cwd = os.getcwd()
cwd

In [None]:
# we can use os.listdir() to list the contents of this directory
# (called without an argument, it lists the current working directory)
os.listdir()

# notice this lists all files, including hidden files

In [None]:
# Let's make a new directory outside of your PyCourse directory, one directory back
# Each step below uses an os function, and the comments explain what it is doing

# pull the cwd apart: pth is everything leading up to the cwd folder, and tail is the
# name of the cwd folder itself
pth = os.path.dirname(cwd)
tail = os.path.basename(cwd)
print('path is %s \n cwd is %s'%(pth,tail))

# build the full path of the new directory by joining its name onto pth
newdir_name = 'py_scrap'
to_make = os.path.join(pth,newdir_name)
print('new directory will be %s'%to_make)

# create the directory on disk
os.mkdir(to_make)

# list the contents one directory back to check that "py_scrap" is now there
# (os.pardir is the string '..')
os.listdir(os.pardir)

In [None]:
# os.path can formally test whether a path exists, and what kind of thing it is.
# Each question below is paired with the os.path function that answers it:
# exists -> anything at that path, isdir -> a directory, isfile -> a file
path_checks = [('the path %s exists?',os.path.exists),
               ('the path %s is a directory that exists?',os.path.isdir),
               ('the path %s is a file that exists?',os.path.isfile)]

# ask every question about our new directory and print the answer underneath it
for question,check in path_checks:
    print(question%(to_make))
    print(check(to_make))


In [None]:
# Now let's change our working directory into the new directory we just made.
# From here on, os.getcwd() and relative paths refer to this new directory, while the
# cwd variable still holds the directory we started in
os.chdir(to_make)

'the current working directory is now %s'%os.getcwd()

In [None]:
# In a similar fashion, we can use Python functions to move files around and other
# high level commands, particularly with the shutil library

# Let's copy a file into this new directory

# build the path of the file to copy and check that it exists. Each folder is passed
# to os.path.join as its own argument so that Python inserts the correct separator
# for your operating system, rather than us hard-coding '/'
file_path = os.path.join(cwd,'stuff','rand_file')
print(file_path)
print('%s exists?'%file_path,os.path.isfile(file_path))

# copy the file. Because the destination (to_make) is a directory, the copy keeps its
# original file name, and shutil.copy2 returns the path of the newly created file
new_pth = shutil.copy2(file_path,to_make)

# check its existence and list the contents of the directory
print(new_pth)
print('%s exists?'%new_pth,os.path.isfile(new_pth))
print('directory contents: \n', os.listdir())

# If you want to copy a full directory, use shutil.copytree instead
# If you want to move a file instead of copy it, you can use shutil.move

In [None]:
# You can use all sorts of tools to run basic command line functions, including high-level
# tasks like changing owners of files, creating archives, creating symbolic links,
# and removing files.

# Just to show some of these tools, I'll create an archive containing our new file,
# make a symlink of it, then rename and remove the symlink

# first make the directory to archive and put a copy of rand_file into it
to_rxiv = os.path.join(cwd,'jnk')
os.mkdir(to_rxiv)
new_fl = shutil.copy2(new_pth,to_rxiv) # notice how I'm using a directory as the destination. Rather than
                                       # replacing it, the file will be copied into it. But still, be careful.
print('contents of %s: \n'%to_rxiv,os.listdir(to_rxiv))

# archive it. base_name is given as a full path so that the archive is written into our
# scratch directory (to_make) no matter what the working directory is. make_archive
# returns the path of the archive it created, so we keep that instead of rebuilding the
# file name by hand. (The archive lives in to_make, NOT in cwd -- a path built from cwd
# would point at a file that does not exist, and the symlink below would be broken.)
rxiv = shutil.make_archive(base_name = os.path.join(to_make,'new_archive'),format = 'gztar', root_dir = to_rxiv)
print('listing contents of %s to see if archive was made... \n'%to_make,os.listdir(to_make))

# make a symbolic link that points at the archive
sym = os.path.join(to_make,'Im_a_symlink')
os.symlink(src = rxiv,dst = sym)
print('listing contents of %s to see if symlink was made... \n'%to_make,os.listdir(to_make))

# rename it
nsym = os.path.join(to_make,'new_link')
os.rename(sym,nsym)
print('listing contents of %s to see if symlink was renamed... \n'%to_make,os.listdir(to_make))

# now remove the symlink (this removes the link only, not the archive it points to)
os.remove(nsym)
print('listing contents of %s to see if symlink was removed... \n'%to_make,os.listdir(to_make))

In [None]:
# Okay, now that we're done with this scratch directory, let's get rid of it.

# We'll start by moving back into the pycourse directory
os.chdir(cwd)
print('the current working directory is now %s'%os.getcwd())

# Since the directory we want to delete is indeed a directory, we should use
# shutil.rmtree -- this functions very similarly to rm -r in Linux, and is much
# more intuitive than os.removedirs, which is somewhat destructive.
# rmtree deletes the directory AND everything inside it, so double check the path!

shutil.rmtree(to_make)
# to_rxiv is the 'jnk' directory that was created for the archive demonstration
shutil.rmtree(to_rxiv)

# now let's see whether the old directory still exists
print('%s is a path that exists?'%(to_make),os.path.isdir(to_make))


In [None]:
# The list of high and low level commands that can be accessed through these libraries,
# as well as the sys library, is quite comprehensive. There is much more to explore,
# though admittedly, much of the low-level stuff is probably beyond the scope of this
# course

# I will now show you three more very useful tools that I find myself using frequently 
# when moving files around 

In [None]:
# os.walk lets you travel through a directory tree, handing you variables that describe
# the directories and files it finds along the way

# Here, I will walk through the pycourse directory. On every pass of the loop:
#   root  is a string with the path of the directory currently being visited
#         (the pycourse directory itself, and then each directory inside of it)
#   dirs  is a list of strings with the names of the directories found within root
#   files is a list of strings with the names of the files found within root

# On every pass I will print root and dirs, and if root contains any files, I will
# also print the first one

for root, dirs, files in os.walk(cwd):
    if not files:
        # nothing but (possibly) subdirectories in here
        print(root,dirs)
        continue
    print(root,dirs,files[0])

# the usefulness of this might become clear during the exercises

In [None]:
# One of the more versatile functions in Python is the os.system command
# It lets you enter a command as if you were in your terminal instead of
# Python.

# This makes it extremely versatile. For example, you can run command-line based
# Freesurfer and FSL commands (among other more basic commands) inside of Python
# functions!

# While it is great for interactive coding, it's less useful for writing scripts
# or sharable code, as the commands are often system specific. Let me demonstrate:

# os.system works by simply entering the command-line command, in 'single quotes',
# as an argument.

# For example, on a Linux or Mac OS, this will list the contents of the directory
# NOTE: the value returned by os.system (and printed here) is the command's exit
# status -- usually 0 when the command succeeded -- and not the listing itself.
# The listing is written out by the command; depending on your setup it may show
# up in the terminal that launched Jupyter rather than underneath this cell

print(os.system('ls'))

# However, on Windows OS, the above will not work. Instead you would need to do this

print(os.system('dir'))

# In contrast, this will work on all three OSs:
print(os.listdir())

# Still, os.system is a powerful tool if you're used to working in the command line,
# (though its uses are not easy to demonstrate in a Jupyter Notebook.)

In [None]:
# Finally, to take this concept a step further, you may want to run a command on
# the commandline within a Python function, but also *capture the output* of this
# command in a variable, which you can continue to use in your Python environment.

# For this purpose, you can use subprocess.check_output

# The following will not work on windows OS, though you can experiment with your
# own commands!

# I will check the size of everything in the directory above this one
contents = subprocess.check_output('du -sh ../*',shell=True)
# check_output returns bytes, so decode() them into an ordinary string before printing.
# (Calling str() on bytes gives you the b'...' representation, with the tabs and line
# breaks written out as backslash-t and backslash-n, which is awkward to work with.)
contents_str = contents.decode()
print(contents_str)
print('\n')

# Now I'll iterate through this information and only print an entry if it's above 1 MB.
# Each line of du output looks like "<size><TAB><path>", so first split the text into
# lines, and then split every line ONCE at the tab. (Splitting the whole text at the
# tabs first would glue each path to the size printed on the NEXT line, and splitting
# at spaces would break any path that contains a space.)
folders = contents_str.splitlines()

for folder in folders:
    if '\t' not in folder:
        continue                         # not a "<size><TAB><path>" line, skip it
    size,fnm = folder.split('\t',1)
    size = size.strip()                  # some versions of du pad the size with spaces
    if size == '':
        continue
    # the last character of the size is its unit (K, M, G, ...)
    if size[-1] == 'M':
        if float(size[:-1]) > 1.0:
            print('%s is larger than 1.0 MB'%fnm)
    elif size[-1] in ('G','T'):
        # anything measured in gigabytes or terabytes is certainly above 1 MB
        print('%s is larger than 1.0 MB'%fnm)

# There are obviously much more useful applications for this function, but the takeaway
# is that you can save the output of command line commands into a variable!

In [None]:
### !!!!!!!!!!!WARNING!!!!!!!!!!! ###

# You should be sure to use extreme caution when using these commands, as you
# can accidentally delete or overwrite important files or folders if you're not
# careful

# For example, you could delete your whole hard drive with one short command
# something like shutil.rmtree('/'). You can also accidentally overwrite files
# with the copy, move and rename functions.

# The point is, don't do anything that involves writing new file names, or
# file removal, without being absolutely sure you know what you're doing!!!!

In [None]:
# The last tool I'll show you in this section is one that you will use constantly: glob
# Glob will collect all paths fitting a certain "search string" and compile them into
# a list.

# I will use glob to collect all files in the pycourse directory that end in the
# ".ipynb" extension. We can do this by making use of symbols such as the wildcard or
# '*' symbol:

search_str = os.path.join(cwd,'*.ipynb')
print('here is our search string: \n',search_str,'\n')
jupyters = glob(search_str)
print('and here is the output of our glob \n')
print(jupyters)

In [None]:
# This is incredibly useful anytime you want to perform operations on multiple files,
# especially when those files have similar filenames. This comes in very handy when
# working with neuroimages. We will return to this many times so don't worry if you
# don't get enough practice with it now!

In [None]:
import numpy as np
####### EXERCISES PART 1 ##############

# WARNING: For all exercises, using absolute rather than relative paths will help prevent 
# you from accidentally deleting or copying things you didn't mean to. 
# For example: a = '/Users/jakevogel/git/pycourse_vuMC/Lesson 3A.ipynb' is okay.
#              a = 'Lesson 3A.ipynb' or a = '../../Lesson 3A.ipynb' are not!
# Be very careful when completing these exercises!

## PART A
# Use os.walk to create a primitive search function. The function should take a
# "search string" as an input, and should output the parent directory of any
# files matching the search string. The function should have a second optional 
# argument where the directory to be searched can be passed. This second argument 
# should have a default input of the users current working directory
# BONUS: Add a third argument that determines whether the search should find only
# exact matches, or also partial matches


## PART B
# Draw 30 subject numbers WITHOUT replacement so that every ID is unique. (randint can
# return the same number twice, which would give two files the same new name when they
# are renamed in this exercise.)
rands = np.random.choice(np.arange(1,9999), 30, replace=False)
# subdict maps each file number (KEY, 0-29) to that subject's actual ID (VALUE)
subdict = {}
for i,r in enumerate(rands):
    subdict.update({i:'NL_BRAIN_%s'%r})
pth = os.path.join(cwd,'stuff','ex2')
# There are several (fake) PET images located in a directory stored into the variable pth.
# These files are listed sequentially from subj0 to subj29. However, these subjects have
# IDs, and the map between the file numbers and the actual IDs can be found in subdict.
# Collect all files into a list. Then, copy them into a directory you create outside of the
# pycourse directory. Next, iterate through each file and rename it so that "subjX"
# is replaced with the subject ID (VALUE) in subdict that is stored under the KEY X.

## PART C
# Now that you've renamed the subject IDs, you want to sort them into different directories according
# to these IDs, which correspond to the month the scan was taken. 0-1999 = January, 2000-3999 = February,
# 4000-5999 = March, etc. Create directories for each month and copy the appropriate files into them.
# If any of the new directories have more than 5 files in them, targzip them and remove the un-targzipped directory.



In [None]:
# Don't look below until you've tried it a few times. The answers are in the next cell
# You can always create a new cell above this one and use it as scratch space
# If you mess up the variables, you can always rerun the cell above to reset them
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#


In [None]:
######## ANSWERS TO EXERCISE 1 ##############


## PART A
# Use os.walk to create a primitive search function.
def find_file(look_for,search_in=None,match='partial'):
    ''' Search a directory tree for files whose names match a search string.

    look_for  -- string to look for in the file names
    search_in -- directory to search (all of its subdirectories are searched too).
                 If not given, the working directory at the time the function is
                 CALLED is used.
    match     -- 'partial' finds every file whose name contains look_for;
                 'exact' only finds files whose name is exactly look_for.
                 Any other value prints a warning and falls back to 'partial'.

    Prints the name and parent directory of every matching file, followed by a
    summary line. Raises TypeError if look_for or search_in is not a string.'''

    # A default written as search_in=os.getcwd() is evaluated only once, when the
    # function is defined, so it would go stale as soon as the working directory is
    # changed with os.chdir. Looking it up here keeps the default up to date.
    if search_in is None:
        search_in = os.getcwd()

    if not isinstance(look_for,str) or not isinstance(search_in,str):
        raise TypeError('inputs must be string arguments')

    if match not in ('partial','exact'):
        print('Warning: argument match passed incorrectly','\n',
                'Moving forward with partial matches')
        match = 'partial'

    count = 0
    for root, dirs, files in os.walk(search_in):
        for fl in files:
            # decide whether this file name counts as a match
            if match == 'exact':
                found = look_for == fl
            else:
                found = look_for in fl
            if found:
                print('found file %s in directory %s'%(fl,root))
                count = count+1

    if count == 0:
        print('Sorry, I could not find any files matching that search string')
    else:
        print('I found %s files matching that search string'%(count))
    
    
## PART B

# Collect all files into a list.
flz = sorted(glob(os.path.join(pth,'*')))

# Copy them into a directory you create outside of the pycourse directory.
# os.mkdir raises an error if the directory already exists, which keeps us from copying
# into (and later deleting) a folder that was already there
npth = os.path.dirname(cwd)
newdir = os.path.join(npth,'for_exercise')
os.mkdir(newdir)
for fl in flz:
    shutil.copy2(fl,newdir)
# only the LAST line of a cell is displayed automatically, so print this listing
print(os.listdir(newdir))

# Iterate through each file and rename it so that "subjX" is replaced with the subject ID
flz = sorted(glob(os.path.join(newdir,'*')))
for fl in flz:
    fl_path,fl_name = os.path.split(fl)
    to_change = fl_name.split('.')[0]         # the part before the first '.', e.g. 'subj12'
    sub_num = to_change.split('subj')[-1]     # what follows 'subj', e.g. '12'
    sid = subdict[int(sub_num)]               # the ID stored under that number
    # swap only the "subjX" part and keep whatever followed it in the original file name,
    # so the file keeps its own extension
    new_fl = os.path.join(fl_path,sid+fl_name[len(to_change):])
    os.rename(fl,new_fl)

print(os.listdir(newdir))


## PART C

# Create directories for each month and copy the appropriate files into them.
# dirs maps each month name to the [lowest, highest] subject ID that belongs to it
dirs = {'January': [0,1999],'February':[2000,3999],'March':[4000,5999],'April':[6000,7999],'May':[8000,9999]}
flz = sorted(glob(os.path.join(newdir,'*')))
for fl in flz:
    fl_pth,flnm = os.path.split(fl)
    # the ID number is what sits between the last '_' and the first '.' after it
    ID = int(flnm.split('_')[-1].split('.')[0])
    for dirnm,cutoffs in dirs.items():
        if cutoffs[0] <= ID <= cutoffs[1]:
            ndir = os.path.join(fl_pth,dirnm)
            # only create the month directory the first time it is needed
            if not os.path.isdir(ndir):
                os.mkdir(ndir)
            shutil.copy2(fl,ndir)

# If any of the new directories have more than 5 files in them, targzip them and remove the un-targzipped directory.
# Each month's path is built from its name with os.path.join. Cutting the name out of a
# path with split('/') only works on systems where '/' is the path separator.
for dirnm in sorted(dirs):
    ndir = os.path.join(newdir,dirnm)
    if not os.path.isdir(ndir):
        continue                      # no files fell into this month
    d_files = glob(os.path.join(ndir,'*'))
    dir_size = len(d_files)
    if dir_size>5:
        shutil.make_archive(base_name = os.path.join(newdir,'rxiv_%s'%(dirnm)),format = 'gztar', root_dir = ndir)
        shutil.rmtree(ndir)

# only the LAST line of a cell is displayed automatically, so print this listing
print(os.listdir(newdir))

# clean_up
shutil.rmtree(newdir)
