# problem statement
We'll start by describing the problem we want to solve. If you already know all you need to solve it, you can skip the rest of this session!

Imagine you found a nice dataset you need to analyse, but instead of consisting of a single file, or a set of conveniently named files sitting together in one directory, the files are scattered about in folders, sub-folders, and sub-sub-folders. There are tens or hundreds of them. Also, not all of the files are data files; some of them are documentation files.

You just want a list of the data files so you can iterate over it and process them all in some way. So you now need to create a Python function that takes a path to a root directory as its argument, traverses the folder tree structure, collects all the files therein that have a given filename extension, and returns a list of the files found (path + filename).

## bonus: 
Filter the data files (assumed to have the `.dat` filename ending) and return the list ordered by **decreasing file size**.

In [1]:
import os

## USER INPUT

In [2]:
# Filename ending that collect_files() matches against (compared with str.endswith).
filename_to_collect = ".dat"
# Placeholder for a user-chosen root directory; currently unused, the search
# starts from the current working directory instead.
# collect_from_directory = ""

In [3]:
# Use the current working directory as the root of the search, and show it.
start_here = os.getcwd()
print(start_here)

/Users/peterf/Documents/Python and automation/Python Workshop/python_exercises


In [4]:
def get_filesize(f_file, f_filepath):
    """Return the size in bytes of the file ``f_file`` inside ``f_filepath``.

    Args:
        f_file: Name of the file (without any directory part).
        f_filepath: Directory that contains the file, with or without a
            trailing path separator.

    Returns:
        The file size in bytes, as reported by ``os.path.getsize``.

    Raises:
        OSError: If the file does not exist or is inaccessible.
    """
    # os.walk yields directories without a trailing separator; os.path.join
    # adds one only when it is needed and uses the right one for the platform.
    return os.path.getsize(os.path.join(f_filepath, f_file))

In [5]:
def collect_files(f_collect_from, f_extension=None):
    """Walk a directory tree and record every file with a given filename ending.

    Args:
        f_collect_from: Root directory of the search; every nested
            sub-directory is visited via ``os.walk``.
        f_extension: Filename ending to match, e.g. ``".dat"``. When left as
            ``None`` the module-level ``filename_to_collect`` setting is used.

    Returns:
        A dict keyed by the full path of each matching file. Each value is a
        dict with the keys ``"directory"``, ``"filename"`` and ``"file size"``
        (size in bytes).

    Raises:
        OSError: If the size of a matching file cannot be read.
    """
    if f_extension is None:
        f_extension = filename_to_collect

    files_dict = {}
    # os.walk yields (directory, sub-directory names, file names) for each folder.
    for current_directory, _sub_directories, filenames in os.walk(f_collect_from):
        for filename in filenames:
            if not filename.endswith(f_extension):
                continue
            # Key by the real joined path: plain concatenation drops the
            # separator, so the key is not a usable path and two different
            # files can end up with the same key.
            full_path = os.path.join(current_directory, filename)
            files_dict[full_path] = {
                "directory": current_directory,
                "filename": filename,
                "file size": os.path.getsize(full_path),
            }
    return files_dict

In [6]:
def print_file_info(f_dict):
    """Print the recorded details of every file in ``f_dict``.

    ``f_dict`` maps a file identifier to a dictionary of details about that
    file. Each detail is printed on its own line as ``label : value``, and a
    ``----`` line closes each file's section.
    """
    for details in f_dict.values():
        # one line per piece of information, e.g. the file's directory
        for label, value in details.items():
            print(label, ":", value)
        # spacer between one file's details and the next
        print("----")

In [7]:
# Run collect_files on the chosen root directory and keep the resulting
# dictionary of per-file information for the reporting step.
output_dict = collect_files(start_here)

In [8]:
# Print the directory, filename and file size of every collected file.
print_file_info(output_dict)

directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data
filename : file1.dat
file size : 12
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data
filename : file2.dat
file size : 13
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_1
filename : file3.dat
file size : 65
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_2
filename : file4.dat
file size : 13
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_2/subsub_1
filename : file6.dat
file size : 13
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercis

# Appendix: proofs of concept

In [69]:
# Proof of concept: str.endswith is enough to recognise the filename ending.
test_string = "myfile.dat"

test_string.endswith(".dat")

True

In [88]:
# Proof of concept: os.path.getsize reports a file's size in bytes.
test_file = "/Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_3/file5.dat"

size_in_bytes = os.path.getsize(test_file)
print(test_file, "-----", size_in_bytes, "bytes")

/Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_3/file5.dat ----- 26 bytes


In [144]:
# Proof of concept: print the size of each file via get_filesize.
# NOTE(review): files_and_directories is not defined in any visible cell.
# From this call it presumably maps a filename to the directory that holds
# it; confirm, or re-create it before re-running this cell.
for each_file in files_and_directories:
    print(get_filesize(each_file, files_and_directories[each_file]))

12
13
65
13
13
130
26


In [11]:
# Inline version of the report loop: walk the per-file dictionaries held in
# output_dict and print each piece of information as "label : value".
for file_info in output_dict.values():
    for label, value in file_info.items():
        print(label, ":", value)
    # spacer between one file's details and the next
    print("----")

directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data
filename : file1.dat
file size : 12
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data
filename : file2.dat
file size : 13
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_1
filename : file3.dat
file size : 65
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_2
filename : file4.dat
file size : 13
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercises/01 intro and setup/exciting_data/sub_2/subsub_1
filename : file6.dat
file size : 13
----
directory : /Users/peterf/Documents/Python and automation/Python Workshop/python_exercises/exercis