Skip to content

Commit

Permalink
Merge branch 'master' of github.com:wscullin/collfs
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmadia committed Jul 10, 2012
2 parents 10ad53e + 6c08e8f commit f09c2e6
Show file tree
Hide file tree
Showing 2 changed files with 299 additions and 0 deletions.
250 changes: 250 additions & 0 deletions enchilada_import.py
@@ -0,0 +1,250 @@
"""
+++ Aron
This is an experimental unification of:
Asher's MPI_Import (finding/directory caching)
Will's mpiimporter (collective probing/bytecode loading)
Jed's collfs (collective .so loading)
It's bolted on to Asher's cached_import.py file,
and almost completely untested/unverified.
---
This is an initial implementation of the finder/loader discussed at:
http://mail.scipy.org/pipermail/numpy-discussion/2012-March/061160.html
This is intended to take the place of MPI_Import.py. This version has
only been tested minimally, and is being made available primarily for
testing and preliminary benchmarking.
Known issues:
- Modules loaded via the Windows registry may be incorrectly hidden by
a module of the same name in sys.path.
- If a file is added to a directory on sys.path, it won't be cached, so
there may be precedence issues. If a file disappears or its permissions
change, the import will fail.
Update (3/16/12): I've merged in a new version, simple_finder, described
below.
To use the finder, start a script off with the following:
import sys
from cached_import import finder
sys.meta_path.append(finder())
There are also variants of the finder that use MPI. The rank 0 process
builds the cache and then broadcasts it. For these, replace finder
with either pympi_finder or mpi4py_finder.
This finder works by building a cache mapping module names to
locations. The expensive parts of this process are the calls that
result in a stat. For that reason, we don't, by default, check whether
a module file is readable.
Since calls like os.path.isfile are expensive, I've added an alternate
version called simple_finder. Instead of figuring out where all of the
modules in sys.path are located, we just cache the contents of
directories on sys.path and use the standard probing algorithm for the
imports. This is much cheaper at startup and easier to maintain. It
appears to be a bit faster than the MPI-enabled finders, though that
will depend on the number of modules in sys.path as well as the number
of modules actually imported.
"""

import sys,os,imp
import mpiimporter

class finder(object):
    """Meta-path finder/loader that answers imports from a prebuilt cache.

    The cache maps fully qualified module names to ``(path, description)``
    tuples, where ``description`` has the ``(suffix, mode, type)`` layout
    produced by ``imp.get_suffixes()``.  Instances act as both finder
    (``find_module``) and loader (``load_module``).
    """

    def __init__(self, skip_checks=True, build=True):
        """Build a finder object.

        Arguments:
        - skip_checks: Don't test whether modules are readable while building
                       the cache.  This improves performance, but can cause
                       an unreadable file that looks like a Python module to
                       shadow a readable module with the same name later
                       in sys.path.
        - build: if set, build the cache now.  This is used in the
                 mpi4py_finder and pympi_finder extensions.
        """
        # Store some suffix and module description information.
        descriptions = imp.get_suffixes()
        self.skip_checks = skip_checks
        # Suffixes in order of precedence ...
        self._suffixes = [desc[0] for desc in descriptions]
        # ... and in reverse order (used when a directory is prepended).
        self._rsuffixes = self._suffixes[::-1]
        self._suffix_tuples = dict((desc[0], tuple(desc))
                                   for desc in descriptions)

        # We store the value of sys.path in _syspath so we can keep track
        # of changes.  _cache is a dictionary mapping module names to tuples
        # containing the information needed to load the module (path and
        # module description).
        if build:
            self._syspath = list(sys.path)
            self._build_cache()
        else:  # For some subclasses
            self._syspath = []
            self._cache = {}

    def _build_cache(self):
        """Traverse the stored sys.path, building (or re-building) the cache."""
        self._cache = {}
        for entry in self._syspath:
            self._process_dir(os.path.realpath(entry))

    def find_module(self, fullname, path=None):
        """Return this object if 'fullname' is cached and isn't a builtin or
        frozen module; otherwise return None.

        'path' is accepted for protocol compatibility and is not used.
        """
        # Don't override builtin/frozen modules. TODO: Windows registry?
        if (fullname not in sys.builtin_module_names and
                not imp.is_frozen(fullname) and
                fullname in self._cache):
            return self
        return None

    def load_module(self, fullname):
        """Load the module 'fullname' using its cached path.

        Returns the already-imported module when 'fullname' is present in
        sys.modules.  Raises ImportError when 'fullname' is not in the cache.
        """
        if fullname not in self._cache:
            raise ImportError("This shouldn't happen!")
        if fullname in sys.modules:
            return sys.modules[fullname]

        pathname, desc = self._cache[fullname]
        ext = os.path.splitext(pathname)[1]
        target_path = [os.path.dirname(pathname)]
        subname = fullname.split(".")[-1]

        # Regular files ending in '.so' go through the standard imp machinery;
        # every other file, and anything that is not a regular file (i.e. a
        # package directory), goes through mpiimporter.
        # (If we're loading a PY_SOURCE file, the interpreter will
        # automatically check for a compiled (.py[c|o]) file.)
        if os.path.isfile(pathname) and ext == '.so':
            loader = imp
        else:
            loader = mpiimporter

        handle, filename, stuff = loader.find_module(subname, target_path)
        try:
            mod = loader.load_module(fullname, handle, pathname, desc)
        finally:
            # Close the handle even when loading fails so it is never leaked.
            if handle:
                handle.close()

        mod.__loader__ = self  # for introspection
        return mod

    # Build up a dict of modules (including package directories) found in a
    # directory. If this directory has been prepended to the path, we need to
    # overwrite any conflicting entries in the cache. To make sure precedence
    # is correct, we'll reverse the list of suffixes when we're prepending.
    #
    # Rather than add a lot of checks here to make sure we don't stomp on a
    # builtin module, we'll just reject these in find_module
    def _process_dir(self, dir, parent=None, prepend=False, visited=None):
        """Process a directory dir, looking for valid modules.

        Arguments:
        dir -- path to a directory
        parent -- parent module, in the case where dir is a package directory
        prepend -- True if dir has just been prepended to sys.path.  In that
                   case, we'll replace existing cached entries with the same
                   module name.
        visited -- real paths of the directories on the chain leading down
                   to dir.  Used to prevent infinite recursion in the case
                   of symlink cycles in package subdirectories.  The list
                   passed in is not modified.
        """
        import stat

        # Avoid symlink cycles in a package.  Directories are compared by
        # resolved path: the joined path grows with every level of a cycle,
        # so comparing it would never detect a repeat.  Only ancestors are
        # tracked, so two sibling names that resolve to the same directory
        # are both still processed.
        real_dir = os.path.realpath(dir)
        if visited and real_dir in visited:
            return
        trail = list(visited or ()) + [real_dir]

        # All files and subdirs.  Store the name and the path.
        try:
            names = os.listdir(dir)
        except OSError:
            # Unreadable directory, so skip
            return
        contents = dict((name, os.path.join(dir, name)) for name in names)

        # If this is a possible package directory with no __init__.py, bail
        # out.  If __init__.py is there, we need to see if there's an existing
        # module by that name.
        if parent:
            if "__init__.py" not in contents:
                return
            if not (self.skip_checks or
                    os.access(contents["__init__.py"], os.R_OK)):
                return
            if parent in self._cache and not prepend:
                return
            # Okay, this is a valid, non-duplicate module.  The path is
            # stored as given (not resolved) so that its basename still
            # matches the last component of the module name.
            self._cache[parent] = (dir, ('', '', imp.PKG_DIRECTORY))

        # Split contents into files & subdirs (only stat each one once)
        files = {}
        subdirs = {}
        for entry, path in contents.items():
            try:
                mode = os.stat(path).st_mode
            except OSError:
                continue  # couldn't read!
            if stat.S_ISDIR(mode):
                bucket = subdirs
            elif stat.S_ISREG(mode):
                bucket = files
            else:
                continue
            if self.skip_checks or os.access(path, os.R_OK):
                bucket[entry] = path

        # Package directories have the highest precedence.  But when prepend
        # is True, we need to reverse the order here.  We'll do this with
        # these nested functions.
        def process_subdirs():
            for name, path in subdirs.items():
                # fully qualified name
                fqname = parent + "." + name if parent else name
                self._process_dir(path, fqname, prepend, trail)

        def process_files():
            ordered_suffixes = self._rsuffixes if prepend else self._suffixes
            for suffix in ordered_suffixes:
                for name, path in files.items():
                    # Check for matching suffix.
                    if not name.endswith(suffix):
                        continue
                    stem = name[:len(name) - len(suffix)]
                    if not stem:
                        # A file named exactly like the suffix (e.g. ".py")
                        # has no module name; nothing could ever import it.
                        continue
                    fqname = parent + "." + stem if parent else stem
                    if fqname not in self._cache or prepend:
                        self._cache[fqname] = (path,
                                               self._suffix_tuples[suffix])

        if prepend:
            process_files()
            process_subdirs()
        else:
            process_subdirs()
            process_files()

class mpi4py_finder(finder):
    """Finder that lets one MPI process do all of the initial caching.

    Rank 0 of MPI.COMM_WORLD builds the cache; the resulting sys.path
    snapshot and cache are then broadcast so every rank ends up with the
    same pair.
    """

    def __init__(self, skip_checks=True):
        from mpi4py import MPI

        world = MPI.COMM_WORLD
        # Only rank 0 walks sys.path; all other ranks start out empty and
        # receive their state from the broadcast below.
        is_root = world.Get_rank() == 0
        finder.__init__(self, skip_checks, is_root)

        shared_state = world.bcast((self._syspath, self._cache))
        self._syspath, self._cache = shared_state
49 changes: 49 additions & 0 deletions shaheen/scalability_tests.ll
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
#
# @ job_name = kslrun_job
# @ job_type = bluegene
# @ output = ./$(job_name)_$(jobid).out
# @ error = ./$(job_name)_$(jobid).err
# @ environment = COPY_ALL;
# @ wall_clock_limit = 4:00:00,4:00:00
# @ notification = always
# @ bg_size = 4096
# @ account_no = k01

# @ queue

# Runs each importer test script at several process counts and stores the
# combined stdout/stderr of every run in ${testdir}/runs.

projdir=/project/k01/pyclaw
sandbox=${projdir}/sandbox
builddir=${projdir}/opt/share
srcdir=${builddir}/sources

pythondir=${builddir}/python/2.7.2/bgp
ldpath=${pythondir}/lib
numpy_path=${builddir}/numpy/1.6.2/bgp/lib/python
nose_path=${builddir}/nose/1.1.2/bgp/lib/python
clawpack_path=${builddir}/clawpack/dev/bgp/lib/python
petsc4py_path=${builddir}/petsc4py/1.2/bgp/lib/python
mpi_python_path=${builddir}/mpi4py/1.3/bgp/lib/python

bgp_python_path=${numpy_path}:${nose_path}:${clawpack_path}:${petsc4py_path}:${mpi_python_path}

bgp_python=${pythondir}/bin/python
mpi_python=${mpi_python_path}/mpi4py/bin/python-mpi

testdir=/gpfs/scratch/aron/sandbox/import/collfs/tests

# The test scripts are referenced by relative name, so stop right away if the
# test directory is unavailable instead of launching jobs from the wrong place.
cd "${testdir}" || { echo "cannot cd to ${testdir}" >&2; exit 1; }
logdir=${testdir}/runs
mkdir -p "${logdir}" || { echo "cannot create ${logdir}" >&2; exit 1; }

# run_case NP INTERPRETER SCRIPT LABEL
#   Launch SCRIPT under INTERPRETER on NP processes; stdout and stderr both go
#   to ${logdir}/LABEL_NP.txt.
run_case() {
    local nprocs=$1 interpreter=$2 script=$3 label=$4
    mpirun -env "LD_LIBRARY_PATH=${ldpath}" -env "PYTHONPATH=${bgp_python_path}" \
        -mode VN -exp_env HOME -n "${nprocs}" "${interpreter}" "${script}" \
        &> "${logdir}/${label}_${nprocs}.txt"
}

for np in 4096 8192 16384
do
    run_case "${np}" "${bgp_python}" test_python_importer.py python
    run_case "${np}" "${mpi_python}" test_collfs_importer.py collfs
    run_case "${np}" "${bgp_python}" test_mpi4py_cached_importer.py asher
    run_case "${np}" "${mpi_python}" test_mpi4py_cached_importer.py asher_collfs
done

0 comments on commit f09c2e6

Please sign in to comment.