diff --git a/examples/config files - basic/3 connector-ldap.yml b/examples/config files - basic/3 connector-ldap.yml index d63c53505..f7e259c6b 100755 --- a/examples/config files - basic/3 connector-ldap.yml +++ b/examples/config files - basic/3 connector-ldap.yml @@ -65,6 +65,14 @@ all_users_filter: "(&(objectClass=user)(objectCategory=person)(!(userAccountCont # or this one for OpenLDAP: "(&(|(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))" group_filter_format: "(&(|(objectCategory=group)(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))" +# (optional) string_encoding (default value given below) +# string_encoding specifies the Unicode string encoding used by the directory. +# All values retrieved from the directory are converted to Unicode before being +# sent to or compared with values on the Adobe side, to avoid encoding issues. +# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8'. +# See https://docs.python.org/2/library/codecs.html#standard-encodings for details. +#string_encoding: utf-8 + # (optional) user_identity_type_format (no default) # user_identity_type_format specifies how to construct a user's desired identity # type on the Adobe side by combining constant strings with attribute values. @@ -86,6 +94,8 @@ group_filter_format: "(&(|(objectCategory=group)(objectClass=groupOfNames)(objec # The default value used here is simple, and suitable for OpenLDAP systems. If you # are using a non-email-aware AD system, which holds the username separately # from the domain name, you may want: "{sAMAccountName}@mydomain.com" +# NOTE: for this and every format setting, the constant strings must be in +# the encoding specified by the string_encoding setting, above. 
user_email_format: "{mail}" # (optional) user_domain_format (no default value) diff --git a/examples/config files - basic/4 connector-csv.yml b/examples/config files - basic/4 connector-csv.yml index 21f6f6e23..d2db41b6c 100644 --- a/examples/config files - basic/4 connector-csv.yml +++ b/examples/config files - basic/4 connector-csv.yml @@ -20,6 +20,14 @@ # To set it to a specific value, uncomment this setting: #delimiter: "," +# (optional) string_encoding (default value given below) +# string_encoding specifies the Unicode string encoding used in the CSV file. +# All values retrieved from the file are converted to Unicode before being +# sent to or compared with values on the Adobe side, to avoid encoding issues. +# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8'. +# See https://docs.python.org/2/library/codecs.html#standard-encodings for details. +#string_encoding: utf-8 + # (optional) email_column_name (default "email") # The column name that contains the user's email address. # Values in this column must be valid, unquoted email addresses. 
diff --git a/setup.py b/setup.py index d790c7f7f..82ad80374 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ 'pycrypto', 'python-ldap==2.4.25', 'PyYAML', - 'umapi-client>=2.3', + 'umapi-client>=2.4.1', 'psutil', 'keyring' ], diff --git a/user_sync/app.py b/user_sync/app.py index fd9e73b82..e7d86fa15 100644 --- a/user_sync/app.py +++ b/user_sync/app.py @@ -29,9 +29,9 @@ import user_sync.config import user_sync.connector.directory import user_sync.connector.umapi -from user_sync.error import AssertionException import user_sync.lockfile import user_sync.rules +from user_sync.error import AssertionException from user_sync.version import __version__ as APP_VERSION LOG_STRING_FORMAT = '%(asctime)s %(process)d %(levelname)s %(name)s - %(message)s' @@ -85,6 +85,12 @@ def process_args(): "When using this option, you must also specify what you want done with Adobe-only " "users by also including --adobe-only-user-action and one of its arguments", metavar='input_path', dest='stray_list_input_path') + parser.add_argument('--config-file-encoding', + help="config files are expected to contain only ASCII characters; if you " + "use an extended character set (e.g., to specify group names), then " + "specify the encoding of your configuration files with this argument. 
" + "All encoding names understood by Python are allowed.", + dest='encoding_name', default='ascii') return parser.parse_args() @@ -137,7 +143,7 @@ def init_log(logging_config): fileHandler.setLevel(file_log_level) fileHandler.setFormatter(logging.Formatter(LOG_STRING_FORMAT, LOG_DATE_FORMAT)) logging.getLogger().addHandler(fileHandler) - if (unknown_file_log_level == True): + if unknown_file_log_level: logger.log(logging.WARNING, 'Unknown file log level: %s setting to info' % options['file_log_level']) @@ -200,6 +206,7 @@ def begin_work(config_loader): def create_config_loader(args): config_bootstrap_options = { 'main_config_filename': args.config_filename, + 'config_file_encoding': args.encoding_name, } config_loader = user_sync.config.ConfigLoader(config_bootstrap_options) return config_loader diff --git a/user_sync/config.py b/user_sync/config.py index a15fb8ced..50d3578ec 100644 --- a/user_sync/config.py +++ b/user_sync/config.py @@ -18,6 +18,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import codecs import logging import os import re @@ -41,6 +42,7 @@ def __init__(self, caller_options): self.options = options = { # these are in alphabetical order! Always add new ones that way! 
'delete_strays': False, + 'config_file_encoding': 'ascii', 'directory_connector_module_name': None, 'directory_connector_overridden_options': None, 'directory_group_filter': None, @@ -56,14 +58,15 @@ def __init__(self, caller_options): 'update_user_info': True, 'username_filter_regex': None, } - options.update(caller_options) - + options.update(caller_options) main_config_filename = options.get('main_config_filename') + config_encoding = options['config_file_encoding'] + try: + codecs.lookup(config_encoding) + except LookupError: + raise AssertionException("Unknown encoding '%s' specified with --config-file-encoding" % config_encoding) + ConfigFileLoader.config_encoding = config_encoding main_config_content = ConfigFileLoader.load_root_config(main_config_filename) - - if (not os.path.isfile(main_config_filename)): - raise AssertionException('Config file does not exist: %s' % (main_config_filename)) - self.logger = logger = logging.getLogger('config') logger.info("Using main config file: %s", main_config_filename) self.main_config = DictConfig("<%s>" % main_config_filename, main_config_content) @@ -606,6 +609,10 @@ class ConfigFileLoader: ''' Loads config files and does pathname expansion on settings that refer to files or directories ''' + # config files can contain Unicode characters, so an encoding for them + # can be specified as a command line argument. This defaults to ascii. + config_encoding = 'ascii' + # key_paths in the root configuration file that should have filename values # mapped to their value options. See load_from_yaml for the option meanings. 
ROOT_CONFIG_PATH_KEYS = {'/adobe_users/connectors/umapi': (True, True, None), @@ -680,9 +687,11 @@ def load_from_yaml(cls, filename, path_keys): cmd = filename[3:-1] try: bytes = subprocess.check_output(cmd, cwd=dir, shell=True) - yml = yaml.load(bytes) + yml = yaml.load(bytes.decode(cls.config_encoding, 'strict')) except subprocess.CalledProcessError as e: raise AssertionException("Error executing process '%s' in dir '%s': %s" % (cmd, dir, e)) + except UnicodeDecodeError as e: + raise AssertionException('Encoding error in process output: %s' % e) except yaml.error.MarkedYAMLError as e: raise AssertionException('Error parsing process YAML data: %s' % e) else: @@ -693,17 +702,20 @@ def load_from_yaml(cls, filename, path_keys): cls.filename = os.path.split(cls.filepath)[1] cls.dirpath = os.path.dirname(cls.filepath) try: - with open(filename, 'r', 1) as input_file: - yml = yaml.load(input_file) + with open(filename, 'rb', 1) as input_file: + bytes = input_file.read() + yml = yaml.load(bytes.decode(cls.config_encoding, 'strict')) except IOError as e: # if a file operation error occurred while loading the - # configuration file, swallow up the exception and re-raise this + # configuration file, swallow up the exception and re-raise it # as an configuration loader exception. - raise AssertionException('Error reading configuration file: %s' % e) + raise AssertionException("Error reading configuration file '%s': %s" % (cls.filepath, e)) + except UnicodeDecodeError as e: + # as above, but in case of encoding errors + raise AssertionException("Encoding error in configuration file '%s: %s" % (cls.filepath, e)) except yaml.error.MarkedYAMLError as e: - # same as above, but indicate this problem has to do with - # parsing the configuration file. 
- raise AssertionException('Error parsing configuration file: %s' % e) + # as above, but in case of parse errors + raise AssertionException("Error parsing configuration file '%s': %s" % (cls.filepath, e)) # process the content of the dict for path_key, options in path_keys.iteritems(): diff --git a/user_sync/connector/directory_csv.py b/user_sync/connector/directory_csv.py index 1f2804479..e37c873bc 100644 --- a/user_sync/connector/directory_csv.py +++ b/user_sync/connector/directory_csv.py @@ -56,6 +56,7 @@ def __init__(self, caller_options): caller_config = user_sync.config.DictConfig('%s configuration' % self.name, caller_options) builder = user_sync.config.OptionsBuilder(caller_config) builder.set_string_value('delimiter', None) + builder.set_string_value('string_encoding', 'utf-8') builder.set_string_value('first_name_column_name', 'firstname') builder.set_string_value('last_name_column_name', 'lastname') builder.set_string_value('email_column_name', 'email') @@ -73,6 +74,8 @@ def __init__(self, caller_options): logger.debug('%s initialized with options: %s', self.name, options) caller_config.report_unused_values(logger) + # encoding of column values + self.encoding = options['string_encoding'] # identity type for new users if not specified in column self.user_identity_type = user_sync.identity_type.parse_identity_type(options['user_identity_type']) @@ -190,7 +193,4 @@ def get_column_value(self, row, column_name): :type column_name: str ''' value = row.get(column_name) - if (value == ''): - value = None - return value - + return value.decode(self.encoding) if value else None diff --git a/user_sync/connector/directory_ldap.py b/user_sync/connector/directory_ldap.py index 219ba7db6..3fefac80a 100755 --- a/user_sync/connector/directory_ldap.py +++ b/user_sync/connector/directory_ldap.py @@ -20,7 +20,6 @@ import string -import keyring import ldap.controls.libldap import user_sync.config @@ -64,6 +63,7 @@ def __init__(self, caller_options): 
builder.set_string_value('group_filter_format', '(&(|(objectCategory=group)(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))') builder.set_string_value('all_users_filter', '(&(objectClass=user)(objectCategory=person)(!(userAccountControl:1.2.840.113556.1.4.803:=2)))') builder.set_bool_value('require_tls_cert', False) + builder.set_string_value('string_encoding', 'utf-8') builder.set_string_value('user_identity_type_format', None) builder.set_string_value('user_email_format', '{mail}') builder.set_string_value('user_username_format', None) @@ -79,6 +79,7 @@ def __init__(self, caller_options): self.logger = logger = user_sync.connector.helper.create_logger(options) logger.debug('%s initialized with options: %s', self.name, options) + LDAPValueFormatter.encoding = options['string_encoding'] self.user_identity_type = user_sync.identity_type.parse_identity_type(options['user_identity_type']) self.user_identity_type_formatter = LDAPValueFormatter(options['user_identity_type_format']) self.user_email_formatter = LDAPValueFormatter(options['user_email_format']) @@ -367,19 +368,20 @@ def iter_search_result(self, base_dn, scope, filter_string, attributes): raise class LDAPValueFormatter(object): + encoding = 'utf-8' + def __init__(self, string_format): ''' :type string_format: str - ''' - if (string_format == None): + ''' + if (string_format == None): attribute_names = [] else: formatter = string.Formatter() attribute_names = [item[1] for item in formatter.parse(string_format) if item[1]] - self.string_format = string_format self.attribute_names = attribute_names - + def get_attribute_names(self): ''' :rtype list(str) @@ -402,11 +404,11 @@ def generate_value(self, record): break values[attribute_name] = value if values is not None: - result = self.string_format.format(**values) + result = self.string_format.format(**values).decode(self.encoding) return (result, attribute_name) - @staticmethod - def get_attribute_value(attributes, attribute_name): + 
@classmethod + def get_attribute_value(cls, attributes, attribute_name): ''' :type attributes: dict :type attribute_name: str @@ -414,5 +416,5 @@ def get_attribute_value(attributes, attribute_name): if attribute_name in attributes: attribute_value = attributes[attribute_name] if (len(attribute_value) > 0): - return attribute_value[0] + return attribute_value[0].decode(cls.encoding) return None diff --git a/user_sync/connector/umapi.py b/user_sync/connector/umapi.py index 828f51404..63b666804 100644 --- a/user_sync/connector/umapi.py +++ b/user_sync/connector/umapi.py @@ -65,7 +65,7 @@ def __init__(self, name, caller_options): options['enterprise'] = enterprise_options = enterprise_builder.get_options() self.options = options self.logger = logger = helper.create_logger(options) - server_config.report_unused_values(logger) + if server_config: server_config.report_unused_values(logger) logger.debug('UMAPI initialized with options: %s', options) # set up the auth dict for umapi-client diff --git a/user_sync/helper.py b/user_sync/helper.py index 771526eba..0dc8e42be 100644 --- a/user_sync/helper.py +++ b/user_sync/helper.py @@ -37,9 +37,11 @@ def open_file(name, mode, buffering = -1): def normalize_string(string_value): ''' - :type string_value: str + Normalize a unicode or regular string + :param string_value: either a unicode or regular string or None + :return: the same type that came in ''' - return string_value.strip().lower() if string_value != None else None + return string_value.strip().lower() if string_value is not None else None def guess_delimiter_from_filename(filename): ''' diff --git a/user_sync/rules.py b/user_sync/rules.py index ef4b886f9..dbf8467e5 100644 --- a/user_sync/rules.py +++ b/user_sync/rules.py @@ -93,6 +93,7 @@ def __init__(self, caller_options): # in the secondary umapis (and exclude all that don't match). Finally, # we keep track of user keys (in any umapi) that we have updated, so # we can correctly report their count. 
+ self.adobe_user_count = 0 self.included_user_keys = set() self.excluded_user_count = 0 self.updated_user_keys = set() @@ -172,7 +173,7 @@ def log_action_summary(self, umapi_connectors): self.action_summary['directory_users_read'] = len(self.directory_user_by_user_key) self.action_summary['directory_users_selected'] = len(self.filtered_directory_user_by_user_key) # find the total number of adobe users and excluded users - self.action_summary['adobe_users_read'] = len(self.included_user_keys) + self.excluded_user_count + self.action_summary['adobe_users_read'] = self.adobe_user_count self.action_summary['adobe_users_excluded'] = self.excluded_user_count self.action_summary['adobe_users_updated'] = len(self.updated_user_keys) # find out the number of users that have no changes; this depends on whether @@ -752,6 +753,7 @@ def update_umapi_users_for_connector(self, umapi_info, umapi_connector): def is_umapi_user_excluded(self, in_primary_org, user_key, current_groups): if in_primary_org: + self.adobe_user_count += 1 # in the primary umapi, we actually check the exclusion conditions identity_type, username, domain = self.parse_user_key(user_key) if identity_type in self.exclude_identity_types: @@ -886,7 +888,7 @@ def get_user_key(self, id_type, username, domain, email=None): domain = "" elif not domain: return None - return id_type + ',' + username + ',' + domain + return unicode(id_type) + u',' + unicode(username) + u',' + unicode(domain) def parse_user_key(self, user_key): '''Returns the identity_type, username, and domain for the user.