adobe-apiplatform · adobeDan · May 5, 2017 · May 5, 2017 · May 5, 2017
diff --git a/examples/config files - basic/3 connector-ldap.yml b/examples/config files - basic/3 connector-ldap.yml
@@ -65,6 +65,14 @@ all_users_filter: "(&(objectClass=user)(objectCategory=person)(!(userAccountCont
 # or this one for OpenLDAP: "(&(|(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))"
 group_filter_format: "(&(|(objectCategory=group)(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))"
 
+# (optional) string_encoding (default value given below)
+# string_encoding specifies the Unicode string encoding used by the directory.
+# All values retrieved from the directory are converted to Unicode before being
+# sent to or compared with values on the Adobe side, to avoid encoding issues.
+# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8'.
+# See https://docs.python.org/2/library/codecs.html#standard-encodings for details.
+#string_encoding: utf-8
+
 # (optional) user_identity_type_format (no default)
 # user_identity_type_format specifies how to construct a user's desired identity
 # type on the Adobe side by combining constant strings with attribute values.
@@ -86,6 +94,8 @@ group_filter_format: "(&(|(objectCategory=group)(objectClass=groupOfNames)(objec
 # The default value used here is simple, and suitable for OpenLDAP systems.  If you
 # are using a non-email-aware AD system, which holds the username separately
 # from the domain name, you may want: "{sAMAccountName}@mydomain.com"
+# NOTE: for this and every format setting, the constant strings must be in
+# the encoding specified by the string_encoding setting, above.
 user_email_format: "{mail}"
 
 # (optional) user_domain_format (no default value)

diff --git a/examples/config files - basic/4 connector-csv.yml b/examples/config files - basic/4 connector-csv.yml
@@ -20,6 +20,14 @@
 # To set it to a specific value, uncomment this setting:
 #delimiter: ","
 
+# (optional) string_encoding (default value given below)
+# string_encoding specifies the Unicode string encoding used in the CSV file.
+# All values retrieved from the file are converted to Unicode before being
+# sent to or compared with values on the Adobe side, to avoid encoding issues.
+# The value must be a Python codec name or alias, such as 'latin1' or 'utf-8'.
+# See https://docs.python.org/2/library/codecs.html#standard-encodings for details.
+#string_encoding: utf-8
+
 # (optional) email_column_name (default "email")
 # The column name that contains the user's email address.
 # Values in this column must be valid, unquoted email addresses.

diff --git a/setup.py b/setup.py
@@ -43,7 +43,7 @@
           'pycrypto',
           'python-ldap==2.4.25',
           'PyYAML',
-          'umapi-client>=2.3',
+          'umapi-client>=2.4.1',
           'psutil',
           'keyring'
       ],

diff --git a/user_sync/app.py b/user_sync/app.py
@@ -29,9 +29,9 @@
 import user_sync.config
 import user_sync.connector.directory
 import user_sync.connector.umapi
-from user_sync.error import AssertionException
 import user_sync.lockfile
 import user_sync.rules
+from user_sync.error import AssertionException
 from user_sync.version import __version__ as APP_VERSION
 
 LOG_STRING_FORMAT = '%(asctime)s %(process)d %(levelname)s %(name)s - %(message)s'
@@ -85,6 +85,12 @@ def process_args():
                              "When using this option, you must also specify what you want done with Adobe-only "
                              "users by also including --adobe-only-user-action and one of its arguments",
                         metavar='input_path', dest='stray_list_input_path')
+    parser.add_argument('--config-file-encoding',
+                        help="config files are expected to contain only ASCII characters; if you "
+                             "use an extended character set (e.g., to specify group names), then "
+                             "specify the encoding of your configuration files with this argument. "
+                             "All encoding names understood by Python are allowed.",
+                        dest='encoding_name', default='ascii')
     return parser.parse_args()
 
 
@@ -137,7 +143,7 @@ def init_log(logging_config):
         fileHandler.setLevel(file_log_level)
         fileHandler.setFormatter(logging.Formatter(LOG_STRING_FORMAT, LOG_DATE_FORMAT))
         logging.getLogger().addHandler(fileHandler)
-        if (unknown_file_log_level == True):
+        if unknown_file_log_level:
             logger.log(logging.WARNING, 'Unknown file log level: %s setting to info' % options['file_log_level'])
 
 
@@ -200,6 +206,7 @@ def begin_work(config_loader):
 def create_config_loader(args):
     config_bootstrap_options = {
         'main_config_filename': args.config_filename,
+        'config_file_encoding': args.encoding_name,
     }
     config_loader = user_sync.config.ConfigLoader(config_bootstrap_options)
     return config_loader

diff --git a/user_sync/config.py b/user_sync/config.py
@@ -18,6 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import codecs
 import logging
 import os
 import re
@@ -41,6 +42,7 @@ def __init__(self, caller_options):
         self.options = options = {
             # these are in alphabetical order!  Always add new ones that way!
             'delete_strays': False,
+            'config_file_encoding': 'ascii',
             'directory_connector_module_name': None,
             'directory_connector_overridden_options': None,
             'directory_group_filter': None,
@@ -56,14 +58,15 @@ def __init__(self, caller_options):
             'update_user_info': True,
             'username_filter_regex': None,
         }
-        options.update(caller_options)     
-
+        options.update(caller_options)
         main_config_filename = options.get('main_config_filename')
+        config_encoding = options['config_file_encoding']
+        try:
+            codecs.lookup(config_encoding)
+        except LookupError:
+            raise AssertionException("Unknown encoding '%s' specified with --config-file-encoding" % config_encoding)
+        ConfigFileLoader.config_encoding = config_encoding
         main_config_content = ConfigFileLoader.load_root_config(main_config_filename)
-
-        if (not os.path.isfile(main_config_filename)):
-            raise AssertionException('Config file does not exist: %s' % (main_config_filename))  
-
         self.logger = logger = logging.getLogger('config')
         logger.info("Using main config file: %s", main_config_filename)                
         self.main_config = DictConfig("<%s>" % main_config_filename, main_config_content)
@@ -606,6 +609,10 @@ class ConfigFileLoader:
     '''
     Loads config files and does pathname expansion on settings that refer to files or directories
     '''
+    # config files can contain Unicode characters, so an encoding for them
+    # can be specified as a command line argument.  This defaults to ascii.
+    config_encoding = 'ascii'
+
     # key_paths in the root configuration file that should have filename values
     # mapped to their value options.  See load_from_yaml for the option meanings.
     ROOT_CONFIG_PATH_KEYS = {'/adobe_users/connectors/umapi': (True, True, None),
@@ -680,9 +687,11 @@ def load_from_yaml(cls, filename, path_keys):
                 cmd = filename[3:-1]
             try:
                 bytes = subprocess.check_output(cmd, cwd=dir, shell=True)
-                yml = yaml.load(bytes)
+                yml = yaml.load(bytes.decode(cls.config_encoding, 'strict'))
             except subprocess.CalledProcessError as e:
                 raise AssertionException("Error executing process '%s' in dir '%s': %s" % (cmd, dir, e))
+            except UnicodeDecodeError as e:
+                raise AssertionException('Encoding error in process output: %s' % e)
             except yaml.error.MarkedYAMLError as e:
                 raise AssertionException('Error parsing process YAML data: %s' % e)
         else:
@@ -693,17 +702,20 @@ def load_from_yaml(cls, filename, path_keys):
             cls.filename = os.path.split(cls.filepath)[1]
             cls.dirpath = os.path.dirname(cls.filepath)
             try:
-                with open(filename, 'r', 1) as input_file:
-                    yml = yaml.load(input_file)
+                with open(filename, 'rb', 1) as input_file:
+                    bytes = input_file.read()
+                    yml = yaml.load(bytes.decode(cls.config_encoding, 'strict'))
             except IOError as e:
                 # if a file operation error occurred while loading the
-                # configuration file, swallow up the exception and re-raise this
+                # configuration file, swallow up the exception and re-raise it
                 # as an configuration loader exception.
-                raise AssertionException('Error reading configuration file: %s' % e)
+                raise AssertionException("Error reading configuration file '%s': %s" % (cls.filepath, e))
+            except UnicodeDecodeError as e:
+                # the file's bytes could not be decoded with cls.config_encoding
+                raise AssertionException("Encoding error in configuration file '%s': %s" % (cls.filepath, e))
             except yaml.error.MarkedYAMLError as e:
-                # same as above, but indicate this problem has to do with
-                # parsing the configuration file.
-                raise AssertionException('Error parsing configuration file: %s' % e)
+                # as above, but in case of parse errors
+                raise AssertionException("Error parsing configuration file '%s': %s" % (cls.filepath, e))
 
         # process the content of the dict
         for path_key, options in path_keys.iteritems():

diff --git a/user_sync/connector/directory_csv.py b/user_sync/connector/directory_csv.py
@@ -56,6 +56,7 @@ def __init__(self, caller_options):
         caller_config = user_sync.config.DictConfig('%s configuration' % self.name, caller_options)
         builder = user_sync.config.OptionsBuilder(caller_config)
         builder.set_string_value('delimiter', None)
+        builder.set_string_value('string_encoding', 'utf-8')
         builder.set_string_value('first_name_column_name', 'firstname')
         builder.set_string_value('last_name_column_name', 'lastname')
         builder.set_string_value('email_column_name', 'email')
@@ -73,6 +74,8 @@ def __init__(self, caller_options):
         logger.debug('%s initialized with options: %s', self.name, options)
         caller_config.report_unused_values(logger)
 
+        # encoding of column values
+        self.encoding = options['string_encoding']
         # identity type for new users if not specified in column
         self.user_identity_type = user_sync.identity_type.parse_identity_type(options['user_identity_type'])
 
@@ -190,7 +193,4 @@ def get_column_value(self, row, column_name):
         :type column_name: str
         '''
         value = row.get(column_name)
-        if (value == ''):
-            value = None
-        return value
-
+        return value.decode(self.encoding) if value else None
diff --git a/user_sync/connector/directory_ldap.py b/user_sync/connector/directory_ldap.py
@@ -20,7 +20,6 @@
 
 import string
 
-import keyring
 import ldap.controls.libldap
 
 import user_sync.config
@@ -64,6 +63,7 @@ def __init__(self, caller_options):
         builder.set_string_value('group_filter_format', '(&(|(objectCategory=group)(objectClass=groupOfNames)(objectClass=posixGroup))(cn={group}))')
         builder.set_string_value('all_users_filter', '(&(objectClass=user)(objectCategory=person)(!(userAccountControl:1.2.840.113556.1.4.803:=2)))')
         builder.set_bool_value('require_tls_cert', False)
+        builder.set_string_value('string_encoding', 'utf-8')
         builder.set_string_value('user_identity_type_format', None)
         builder.set_string_value('user_email_format', '{mail}')
         builder.set_string_value('user_username_format', None)
@@ -79,6 +79,7 @@ def __init__(self, caller_options):
         self.logger = logger = user_sync.connector.helper.create_logger(options)
         logger.debug('%s initialized with options: %s', self.name, options)
 
+        LDAPValueFormatter.encoding = options['string_encoding']
         self.user_identity_type = user_sync.identity_type.parse_identity_type(options['user_identity_type'])
         self.user_identity_type_formatter = LDAPValueFormatter(options['user_identity_type_format'])
         self.user_email_formatter = LDAPValueFormatter(options['user_email_format'])
@@ -367,19 +368,20 @@ def iter_search_result(self, base_dn, scope, filter_string, attributes):
             raise
 
 class LDAPValueFormatter(object):
+    encoding = 'utf-8'
+
     def __init__(self, string_format):
         '''
         :type string_format: str
-        '''        
-        if (string_format == None): 
+        '''
+        if (string_format == None):
             attribute_names = []
         else:
             formatter = string.Formatter()
             attribute_names = [item[1] for item in formatter.parse(string_format) if item[1]]
-
         self.string_format = string_format        
         self.attribute_names = attribute_names
-        
+
     def get_attribute_names(self):
         '''
         :rtype list(str)
@@ -402,17 +404,17 @@ def generate_value(self, record):
                     break
                 values[attribute_name] = value
             if values is not None:
-                result = self.string_format.format(**values)
+                result = self.string_format.format(**values).decode(self.encoding)
         return (result, attribute_name)
 
-    @staticmethod
-    def get_attribute_value(attributes, attribute_name):
+    @classmethod
+    def get_attribute_value(cls, attributes, attribute_name):
         '''
         :type attributes: dict
         :type attribute_name: str
         '''    
         if attribute_name in attributes:
             attribute_value = attributes[attribute_name]
             if (len(attribute_value) > 0):
-                return attribute_value[0]
+                return attribute_value[0].decode(cls.encoding)
         return None
diff --git a/user_sync/connector/umapi.py b/user_sync/connector/umapi.py
@@ -65,7 +65,7 @@ def __init__(self, name, caller_options):
         options['enterprise'] = enterprise_options = enterprise_builder.get_options() 
         self.options = options
         self.logger = logger = helper.create_logger(options)
-        server_config.report_unused_values(logger)
+        if server_config: server_config.report_unused_values(logger)
         logger.debug('UMAPI initialized with options: %s', options)
 
         # set up the auth dict for umapi-client

diff --git a/user_sync/helper.py b/user_sync/helper.py
@@ -37,9 +37,11 @@ def open_file(name, mode, buffering = -1):
 
 def normalize_string(string_value):
     '''
-    :type string_value: str
+    Normalize a unicode or regular string
+    :param string_value: either a unicode or regular string or None
+    :return: the same type that came in
     '''
-    return string_value.strip().lower() if string_value != None else None    
+    return string_value.strip().lower() if string_value is not None else None
 
 def guess_delimiter_from_filename(filename):
     '''

diff --git a/user_sync/rules.py b/user_sync/rules.py
@@ -93,6 +93,7 @@ def __init__(self, caller_options):
         # in the secondary umapis (and exclude all that don't match).  Finally,
         # we keep track of user keys (in any umapi) that we have updated, so
         # we can correctly report their count.
+        self.adobe_user_count = 0
         self.included_user_keys = set()
         self.excluded_user_count = 0
         self.updated_user_keys = set()
@@ -172,7 +173,7 @@ def log_action_summary(self, umapi_connectors):
         self.action_summary['directory_users_read'] = len(self.directory_user_by_user_key)
         self.action_summary['directory_users_selected'] = len(self.filtered_directory_user_by_user_key)
         # find the total number of adobe users and excluded users
-        self.action_summary['adobe_users_read'] = len(self.included_user_keys) + self.excluded_user_count
+        self.action_summary['adobe_users_read'] = self.adobe_user_count
         self.action_summary['adobe_users_excluded'] = self.excluded_user_count
         self.action_summary['adobe_users_updated'] = len(self.updated_user_keys)
         # find out the number of users that have no changes; this depends on whether
@@ -752,6 +753,7 @@ def update_umapi_users_for_connector(self, umapi_info, umapi_connector):
 
     def is_umapi_user_excluded(self, in_primary_org, user_key, current_groups):
         if in_primary_org:
+            self.adobe_user_count += 1
             # in the primary umapi, we actually check the exclusion conditions
             identity_type, username, domain = self.parse_user_key(user_key)
             if identity_type in self.exclude_identity_types:
@@ -886,7 +888,7 @@ def get_user_key(self, id_type, username, domain, email=None):
             domain = ""
         elif not domain:
             return None
-        return id_type + ',' + username + ',' + domain
+        return unicode(id_type) + u',' + unicode(username) + u',' + unicode(domain)
 
     def parse_user_key(self, user_key):
         '''Returns the identity_type, username, and domain for the user.