In [46]:
'''transform_clients() will standardize the formatting on client/firm data column'''
'''
- remove information in parentheses () that is at the end of the string
- all upper case
- remove leading and trailing space
- remove characters like '.', ','
- remove extra space in between characters 
- use '&' instead of '+' and 'and'

'''
def transform_clients(series):
    """Standardize the formatting of a client/firm name column.

    Steps, applied in order to every string in ``series``:

    1. drop a parenthesised suffix at the end of the string
       (surrounding whitespace is tolerated),
    2. convert to upper case,
    3. remove '.' and ',' characters,
    4. replace '+' and the standalone word 'AND' with '&',
    5. collapse runs of whitespace into a single space,
    6. strip leading and trailing whitespace.

    Args:
        series: pandas Series of strings (anything exposing the ``.str``
            accessor).

    Returns:
        A new Series with the normalized strings.
    """
    # Allow whitespace around the parenthetical so "ACME (US) " is still
    # recognised as ending in parentheses.
    end_parenthetical = r"\s*\(.*\)\s*$"
    # Alternation, not a character class: '[+(and)]' would replace every
    # single '+', '(', ')', 'a', 'n' and 'd'.  Upper-case 'AND' is used because
    # this runs after .str.upper(); \b keeps words such as 'BRAND' intact.
    plus_or_and = r"\+|\bAND\b"
    return (series
            # regex=True is explicit because pandas >= 2.0 defaults to
            # literal (non-regex) replacement.
            .str.replace(end_parenthetical, "", regex=True)
            .str.upper()
            .str.replace(r"[.,]", "", regex=True)
            .str.replace(plus_or_and, "&", regex=True)
            .str.replace(r"\s+", " ", regex=True)
            # Strip last: removing punctuation can expose new edge whitespace.
            .str.strip())

In [47]:
'''findApproximateMatch() will find the best match in one data frame for each value in a column of another data frame'''
'''
- value_df and value_col is for the data that wants to find match
- options_df and options_col is where to look for matches
- excludeSelf defaults to False; when True, options equal to the value itself are excluded so that the best match is a value other than itself.
'''

def findApproximateMatch(value_df, value_col, options_df, options_col, excludeSelf=False):
    """Find the best fuzzy match in ``options_df`` for each value of ``value_df``.

    Args:
        value_df: data frame holding the values to look up.
        value_col: column of ``value_df`` whose values are matched.
        options_df: data frame holding the candidate matches.
        options_col: column of ``options_df`` to search in.
        excludeSelf: when True, candidates equal to the current value are
            dropped before matching, so the best match is a different value.

    Returns:
        A list with one ``process.extractOne`` result per value, in the order
        the values appear in ``value_df[value_col]``.
    """
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process

    def candidates_for(current):
        # Without excludeSelf the whole options column is searched.
        if not excludeSelf:
            return options_df[options_col]
        return options_df.loc[options_df[options_col] != current, options_col]

    return [
        process.extractOne(current, candidates_for(current), scorer=fuzz.token_sort_ratio)
        for current in value_df[value_col]
    ]

In [54]:
'''transform_addresses() will standardize the formatting on address data column'''
'''
- all upper case
- remove leading and trailing space
- remove extra space in between characters 

'''
def transform_addresses(col):
    """Standardize the formatting of an address column.

    Strips leading/trailing whitespace, upper-cases the text and collapses
    every run of two or more whitespace characters into a single space.

    Args:
        col: pandas Series of strings (anything exposing the ``.str``
            accessor).

    Returns:
        A new Series with the normalized strings.
    """
    return (col
            .str.strip()
            .str.upper()
            # regex=True is explicit because pandas >= 2.0 defaults to
            # literal replacement, which would leave the pattern unmatched.
            .str.replace(r'\s{2,}', ' ', regex=True))

In [2]:
'''
- remove leading and trailing space
- remove extra space in between characters 
- remove special characters at the end of string
- replace any remaining character that is not a letter, digit, underscore or '-' with '-'
'''
def clean_zip_cols(col):
    """Clean up a column of zip/postal code strings.

    Steps, applied in order:

    1. strip leading and trailing whitespace,
    2. convert to upper case,
    3. drop one trailing non-word character, if present,
    4. replace every remaining character that is neither a word character
       nor '-' (for example an inner space) with '-'.

    Args:
        col: pandas Series of strings (anything exposing the ``.str``
            accessor).

    Returns:
        A new Series with the cleaned strings.
    """
    return (col.str.strip()
               .str.upper()
               # regex=True is explicit because pandas >= 2.0 defaults to
               # literal replacement, which would never match these patterns.
               .str.replace(r"\W$", '', regex=True)
               .str.replace(r"[^\w-]", "-", regex=True))

In [None]:
'''
convert zip codes to str and left-pad each with leading 0s to a minimum of 5 characters
'''
def fillzip_leading_0(col):
    """Return ``col`` as strings, left-padded with '0' to at least 5 characters.

    Values already 5 or more characters long are returned unchanged apart from
    the conversion to ``str``.
    """
    as_text = col.astype(str)
    # Right-justifying is the same operation as padding on the left side.
    return as_text.str.rjust(5, fillchar='0')

In [None]:
'''
Check, row-wise, whether a value occurs in any of the selected columns of a dataframe,
and return the ***check result*** as boolean values
'''
def filter_rows_with_val(df, cols_to_check, val):
    """Report, row by row, whether ``val`` occurs in any of the given columns.

    Args:
        df: data frame to inspect.
        cols_to_check: iterable of column names whose strings are searched.
        val: pattern handed to ``Series.str.contains``.

    Returns:
        The OR-combination of the per-column ``str.contains`` results, or the
        plain ``False`` when ``cols_to_check`` is empty.
    """
    per_column_hits = (df[name].str.contains(val) for name in cols_to_check)
    found = False
    for hits in per_column_hits:
        # A row qualifies as soon as one column contains the pattern.
        found = found | hits
    return found

In [2]:
'''
Check, row-wise, whether a value occurs in any (or all) of the selected columns of a dataframe,
and return the ***filtered df***
'''
# re_val_in_capture_group = False --> if regex value is not to capture a group, str.contains() can apply
# if regex captures group, use str.extract() to capture matches and then check if isna()

def filter_rows_with_val2(df, cols_to_check, val, anycol=True, val_in_re_capture_group=False):
    """Return the rows of ``df`` whose selected columns match ``val``.

    Args:
        df: data frame to filter.
        cols_to_check: iterable of column names whose strings are searched.
        val: regular expression to look for.
        anycol: when True (default) a row is kept if at least one column
            matches; when False every column must match.
        val_in_re_capture_group: set to True when ``val`` contains capture
            groups.  Matching then goes through ``Series.str.extract`` and a
            cell counts as a match when at least one group captured
            something; otherwise ``Series.str.contains`` is used.

    Returns:
        A new data frame holding only the matching rows.  With an empty
        ``cols_to_check`` the result is empty for ``anycol=True`` and a copy
        of ``df`` for ``anycol=False``.
    """
    def column_mask(column):
        if val_in_re_capture_group:
            # extract() yields one result column per capture group, NaN where
            # nothing was captured; reduce it to one boolean per row.
            return column.str.extract(val, expand=True).notna().any(axis=1)
        # na=False: missing cells count as "no match" rather than NaN.
        return column.str.contains(val, na=False)

    masks = [column_mask(df[col]) for col in cols_to_check]
    if not masks:
        # "any" over no columns is never satisfied; "all" is vacuously true.
        return df.iloc[:0] if anycol else df.copy()

    combined = masks[0]
    for mask in masks[1:]:
        combined = (combined | mask) if anycol else (combined & mask)
    return df[combined]