# Purpose
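- To illustrate the use of fuzzy string matching with the fuzzywuzzy library.
    - Company names in one table (`db1`) are matched to their closest counterparts in a second table (`db2`), and the two tables are then merged on the matched name.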

# Links
- https://www.datacamp.com/community/tutorials/fuzzy-string-python
- https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py

In [15]:
import pandas as pd
from pandas import DataFrame, Series
from fuzzywuzzy import fuzz, process

# pandas display options: show DataFrames in full, without truncation.
# See http://pandas.pydata.org/pandas-docs/stable/options.html
pd.set_option('display.max_columns', None)  # Show all columns.
pd.set_option('display.max_rows', None)  # Show all rows.
pd.set_option('display.width', 5000)  # Wide display so rows are not wrapped early.
pd.set_option('display.multi_sparse', False)  # Display every cell (for multi-level index).
# None (not -1) means "no limit": -1 was deprecated in pandas 1.0 and is
# rejected by newer pandas versions.
pd.set_option('display.max_colwidth', None)  # Display full contents of each column.

def get_best_match_string(str_query, l_options):
    """Return the entry of ``l_options`` that best matches ``str_query``.

    Uses fuzzywuzzy's ``process.extractOne`` with two non-default settings:

    * ``scorer`` is ``fuzz.token_set_ratio`` instead of the default
      ``fuzz.WRatio`` scorer.
    * ``score_cutoff`` is set to 75, a deliberately high bar, so that a match
      must be reasonably probable. This catches query strings that are
      obviously not in the target table, rather than returning capricious
      results for them.

    Args:
        str_query: The string to look up.
        l_options: Collection of candidate strings to match against.

    Returns:
        The best-matching candidate string, or NaN when ``str_query`` is not
        a string (e.g. a missing cell) or when ``process.extractOne`` finds
        no candidate that satisfies the score cutoff.
    """
    # Plain float NaN: the same value as numpy's nan, but it does not rely on
    # numpy being imported (the original referenced `np` without importing it).
    nan = float("nan")

    # Missing values reach this function as non-strings when it is applied to
    # a DataFrame column; there is nothing meaningful to match them against.
    if not isinstance(str_query, str):
        return nan

    tup_ret = process.extractOne(str_query, l_options, scorer=fuzz.token_set_ratio, score_cutoff=75)
    if tup_ret is None:
        return nan
    # extractOne returns a tuple whose first element is the matched choice.
    return tup_ret[0]

In [12]:
# Load the two sample tables (sheets 'db1' and 'db2') from the same workbook.
df_db1, df_db2 = (
    pd.read_excel('C:/1/Example - Sample Company Name Data - Illustrate fuzzywuzzy.xlsx', sheet_name=sheet)
    for sheet in ('db1', 'db2')
)

# Distinct company names in db2: the allowed target values for matching.
l_options = df_db2['co_name_2'].unique()

# For every db1 company name, record its best fuzzy match among the db2 names.
df_db1['co_name_2_matched'] = df_db1['co_name_1'].apply(get_best_match_string, args=(l_options,))

In [14]:
# Left-join db1 onto db2 via the fuzzy-matched name, so every db1 row is kept
# even when no acceptable match was found.
df_merge = df_db1.merge(
    df_db2,
    how='left',
    left_on=['co_name_2_matched'],
    right_on=['co_name_2'],
)
df_merge


Unnamed: 0,symbol,co_name_1,co_name_2_matched,co_name_2,stock_price
0,C38U.SI,CapitaLand Mall Trust,Capita Mall Trust,Capita Mall Trust,2.36
1,O39.SI,Oversea-Chinese Banking Corporation Limited (OCBC),OCBC,OCBC,11.04
2,S58.SI,SATS Ltd.,SATS Ltd,SATS Ltd,5.05
3,G13.SI,Genting Singapore Limited,Gneting Singapore Limited,Gneting Singapore Limited,1.01
4,C52.SI,ComfortDelGro Corporation Limited,ComfortDelGro,ComfortDelGro,2.43
5,Z74.SI,Singapore Telecommunications Limited,Gneting Singapore Limited,Gneting Singapore Limited,1.01
6,D01.SI,Dairy Farm International Holdings Limited,Dairy Farm,Dairy Farm,7.76
7,C31.SI,CapitaLand Limited,,,


In [16]:
# TROUBLESHOOT MATCHING #
# List the scored candidates (up to 20) for a single query, using the same
# scorer as get_best_match_string, to see why it did or did not match.
process.extract(
    query='CapitaLand Limited',
    choices=l_options,
    scorer=fuzz.token_set_ratio,
    limit=20,
)

[('Gneting Singapore Limited', 56),
 ('CapLand', 56),
 ('Capita Mall Trust', 51),
 ('SingTel', 32),
 ('SATS Ltd', 31),
 ('Dairy Farm', 29),
 ('ComfortDelGro', 26),
 ('OCBC', 9)]