### Fuzzy string matching in Python

[fuzzywuzzy](https://marcobonzanini.com/2015/02/25/fuzzy-string-matching-in-python/)

In [2]:
import fuzzywuzzy
from fuzzywuzzy import process

### vLookup in Python

[how to do a vlookup in Python](https://michaeljsanders.com/2017/04/17/python-vlookup.html)

- `.map()` with a dictionary
- `.merge()` with a left join

In [3]:
import numpy as np
import pandas as pd
import os
# Cap displayed frames/series at 6 rows so cell outputs stay short.
pd.options.display.max_rows = 6

In [4]:
# Location of the 'Scripts' folder. The default is the original drive-relative
# Windows path; set the CASES_SCRIPT_DIR environment variable to point the
# notebook at a different machine's folder without editing this cell.
script_dir = os.environ.get(
    'CASES_SCRIPT_DIR',
    '\\Users\\Catel\\Google Drive\\US and Canada Cases\\trademark not in report yet\\Scripts',
)

In [5]:
# Folder the client list and case workbook are read from. The default is the
# original drive-relative Windows path; set the CASES_OUTPUT_DIR environment
# variable to run the notebook against a different folder.
output_dir = os.environ.get(
    'CASES_OUTPUT_DIR',
    '\\Users\\Catel\\Google Drive\\US and Canada Cases',
)

In [49]:
# Clients on file: the lookup table used for the fuzzy name matching and for
# filling in 委托人性质 further down.
# os.path.join is used so the separator matches the one already in output_dir
# instead of appending a hand-written '/'.
clients = pd.read_excel(os.path.join(output_dir, 'clients.xlsx'))
clients

Unnamed: 0,委托人,委托人性质,委托人其他使用名
0,3COR,合作所,
1,"ADAMS GRUMBLES, LLP",合作所,
2,ALPHABET BRAND COMPANY,企业,
...,...,...,...
220,"HAROLD W. ASHENMIL, Q.C.",合作所,
221,LUXURY & LAYLA INC.,企业,
222,THOMAS & BETTS CORPORATION,企业,


In [50]:
# Cases on file; the rows whose 委托人性质 is still empty are extracted for lookup
# in the next cell.
# os.path.join is used so the separator matches the one already in output_dir
# instead of appending a hand-written '/'.
cases = pd.read_excel(os.path.join(output_dir, 'final20190103115338.xlsx'))
# Force a clean 0..n-1 row index; later cells select and update rows by these labels.
cases = cases.reset_index(drop=True)

In [51]:
# Keep only the cases whose 委托人性质 has not been filled in yet.
missing_nature_mask = cases['委托人性质'].isna()
cases_for_lookup = cases[missing_nature_mask]
cases_for_lookup

Unnamed: 0,委托人,委托人国别,本所案号,案件类型,申请人,申请人国别,商标名称,类别,商标号,立案日,委托人性质,Year,Month
7072,BCF S.E.N.C.R.L. / LLP,加拿大,UIT1806856.CA.1,变更代理人,芭拉多斯保安系统巴哈马公司,巴哈马,PARADOX FIRE,9,8510953,2018-11-26,,2018,11
7073,BCF S.E.N.C.R.L. / LLP,加拿大,UIT1806857.CA.1,变更代理人,芭拉多斯保安系统巴哈马有限公司,巴哈马,PARADOX IMPERIAL,9,6962291,2018-11-26,,2018,11
7074,BCF S.E.N.C.R.L. / LLP,加拿大,UIT1806858.CA.1,变更代理人,芭拉多斯保安系统巴哈马有限公司,巴哈马,MAMA；PARADOX,9,6424404,2018-11-26,,2018,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7513,"GREENBERG TRAURIG, LLP(BOSTON)",美国,UIT1807678.AS.1,转让,三伟达保健公司,美国,TRIPLE SENSE TECHNOLOGY及图形,10,34776257,2018-12-29,,2018,12
7514,"GREENBERG TRAURIG, LLP(BOSTON)",美国,UIT1807679.AS.1,转让,三伟达保健公司,美国,TRIPLE SENSE TECHNOLOGY及图形,35,34794844,2018-12-29,,2018,12
7515,"GREENBERG TRAURIG, LLP(BOSTON)",美国,UIT1807680.AS.1,转让,三伟达保健公司,美国,TRIPLE SENSE TECHNOLOGY及图形,42,34793658,2018-12-29,,2018,12


#### Get best match from fuzzy look up result of one of the cases

In [52]:
# Try the matcher on one hand-picked client name. Because the choices are a
# Series, the result is a (matched name, score, index label) tuple.
sample_name = 'HERSCHEL SUPPLY COMPANY LTD.'
testResult = process.extractOne(sample_name, clients['委托人'])
testResult

('HERSCHEL SUPPLY COMPANY LTD.', 100, 72)

In [53]:
# Same check, this time using the client name of the first case awaiting lookup.
first_lookup_name = cases_for_lookup['委托人'].iloc[0]
testResult = process.extractOne(first_lookup_name, clients['委托人'])
testResult

('BCF S.E.N.C.R.L. / LLP', 100, 12)

#### fuzzy lookup and best match on all cases

In [54]:
# Choices for the matcher. Alias cells that are empty (NaN) are dropped so they
# cannot be returned as a "match".
client_names = clients['委托人']
client_aliases = clients['委托人其他使用名'].dropna()


def _best_client_match(name, primary_cutoff=90):
    """Return the best (matched name, score, client index label) tuple for `name`.

    The primary client names are searched once. If the best primary score is
    above `primary_cutoff` it is returned directly. Otherwise the alias column
    is searched as well and the higher-scoring of the two candidates is
    returned (the primary match wins ties). Returns None only when there are
    no candidates at all.
    """
    primary = process.extractOne(name, client_names)
    if primary is not None and primary[1] > primary_cutoff:
        return primary

    # Fall back to the alternative names, if any are recorded.
    alias = process.extractOne(name, client_aliases) if len(client_aliases) else None
    if alias is None:
        return primary
    if primary is None or alias[1] > primary[1]:
        return alias
    return primary


# One match tuple per case, indexed like cases_for_lookup.
bestMatches = cases_for_lookup['委托人'].apply(_best_client_match)

In [55]:
# Display the per-case match tuples produced by the fuzzy lookup.
bestMatches

7072            (BCF S.E.N.C.R.L. / LLP, 100, 12)
7073            (BCF S.E.N.C.R.L. / LLP, 100, 12)
7074            (BCF S.E.N.C.R.L. / LLP, 100, 12)
                          ...                    
7513    (GREENBERG TRAURIG, LLP(BOSTON), 100, 65)
7514    (GREENBERG TRAURIG, LLP(BOSTON), 100, 65)
7515    (GREENBERG TRAURIG, LLP(BOSTON), 100, 65)
Name: 委托人, Length: 444, dtype: object

In [56]:
# Split each (matched name, score, client index label) tuple into three
# parallel lists, walking the matches once.
scores, results, indices = [], [], []
for matched_name, match_score, client_label in bestMatches:
    results.append(matched_name)
    scores.append(match_score)
    indices.append(client_label)

In [57]:
# Side-by-side review table. 'Original' is a Series, so the resulting frame
# keeps the row labels of cases_for_lookup; the three lists are attached
# positionally.
match_columns = {
    'Original': cases_for_lookup['委托人'],
    'Score': scores,
    'Result': results,
    "ClientIndex": indices,
}
df = pd.DataFrame(match_columns)
df

Unnamed: 0,Original,Score,Result,ClientIndex
7072,BCF S.E.N.C.R.L. / LLP,100,BCF S.E.N.C.R.L. / LLP,12
7073,BCF S.E.N.C.R.L. / LLP,100,BCF S.E.N.C.R.L. / LLP,12
7074,BCF S.E.N.C.R.L. / LLP,100,BCF S.E.N.C.R.L. / LLP,12
...,...,...,...,...
7513,"GREENBERG TRAURIG, LLP(BOSTON)",100,"GREENBERG TRAURIG, LLP(BOSTON)",65
7514,"GREENBERG TRAURIG, LLP(BOSTON)",100,"GREENBERG TRAURIG, LLP(BOSTON)",65
7515,"GREENBERG TRAURIG, LLP(BOSTON)",100,"GREENBERG TRAURIG, LLP(BOSTON)",65


##### compare original vs. result with matching score above or equal to 95

In [58]:
# Distinct original client names among the matches scoring 95 or higher.
df.loc[df['Score'] >= 95, 'Original'].unique()

array(['BCF S.E.N.C.R.L. / LLP', 'OYEN WIGGS GREEN & MUTALA LLP',
       'LADAS & PARRY LLP(NEW YORK)',
       'HARMAN INTERNATIONAL INDUSTRIES, INCORPORATED', 'YOUNIQUE, LLC',
       'ITT MANUFACTURING ENTERPRISES LLC', 'MCCARTER & ENGLISH, LLP',
       'SUNSTEIN KANN MURPHY & TIMBERS LLP',
       'WARNER NORCROSS & JUDD LLP ',
       'WOODARD, EMHARDT, MORIARTY, MCNETT & HENRY LLP', 'BUCHALTER',
       'OLSHAN FROME WOLOSKY LLP', 'DOWNS RACHLIN MARTIN PLLC',
       'SMART & BIGGAR', 'KLARQUIST SPARKMAN, LLP', 'ZIFF DAVIS, LLC',
       'MINERALS TECHNOLOGIES INC.', 'HAROLD W. ASHENMIL, Q.C.',
       'ICON HEALTH & FITNESS, INC.', 'YOUNG LIVING ESSENTIAL OILS',
       'FOX ROTHSCHILD LLP', 'FOLEY & LARDNER LLP (WASHINGTON)',
       'DLA PIPER LLP (US)', 'PATTERSON & SHERIDAN, LLP(HOUSTON)',
       'OSLER, HOSKIN & HARCOURT LLP',
       'PATTERSON THUENTE CHRISTENSEN PEDERSEN, P.A.(MINNEAPOLIS)',
       'FITZPATRICK, CELLA, HARPER & SCINTO',
       'KNOBBE MARTENS OLSON & BEAR LLP', 'IP

In [59]:
# Distinct matched client names among the matches scoring 95 or higher.
df.loc[df['Score'] >= 95, 'Result'].unique()

array(['BCF S.E.N.C.R.L. / LLP', 'OYEN WIGGS GREEN & MUTALA LLP',
       'LADAS & PARRY LLP(NEW YORK)',
       'HARMAN INTERNATIONAL INDUSTRIES, INCORPORATED', 'YOUNIQUE, LLC',
       'ITT MANUFACTURING ENTERPRISES LLC', 'MCCARTER & ENGLISH, LLP',
       'SUNSTEIN KANN MURPHY & TIMBERS LLP', 'WARNER NORCROSS & JUDD LLP',
       'WOODARD, EMHARDT, MORIARTY, MCNETT & HENRY LLP', 'BUCHALTER',
       'OLSHAN FROME WOLOSKY LLP', 'DOWNS RACHLIN MARTIN PLLC',
       'SMART & BIGGAR', 'KLARQUIST SPARKMAN, LLP', 'ZIFF DAVIS, LLC',
       'MINERALS TECHNOLOGIES INC.', 'HAROLD W. ASHENMIL, Q.C.',
       'ICON HEALTH & FITNESS, INC.', 'YOUNG LIVING ESSENTIAL OILS',
       'FOX ROTHSCHILD LLP', 'FOLEY & LARDNER LLP (WASHINGTON)',
       'DLA PIPER LLP (US)', 'PATTERSON & SHERIDAN, LLP(HOUSTON)',
       'OSLER, HOSKIN & HARCOURT LLP',
       'PATTERSON THUENTE PEDERSEN, P.A. (MINNEAPOLIS)',
       'FITZPATRICK, CELLA, HARPER & SCINTO',
       'KNOBBE MARTENS OLSON & BEAR LLP', 'IPR LAW GROUP',
     

In [60]:
# Write the matches scoring 95 or higher to an Excel file for manual review.
high_score_mask = df['Score'] >= 95
notLowerThan_95 = df.loc[high_score_mask]
notLowerThan_95.to_excel('notLowerThan_95.xlsx')

##### compare original vs. result with matching score lower than 95

In [43]:
# Write the matches scoring below 95 to an Excel file for review. Any client
# found missing is added to the client list (final.xlsx - 'clients' sheet);
# after adding, rerun the cells above until every client has a match.
low_score_mask = df['Score'] < 95
lowerThan_95 = df.loc[low_score_mask]
lowerThan_95.to_excel('lowerThan_95.xlsx')

- Check the `lowerThan_95.xlsx` file; if a client is not in the clients file yet, add it to `clients.xlsx`.
- Rerun the code to load the updated clients file and do the matching again.
- When there are no more records in `lowerThan_95.xlsx`, check `notLowerThan_95.xlsx` and make sure the matching results are correct.
- Continue with the following sections to update the case records.

#### Fill in 委托人性质 in cases dataframe

In [44]:
# Row labels of the cases that were missing 委托人性质; the update cells below
# use them to write back into `cases`.
cases_rows_toUpdate = cases_for_lookup.index
cases_rows_toUpdate

Int64Index([7072, 7073, 7074, 7075, 7076, 7077, 7078, 7079, 7080, 7081,
            ...
            7506, 7507, 7508, 7509, 7510, 7511, 7512, 7513, 7514, 7515],
           dtype='int64', length=444)

In [45]:
# Show the current 委托人性质 values of the rows selected for update.
cases.loc[cases_rows_toUpdate, '委托人性质']

7072    NaN
7073    NaN
7074    NaN
       ... 
7513    NaN
7514    NaN
7515    NaN
Name: 委托人性质, Length: 444, dtype: object

#### Update 委托人性质 in cases

In [61]:
# Look up 委托人性质 for every row of df through the matched client's index label.
# A label-based .loc selection returns exactly one value per row of df, in df's
# row order, which is what the positional assignment into `cases` needs. The
# default inner merge used previously would silently drop any row without a
# matching client and shift the remaining values onto the wrong cases.
updateValues委托人性质 = clients.loc[df['ClientIndex'], '委托人性质'].tolist()
updateValues委托人性质

['合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',
 '合作所',


In [62]:
# Write the looked-up values back into `cases`. The right-hand side is a plain
# list, so values are assigned by position and must be in the same order as
# cases_rows_toUpdate.
cases.loc[cases_rows_toUpdate, '委托人性质'] = updateValues委托人性质

In [63]:
# Show the same rows again to confirm 委托人性质 is now populated.
cases.loc[cases_rows_toUpdate, '委托人性质']

7072    合作所
7073    合作所
7074    合作所
       ... 
7513    合作所
7514    合作所
7515    合作所
Name: 委托人性质, Length: 444, dtype: object

In [64]:
# Verification: list any case still lacking 委托人性质 (an empty frame means all are filled in).
still_missing_mask = cases['委托人性质'].isna()
cases[still_missing_mask]

Unnamed: 0,委托人,委托人国别,本所案号,案件类型,申请人,申请人国别,商标名称,类别,商标号,立案日,委托人性质,Year,Month


#### Update 委托人 in cases

In [67]:
# Overwrite each updated case's client name with the spelling used in the client list.
# ClientIndex holds the index *labels* returned by process.extractOne, so the
# rows are selected with label-based .loc; positional .iloc only gives the same
# rows while `clients` happens to have a default 0..n-1 index.
cases.loc[cases_rows_toUpdate, '委托人'] = clients.loc[df['ClientIndex'], '委托人'].tolist()

### Save updated case file to excel

In [68]:
# Export the updated cases to the final case file (written to the current
# working directory).
# The `encoding` keyword is not passed: it was deprecated in pandas 1.5 and
# removed in 2.0, where passing it raises TypeError. The pandas docs describe
# it as only necessary for the legacy xlwt (.xls) writer, since the other
# writers support unicode natively.
cases.to_excel('final.xlsx', header=True, index=False)