#### Install packages and libraries

In [None]:
!pip install Pattern
!pip install cleanco
!pip install jaro_winkler
!pip install fuzzywuzzy

In [75]:
import pandas as pd
import numpy as np
import difflib
import random
from cleanco import cleanco
import jaro
import string
from fuzzywuzzy import fuzz,process
from pattern.en import pluralize, singularize
import re
# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows=None

#### Load Data

In [4]:
# NOTE(review): the three paths below are absolute paths on one machine; the cell
# only runs where that exact folder layout exists.

# Compustat names (Stata file).
comp=pd.read_stata(r"E:\RA\University of North Carolina at Chapel Hill\Gropper, Michael - Hedge Fund Activism for Ziming\01a Raw Data\Compustat\names.dta")
# FactSet SharkRepellent export: rows 0, 1, 2 and 4 are dropped and row 3 is kept
# because it is promoted to the column header on the next line.
shark=pd.read_excel(r"E:\RA\University of North Carolina at Chapel Hill\Gropper, Michael - Hedge Fund Activism for Ziming\01a Raw Data\Factset Shark Repellent\FactSet SharkRepllent Data (Pulled 2019-11-19).xlsx").drop(index=[0,1,2,4])
shark.columns=shark.loc[3]
# Remove the header row from the data and renumber the remaining rows from 0.
shark=shark.drop(index=3).reset_index().drop('index',axis=1)
# Burning Glass (BGT) firm names (Stata file).
bg=pd.read_stata(r"E:\RA\University of North Carolina at Chapel Hill\Gropper, Michael - Hedge Fund Activism for Ziming\01b BGT Data\00_BGT_Firm_Names_revised_for_ziming.dta")
# Build one single-column frame of names per source.
# Compustat: the 'conm' column, cast to str.
c_name=pd.DataFrame(comp['conm']).astype(str)
# SharkRepellent: the column at position 7, with 'Company Name' renamed to 'conm'
# (presumably position 7 is the 'Company Name' column -- verify against the sheet).
s_name=pd.DataFrame(shark.iloc[:,7]).rename({'Company Name':'conm'}, axis=1)
# Burning Glass: only the names with at least 50 postings, and the full list.
bg_name_50=pd.DataFrame(bg.name_bgt[bg['total_postings_bgt']>=50])
bg_name_full=pd.DataFrame(bg.name_bgt)

#### Provide cleaned versions of names
1. `cleanco` processes company names, providing cleaned versions of the names by stripping away terms indicating organization type (such as "Ltd." or "Corp").
- Using a database of organization-type terms, it also provides a utility to deduce the type of organization, in terms of US/UK business entity types (i.e. "limited liability company" or "non-profit").

- Details about this package can be found at https://pypi.org/project/cleanco/

- I also convert uppercase letters to lowercase.


In [5]:
# Clean the names: lower-case each one, let cleanco strip the organization-type
# terms, and de-duplicate while keeping first-seen order (dict keys are unique
# and insertion-ordered). Each result is a dict keys view.
s_name1=dict.fromkeys(cleanco(raw.lower()).clean_name() for raw in s_name.conm).keys()
c_name1=dict.fromkeys(cleanco(raw.lower()).clean_name() for raw in c_name.conm).keys()
bg_name1=dict.fromkeys(cleanco(raw.lower()).clean_name() for raw in bg_name_50.name_bgt).keys()
bg_name1_full=dict.fromkeys(cleanco(raw.lower()).clean_name() for raw in bg_name_full.name_bgt).keys()

#### StrSimilarity Function

In [6]:
class StrSimilarity2p:
    """Token-overlap matcher that looks one query string up in a dictionary of names.

    The query is a plain string. The search dictionary maps each candidate key
    (a string) to the list of that candidate's tokens. Matching runs in four
    stages, chained together by ``main2``:
    ``Compared`` -> ``NumChecks`` -> ``MMedian`` -> ``Appear``.
    """

    def __init__(self,word):
        """Store the query string ``word`` that candidates are compared against."""
        self.word=word

    def Compared(self,str_list):
        """Count, for every candidate, how many of its tokens occur in the query.

        Args:
            str_list: dict mapping candidate key -> list of token strings.

        Returns:
            dict mapping candidate key -> ``[num, str_num]`` where ``num`` is the
            number of tokens found in the query (substring test) and ``str_num``
            is the summed length of those tokens. Only candidates with enough
            hits are kept.
        """
        query=self.word
        dict_data={}
        for name,tokens in str_list.items():
            hits=[token for token in tokens if token in query]
            num=len(hits)
            str_num=sum(len(token) for token in hits)
            # Keep the candidate only if more than half of its tokens, and all
            # but at most two of them, were found in the query.
            if num>=max(len(tokens)-2,len(tokens)/2+0.1):
                dict_data[name]=[num,str_num]
        return dict_data

    def NumChecks(self,dict_data):
        """Return the (at most) two candidates with the largest ``[num, str_num]``.

        Candidates are ranked by ``[num, str_num]`` compared as lists, i.e. by
        token hits first and matched length second.
        """
        ranked=sorted(dict_data.items(), key=lambda item:item[1], reverse=True)
        return dict(ranked[:2])

    def MMedian(self,dict_data):
        """Turn the hit counts of each candidate into a penalty.

        Args:
            dict_data: dict mapping candidate key -> ``[num, str_num]``.

        Returns:
            dict mapping candidate key -> penalty: ``-1`` when every word of the
            shorter name matched and the longer name has at most two extra
            words, ``1`` when too few words matched, otherwise a small
            non-negative value that grows with the unmatched words/characters.
        """
        query_chars=len(self.word.replace(' ',''))  # query length without spaces
        query_words=len(self.word.split())

        penalties={}
        for name,counts in dict_data.items():
            hits,hit_chars=counts[0],counts[1]
            name_words=len(name.split())
            fewer_words=min(name_words,query_words)
            more_words=max(name_words,query_words)

            # Names of up to two words must match on every word; longer names
            # may miss one word.
            if fewer_words>2:
                close_enough=abs(fewer_words-hits)<=1
            else:
                close_enough=fewer_words-hits==0

            if not close_enough:
                penalty=1
            elif hits==fewer_words and more_words-hits<=2:
                penalty=-1
            else:
                fewer_chars=min(len(name),query_chars)
                penalty=abs(hits-fewer_words)*0.1/fewer_words+abs(hit_chars-fewer_chars)*0.1/fewer_chars
            penalties[name]=penalty
        return penalties

    def Appear(self,dict_data):
        """Pick the best candidate by ``quick_ratio`` similarity minus its penalty.

        Args:
            dict_data: dict mapping candidate key -> penalty (see ``MMedian``).

        Returns:
            ``(key, score)`` of the highest-scoring candidate (the first one in
            dictionary order on ties), or ``(None, None)`` when ``dict_data`` is
            empty.
        """
        if not dict_data:
            # Nothing to choose from: report "no match" instead of raising.
            return None,None
        scores={
            name:difflib.SequenceMatcher(None, self.word, name).quick_ratio()-penalty
            for name,penalty in dict_data.items()
        }
        # max() returns the first maximal item, which is the same entry a stable
        # descending sort would place first.
        return max(scores.items(), key=lambda item:item[1])


def main2(str_query,str_list1):
    """Find the best match for ``str_query`` in the search dictionary ``str_list1``.

    Args:
        str_query: the query name as a string.
        str_list1: dict mapping candidate key -> list of token strings.

    Returns:
        ``(name_match, score)`` for the best candidate, or ``(None, None)`` when
        no candidate shares enough tokens with the query.
    """
    matcher = StrSimilarity2p(str_query)
    shortlist = matcher.NumChecks(matcher.Compared(str_list1))
    if not shortlist:
        return None,None
    # Score the shortlist once and unpack both values from that single result.
    name_match,score = matcher.Appear(matcher.MMedian(shortlist))
    return name_match,score
    

#### Function Input

The `StrSimilarity2p` matcher requires its inputs to be prepared in a specific form:

- Search Dictionary

In [17]:
# Translation table that deletes every punctuation character (built once).
_PUNCT_TABLE=str.maketrans('', '', string.punctuation)

def _build_search_dict(names):
    """Build the two lookup dictionaries for a list of cleaned names.

    Each name is stripped, has its punctuation removed, and is split on
    whitespace; every word is singularized and padded with one space on each
    side. The padded tokens joined by a space form the dictionary key.

    Returns:
        (tokens_by_key, origin_by_key): key -> list of padded tokens, and
        key -> the name the key was built from. When two names produce the same
        key, the later name overwrites the earlier one.
    """
    tokens_by_key={}
    origin_by_key={}
    for name in names:
        tokens=[' '+singularize(word)+' ' for word in name.strip().translate(_PUNCT_TABLE).split()]
        key=' '.join(tokens)
        tokens_by_key[key]=tokens
        origin_by_key[key]=name
    return tokens_by_key,origin_by_key

#1. Compustat as dictionary (names ordered from shortest to longest)
c_name1=sorted(c_name1, key=len)
dict_c,dict_c_origin=_build_search_dict(c_name1)

#2. Burning Glass as dictionary
# subset only: names with 'total_postings_bgt'>=50
bg_name1=sorted(bg_name1, key=len)
dict_bg,dict_bg_origin=_build_search_dict(bg_name1)


- Query List

In [None]:
# Seed Python's `random` module; the query lists further down are drawn with random.sample.
# NOTE(review): the sample is only repeatable if this cell is run immediately
# before the sampling cell -- re-running the sampling cell alone draws new names.
random.seed(123)

In [115]:
# For testing purposes, draw 1000 names at random from each query list and
# normalise them the same way as the search-dictionary keys.
#1. Burning Glass as query list
query_bg1=random.sample(bg_name1,1000)
query_bg=[
    ' '.join(' '+singularize(word)+' ' for word in name.strip().translate(str.maketrans('', '', string.punctuation)).split())
    for name in query_bg1
]
# normalised query -> sampled name it was built from
query_bg_origin=pd.DataFrame(query_bg1,index=query_bg).to_dict('dict')[0]

#2. Compustat as query list
query_c1=random.sample(c_name1,1000)
query_c=[
    ' '.join(' '+singularize(word)+' ' for word in name.strip().translate(str.maketrans('', '', string.punctuation)).split())
    for name in query_c1
]
# normalised query -> sampled name it was built from
query_c_origin=pd.DataFrame(query_c1,index=query_c).to_dict('dict')[0]

#### Ready to test!
1. Test the performance of `StrSimilarity`
- Find the reasonable threshold
- Estimate match accuracy

#### 1. Using `query list=Burning Glass` and `search dictionary=Compustat`

(1) Performance

In [116]:
%%time
# Match every Burning Glass query against the Compustat dictionary; best scores first.
result=[main2(query,dict_c) for query in query_bg]
df_bg2c=pd.DataFrame(result,index=query_bg,columns=['match_name','score']).sort_values('score',ascending=False)

Wall time: 25.2 s


In [99]:
# Put the original names back: matched values via dict_c_origin, query labels
# in the index via query_bg_origin.
df_bg2c=(
    df_bg2c
    .replace(dict_c_origin)
    .rename(index=query_bg_origin)
)

(2) Possible Threshold

- threshold around 0.89

In [82]:
# Random sample 1: Burning Glass queries whose best Compustat match scores above 0.86.
df_bg2c.loc[df_bg2c['score']>0.86]

Unnamed: 0,match_name,score
jack in the box,jack in the box,2.0
nanostring technologies,nanostring technologies,2.0
brp,brp,2.0
wm wrigley jr,wrigley (wm) jr,2.0
dow,dow,2.0
cardiac pacemakers,cardiac pacemakers,2.0
diversey,diversey,2.0
halliburton,halliburton,2.0
ies holdings,ies holdings,2.0
cgg,cgg,2.0


In [83]:
# Number of queries scoring above 0.86.
print(int((df_bg2c['score']>0.86).sum()))

165


In [95]:
# Random sample 2: Burning Glass queries whose best Compustat match scores above 0.86.
df_bg2c.loc[df_bg2c['score']>0.86]

Unnamed: 0,match_name,score
transamerica,transamerica,2.0
te connectivity,te connectivity,2.0
corrections corp of america,corrections corp of america,2.0
meridian bioscience,meridian bioscience,2.0
radisys,radisys,2.0
inteliquent,inteliquent,2.0
bourns,bourns,2.0
wright medical group,wright medical group,2.0
mutualfirst financial,mutualfirst financial,2.0
converse,converse,2.0


In [96]:
# Number of queries scoring above 0.86.
print(int((df_bg2c['score']>0.86).sum()))

159


In [100]:
# Random sample 3: Burning Glass queries whose best Compustat match scores above 0.86.
df_bg2c.loc[df_bg2c['score']>0.86]

Unnamed: 0,match_name,score
cox communications,cox communications,2.0
asbury automotive group,asbury automotive group,2.0
rambus,rambus,2.0
eresearchtechnology,eresearchtechnology,2.0
glaukos,glaukos,2.0
icon health & fitness,icon health & fitness,2.0
cypress semiconductor,cypress semiconductor,2.0
keystone foods,keystone foods,2.0
tibco software,tibco software,2.0
aon,aon,2.0


In [101]:
# Number of queries scoring above 0.86.
print(int((df_bg2c['score']>0.86).sum()))

146


(3) Estimated Accuracy Ratio

Based on 10 random-sample tests, the estimated accuracy ratio for `StrSimilarity` is **TODO: fill in the estimate**.

#### 2. Using `query list=Compustat` and `search dictionary=Burning Glass`

(1) Performance

In [117]:
%%time
# Match every Compustat query against the Burning Glass dictionary; best scores first.
result=[main2(query,dict_bg) for query in query_c]
df_c2bg=pd.DataFrame(result,index=query_c,columns=['match_name','score']).sort_values('score',ascending=False)

Wall time: 51.2 s


In [118]:
# Put the original names back: matched values via dict_bg_origin, query labels
# in the index via query_c_origin.
df_c2bg=(
    df_c2bg
    .replace(dict_bg_origin)
    .rename(index=query_c_origin)
)

(2) Possible Threshold

- threshold around 0.90

In [105]:
# Random sample 1: Compustat queries whose best Burning Glass match scores above 0.86.
df_c2bg.loc[df_c2bg['score']>0.86]

Unnamed: 0,match_name,score
sunopta,sunopta,2.0
integer holdings,integer holdings,2.0
crestwood equity partners,crestwood equity partners,2.0
organic,organic,2.0
lxrandco,lxrandco,2.0
american home shield,american home shield,2.0
orion industries,orion industries,2.0
guidewire software,guidewire software,2.0
boomtown,boomtown,2.0
community bank system,community bank systems,2.0


In [106]:
# Number of queries scoring above 0.86.
print(int((df_c2bg['score']>0.86).sum()))

497


In [110]:
# Random sample 2: Compustat queries whose best Burning Glass match scores above 0.86.
df_c2bg.loc[df_c2bg['score']>0.86]

Unnamed: 0,match_name,score
bioverativ,bioverativ,2.0
anchor glass container,anchor glass container,2.0
anacomp,anacomp,2.0
qcr holdings,qcr holdings,2.0
innova,innova,2.0
penske automotive group,penske automotive group,2.0
repligen,repligen,2.0
innovex,innovex,2.0
green plains,green plains,2.0
usg,usg,2.0


In [111]:
# Number of queries scoring above 0.86.
print(int((df_c2bg['score']>0.86).sum()))

456


In [119]:
# Random sample 3: Compustat queries whose best Burning Glass match scores above 0.86.
df_c2bg.loc[df_c2bg['score']>0.86]

Unnamed: 0,match_name,score
bb&t,bb&t,2.0
pharmerica,pharmerica,2.0
cigna,cigna,2.0
johns manville,johns manville,2.0
first financial service,first financial service,2.0
glaukos,glaukos,2.0
ies holdings,ies holdings,2.0
verisk analytics,verisk analytics,2.0
gatekeeper systems,gatekeeper systems,2.0
ebix,ebix,2.0


In [120]:
# Number of queries scoring above 0.86.
print(int((df_c2bg['score']>0.86).sum()))

508


(3) Estimated Accuracy Ratio

Based on 10 random-sample tests, the estimated accuracy ratio for `StrSimilarity` is **TODO: fill in the estimate**.