In [1]:
## SETUP
# Explicit import statements (rather than exec-ing command strings) so that linters,
# IDEs and readers can see exactly which names this notebook depends on.
import os
import pickle
import random
import re
import sys
import unicodedata

import numpy as np
import pandas as pd
import wrds
from IPython.display import display, HTML, clear_output
from joblib import Parallel, delayed
from name_matching.name_matcher import NameMatcher

# All data paths below are relative. If the working directory does not end with
# 'Big Data', move two levels up (presumably to the project root -- confirm the layout).
if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

# Make the project's helper module importable, then load it.
sys.path.append('trade_data_code/2_python')
import A_helper_functions as hf

In [None]:
########################################################################################
# Run Matching 
########################################################################################
############  
### DEFINE PARAMETERS AND IMPORT DATA / Matching Function
############  
# init_matches: number of candidates requested from each NameMatcher pass.
# final_matches: highest match_index (rank) kept per firm after re-ranking.
init_matches = 50
final_matches = 5
# Leave 10 CPUs free, but never go below one worker: os.cpu_count() may return None,
# and on machines with <= 10 CPUs the plain subtraction gives a non-positive worker count.
cores = max(1, (os.cpu_count() or 1) - 10)
# Ten chunks per worker.
chunks = cores * 10

def matching_wrapper(index):
    """Fuzzy-match one chunk of firm names against the admin names.

    Reads the notebook-level globals ``firm_chunks``, ``matcher``, ``sirens_to_match``,
    ``chunks``, ``init_matches`` and ``final_matches``.

    Steps:
      1. Match ``company_cleaned`` of every firm in ``firm_chunks[index]`` with the shared
         ``matcher`` and reshape the wide result to one row per (firm, candidate).
      2. For each firm, re-match ``company_stripped`` against only that firm's candidates
         (``admin_name_stripped``) to obtain ``stripped_score``.
      3. Rank candidates by distinct score level (1 = highest score; ties share a rank)
         and keep ranks up to ``final_matches``. If step 2 raises for a firm, the error is
         printed and that firm is ranked by ``raw_score`` with ``stripped_score`` left NaN.

    Parameters
    ----------
    index : int
        Position of the chunk inside ``firm_chunks``.

    Returns
    -------
    pandas.DataFrame
        Columns ``company_cleaned``, ``admin_name_cleaned``, ``raw_score``,
        ``stripped_score`` and ``match_index``. Empty when the chunk has no firms.
    """
    output_columns = ['company_cleaned', 'admin_name_cleaned', 'raw_score', 'stripped_score', 'match_index']

    # output progress
    clear_output(wait=True)
    print(f"{round(index/chunks*100, 2)}%")

    temp_firms = firm_chunks[index]
    # A chunk is empty when there are fewer firms than chunks; nothing to match then.
    if len(temp_firms) == 0:
        return pd.DataFrame(columns=output_columns)

    # run first version of matcher on all words
    matches = matcher.match_names(to_be_matched=temp_firms, column_matching='company_cleaned')
    # raw string: "\d" is an invalid escape sequence in a normal string literal
    results = (pd.wide_to_long(matches, stubnames=["match_name", "score", "match_index"], i="original_name", j="match", suffix=r"_\d+")
               .reset_index()[['original_name', 'match_name', 'score']]
               .rename(columns={'original_name': 'company_cleaned', 'match_name': 'admin_name_cleaned', 'score': 'raw_score'})
               .merge(temp_firms[['company_cleaned', 'company_stripped']], how = 'left', on = 'company_cleaned')
               .merge(sirens_to_match[['admin_name_cleaned', 'admin_name_stripped']], how = 'left', on = 'admin_name_cleaned'))
    company_chunks = [group for _, group in results.groupby('company_cleaned')]

    ### rerun the second version of matcher only on words from initial list
    per_company_results = []
    temp_matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
    temp_matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
    for chunk in company_chunks:
        chunk = chunk.reset_index()
        try:
            temp_matcher.load_and_process_master_data(column='admin_name_stripped', df_matching_data=chunk, transform=True)
            temp_matches = temp_matcher.match_names(to_be_matched=chunk.iloc[0], column_matching='company_stripped')
            temp_results = (pd.wide_to_long(temp_matches, stubnames=["match_name", "score", "match_index"], i="original_name", j="match", suffix=r"_\d+")
                            .reset_index()[['original_name', 'match_name', 'score']]
                            .rename(columns={'original_name': 'company_stripped', 'match_name': 'admin_name_stripped', 'score': 'stripped_score'})
                            .drop_duplicates()
                            .merge(chunk, how = 'right')
                            .assign(match_index = lambda df: df.groupby(['stripped_score']).ngroup()))

        except Exception as e:
            # Best-effort fallback: report the failure (including its cause) and rank by raw_score.
            print(f"Error processing company: {chunk.loc[0,'company_cleaned']} in index {index}: {e!r}")
            # stripped_score is added explicitly so the final column selection cannot raise
            # KeyError when every company in this chunk ends up in the fallback.
            temp_results = chunk.assign(stripped_score = np.nan,
                                        match_index = lambda df: df.groupby(['raw_score']).ngroup())

        # ngroup numbers score levels in ascending order; flip so the best level becomes 1.
        temp_results = (temp_results
                        .assign(match_index = lambda df: df['match_index'].max() - df['match_index'] + 1)
                        .sort_values(['match_index','raw_score'], ascending = [True, False])
                        .loc[lambda df: df['match_index'].le(final_matches)])
        per_company_results.append(temp_results)

    if not per_company_results:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(per_company_results, ignore_index = True)[output_columns]

############  
### Prepare the lists of firm names / check for exact matches 
############ 
# LinkedIn firms flagged as likely French, one row per distinct cleaned name.
firms_to_match = (
    pd.read_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')
    .loc[lambda c: c['likely_french']]  # & c['data_eligible']
    .drop_duplicates(subset='company_cleaned')
    .loc[:, ['company_cleaned', 'company_stripped']]
)

# Admin names, one row per distinct cleaned name.
sirens_to_match = (
    pd.read_parquet('data/2_processed/admin/siren_admin.parquet')
    .drop_duplicates(subset='admin_name_cleaned')
    .loc[:, ['admin_name_cleaned', 'admin_name_stripped']]
)

# Exact matches on the cleaned name get the maximum score and rank 1 directly.
initial_matches = (
    firms_to_match[['company_cleaned']]
    .merge(sirens_to_match[['admin_name_cleaned']],
           how='inner',
           left_on='company_cleaned',
           right_on='admin_name_cleaned')
    .assign(raw_score=100, stripped_score=100, match_index=1)
)

############  
### match remaining firms 
############ 
# Firms still to be fuzzy-matched: no exact cleaned-name match, a non-empty stripped
# name, and a stripped name that also appears among the admin stripped names.
remaining_to_match = firms_to_match.loc[lambda c: ~c['company_cleaned'].isin(initial_matches['company_cleaned']) &
                                        c['company_stripped'].isin(sirens_to_match['admin_name_stripped']) &
                                       ~c['company_stripped'].eq("")]
# Split row positions instead of the frame itself: np.array_split on a DataFrame relies
# on the deprecated DataFrame.swapaxes. The resulting partition is the same.
firm_chunks = [remaining_to_match.iloc[positions]
               for positions in np.array_split(np.arange(len(remaining_to_match)), chunks)]
# Shared matcher over every admin cleaned name; used by matching_wrapper in the workers.
matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
matcher.load_and_process_master_data(column='admin_name_cleaned', df_matching_data=sirens_to_match, transform=True)

############  
### output results 
############ 
# Run the wrapper over every chunk in parallel worker processes.
matching_output = Parallel(n_jobs=cores, backend='multiprocessing')(
    delayed(matching_wrapper)(index) for index in range(chunks)
)
# Stack the fuzzy results, then put the exact matches in front of them.
matching_output = pd.concat(
    [initial_matches, pd.concat(matching_output, ignore_index=True)],
    ignore_index=True,
)
# Attach original names and identifiers from both sources, then count how many rows
# share each (company, match_index) pair.
matching_output = (
    matching_output
    .merge(
        pd.read_parquet('data/2_processed/admin/siren_admin.parquet')[['admin_name', 'admin_name_cleaned', 'siren']],
        how='left',
        on='admin_name_cleaned',
    )
    .merge(
        pd.read_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')[['rcid', 'company', 'company_cleaned']],
        how='left',
        on='company_cleaned',
    )
    .loc[:, ['company', 'admin_name', 'company_cleaned', 'admin_name_cleaned', 'rcid', 'siren',
             'raw_score', 'stripped_score', 'match_index']]
    .assign(match_group_size=lambda df: df.groupby(['company', 'match_index'])['match_index'].transform('size'))
)

matching_output.to_parquet('data/2_processed/admin/fuzzy_matching_output_raw.parquet')

In [63]:
########################################################################################
##### Check Performance of Matching Metrics
########################################################################################
def matching_performance_check(df, sample_size=None):
    """Interactively hand-label proposed matches, one rcid at a time.

    If ``df`` has no ``match_likelihood`` column, a random sample of rcids is drawn and
    every sampled row starts at the sentinel -10 (unlabelled). If the column exists,
    labelling resumes on a copy of ``df`` for rcids still at -10.

    For each unlabelled rcid the company name and all of its candidate admin names are
    displayed, and the user types 1, 2 or 3 (stored as -1, 0, 1 -- presumably
    bad / unsure / good; confirm the intended coding) or ``break`` to stop early.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``rcid``, ``company`` and ``admin_name``.
    sample_size : int, optional
        Number of rcids to sample. When omitted, the notebook-level global
        ``sample_size`` is used if defined, else 100. Capped at the number of
        distinct rcids so ``random.sample`` cannot raise on small inputs.

    Returns
    -------
    pandas.DataFrame
        The (sampled) frame with ``match_likelihood`` filled in; the input is not mutated.
    """
    from html import escape  # local import: names are inserted into HTML below

    if 'match_likelihood' not in df.columns:
        if sample_size is None:
            # Backward compatible with callers that set a notebook-level `sample_size`.
            sample_size = globals().get('sample_size', 100)
        rcid_pool = list(df['rcid'].unique())
        sampled_rcids = random.sample(rcid_pool, min(sample_size, len(rcid_pool)))
        df = (df[df['rcid'].isin(sampled_rcids)]
                   .assign(match_likelihood = -10))
    else:
        # Work on a copy so resuming a labelling session never edits the caller's frame.
        df = df.copy()
    for rcid in df['rcid'].unique():
        temp = df.loc[lambda c:c['rcid'].eq(rcid)]
        if max(temp['match_likelihood'])== -10:
            # Escape the names: characters such as '&' or '<' would otherwise be parsed as markup.
            display(HTML(f"<span style='font-size:20px;'>Name to match: {escape(str(temp['company'].iloc[0]))}</span>"))
            print('')
            for j in range(len(temp)):
                display(HTML(f"<span style='font-size:15px;'> {escape(str(temp['admin_name'].iloc[j]))} </span>"))
            user_input = input("best fit = ")
            while user_input not in ['1','2','3','break']:
                user_input = input("best fit = ")
            clear_output(wait=True)
            if user_input == 'break':
                break
            else:
                df.loc[lambda c: c['rcid'].eq(rcid), 'match_likelihood'] = int(user_input)-2
    return df
    
# All likely-French LinkedIn firms, with every column kept (unlike the matching cell).
firms_to_match = (
    pd.read_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')
    .loc[lambda c: c['likely_french']]  # & c['data_eligible']
)

############  
### Identify firms matched by LEI / exact matches / fuzzy matches  
############ 
# LEI route: crosswalk lei -> siren, attach the admin name, then join firms on their lei.
lei_matched = (
    pd.read_parquet('data/2_processed/admin/LEI_siren_crosswalk.parquet')
    .merge(
        pd.read_parquet('data/2_processed/admin/siren_admin.parquet')[['siren', 'admin_name']],
        left_on='lei_siren',
        right_on='siren',
    )
    .loc[:, ['lei', 'lei_country', 'siren', 'admin_name']]
    .merge(firms_to_match[['rcid', 'company', 'lei']], on='lei')
    .assign(method='lei')
)


# Load the raw fuzzy-matching output once; both selections below filter the same table.
fuzzy_raw_matches = pd.read_parquet('data/2_processed/admin/fuzzy_matching_output_raw.parquet')

# Exact match on the cleaned name (raw_score == 100), a single row in its
# (company, match_index) group, and not already matched through an LEI.
clean_matched = (
    fuzzy_raw_matches
    .loc[lambda c:
         ~c['rcid'].isin(lei_matched['rcid'])
        & c['raw_score'].eq(100)
        & c['match_group_size'].eq(1)]
    .assign(method = 'clean'))

# Exact match only on the stripped name (stripped_score == 100 but raw_score != 100),
# again a single row in its group and not LEI-matched.
strip_matched = (
    fuzzy_raw_matches
    .loc[lambda c: ~c['rcid'].isin(lei_matched['rcid'])
        & ~c['raw_score'].eq(100)
        & c['stripped_score'].eq(100)]
    .loc[lambda c: c['match_group_size'].eq(1)]
    .assign(method = 'strip'))

############  
### Test their match rates 
### all have match rates above 75% which we set as the cutoff so 
### just use all three to make the final dictionary 
############ 
# Flip to True to hand-label a random sample from each matching method.
testing_matches = False
if testing_matches:
    # Fixed seed so the same rcids are sampled on every run.
    random.seed(42)
    sample_size = 100

    # Label all three samples first ...
    lei_matched_sample = matching_performance_check(lei_matched)
    clean_matched_sample = matching_performance_check(clean_matched)
    strip_matched_sample = matching_performance_check(strip_matched)

    # ... and only then persist them.
    lei_matched_sample.to_parquet('data/2_processed/admin/lei_matching_performance.parquet')
    clean_matched_sample.to_parquet('data/2_processed/admin/clean_matching_performance.parquet')
    strip_matched_sample.to_parquet('data/2_processed/admin/strip_matching_performance.parquet')

# Final rcid -> siren dictionary: stack the three methods and drop exact duplicate rows.
dictionary_complete = (
    pd.concat([lei_matched, clean_matched, strip_matched], ignore_index=True)
    .loc[:, ['rcid', 'siren', 'method']]
    .drop_duplicates()
)
dictionary_complete.to_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')


########### 
### Generate Descriptive stats about 
### matching 
##########
# One row per likely-French firm (per match), labelled with its matching method.
# NOTE(review): the merge has no explicit `on`, so it joins on every column the two
# frames share -- confirm that this is only `rcid`.
matching_descriptives_base = (
    pd.read_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')
    .loc[lambda c: c['likely_french']]
    .merge(dictionary_complete, how='left')
    .assign(method=lambda df: df['method'].map(lambda x: "unmatched" if pd.isna(x) else x))
    .assign(
        matched=lambda df: df['method'].ne('unmatched'),
        has_lei=lambda df: df['lei'].notna(),
    )
)

# Row count and percentage share of each matching method (including "unmatched").
match_performance = (
    matching_descriptives_base
    .groupby(['method'])
    .size()
    .reset_index(name='count')
    .assign(percent=lambda df: df['count'].div(df['count'].sum()).mul(100))
)

# Means of firm characteristics, matched vs. unmatched.
mean_descriptives = (
    matching_descriptives_base
    .groupby(['matched'])
    .agg(dict.fromkeys(
        ['share_comp_french', 'emp_total', 'emp_data',
         'comp_total', 'comp_data', 'cost_per_worker', 'subsidiary',
         'year_founded', 'has_lei'],
        'mean'))
)

# Medians of the continuous characteristics, matched vs. unmatched.
median_descriptives = (
    matching_descriptives_base
    .groupby(['matched'])
    .agg(dict.fromkeys(
        ['share_comp_french', 'emp_total', 'emp_data',
         'comp_total', 'comp_data', 'cost_per_worker',
         'year_founded'],
        'median'))
)