In [3]:
## SETUP 
imports = ['wrds', 'pandas as pd', 'os', 're', 'pickle', 'numpy as np', 'from name_matching.name_matcher import NameMatcher',
          'from joblib import Parallel, delayed', 'from IPython.display import display, HTML, clear_output', 'random',
          'unicodedata','sys', 'from langdetect import detect, DetectorFactory']
for command in imports:
    if command.startswith('from'): exec(command)
    else: exec('import ' + command)

if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

sys.path.append('trade_data_code/2_python')
import A_helper_functions as hf
processed_linkedin = '1) data/15_revelio_data/1_inputs/b_processed_data/linkedin/'
processed_admin = '1) data/15_revelio_data/1_inputs/b_processed_data/admin/'

In [5]:
non_french_admin = (
    pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
    .loc[lambda x: ~x['admin_french'] &
    ~x['rcid'].isin(pd.read_parquet(processed_linkedin +'non_french_admin_pt1.parquet')['rcid'])
    ])

num_chunks = 500
temp_direct = os.path.join(processed_linkedin, 'temp')
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(non_french_admin['rcid'].dropna().unique(), num_chunks)
french_users = pd.read_parquet( processed_linkedin + 'all_french_users.parquet')

for index in reversed(range(num_chunks)):
    file_path = os.path.join(temp_direct, f"temp{index}.parquet")
    if not os.path.exists(file_path):
        clear_output(wait=True)
        
        print(f"{round(100 * (index + 1) / num_chunks, 2)}%")
        params = {'rcid_list': tuple(chunks[index].tolist())}
        temp = db.raw_sql(
            """
            SELECT rcid,user_id,country, weight, total_compensation
            FROM revelio.individual_positions 
            WHERE rcid IN %(rcid_list)s
            """, 
            params= params)
        temp.loc[temp['user_id'].isin(french_users['user_id']),'country' ] = 'France'
        temp = (
            temp.loc[~temp['country'].isna()]
            .assign(french = lambda x: x['country'].eq('France'),
                    comp = lambda x: x['weight'] * x['total_compensation'])
            .groupby('rcid', as_index=False)
            .apply(lambda g: pd.Series({
                  'share_comp_french': g.loc[g['french'], 'comp'].sum() / g['comp'].sum(),
                  'share_emp_french': g.loc[g['french'], 'weight'].sum() / g['weight'].sum()})).reset_index()
            .assign(role_french = lambda x: x['share_comp_french'].ge(.5) | x['share_emp_french'].ge(.5))
            )[['rcid', 'role_french']].to_parquet(file_path)


49.2%


  .apply(lambda g: pd.Series({


In [None]:
## Run Matching 
############  
### DEFINE PARAMETERS AND IMPORT DATA / Matching Function
############  
init_matches = 50; final_matches = 5; cores = max(os.cpu_count() - 5, 1);
chunks = cores*10

def matching_wrapper(index):
    #output progress 
    clear_output(wait=True)
    print(f"{round(index/chunks*100, 2)}%")
    
    temp_firms = firm_chunks[index]
    #run first version of matcher on all words 
    matches = matcher.match_names(to_be_matched=temp_firms, column_matching='company_cleaned')
    results = (pd.wide_to_long(matches,stubnames=["match_name", "score", "match_index"], i="original_name", j="match",suffix="_\d+")
               .reset_index()[['original_name', 'match_name', 'score']]
               .rename(columns={'original_name': 'company_cleaned', 'match_name': 'admin_name_cleaned', 'score': 'raw_score'})
               .merge(temp_firms[['company_cleaned', 'company_stripped']], how = 'left', on = 'company_cleaned')
               .merge(sirens_to_match[['admin_name_cleaned', 'admin_name_stripped']], how = 'left', on = 'admin_name_cleaned'))
    company_chunks = [group for _, group in results.groupby('company_cleaned')]

    ### reun the second version of matcher only on words from initial list 
    results = []
    temp_matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
    temp_matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
    for chunk in company_chunks:
        chunk = chunk.reset_index()
        try:
            temp_matcher.load_and_process_master_data(column='admin_name_stripped', df_matching_data=chunk, transform=True)
            temp_matches = temp_matcher.match_names(to_be_matched=chunk.iloc[0], column_matching='company_stripped')
            temp_results = (pd.wide_to_long(temp_matches,stubnames=["match_name", "score", "match_index"], i="original_name", j="match",suffix="_\d+")
                            .reset_index()[['original_name', 'match_name', 'score']]
                            .rename(columns={'original_name': 'company_stripped', 'match_name': 'admin_name_stripped', 'score': 'stripped_score'})
                            .drop_duplicates()
                            .merge(chunk, how = 'right')
                            .assign(match_index = lambda df: df.groupby(['stripped_score']).ngroup()))

        except Exception as e:
            print(f"Error processing company: {chunk.loc[0,'company_cleaned']} in index {index}") 
            temp_results = chunk.assign(match_index = lambda df: df.groupby(['raw_score']).ngroup())

        temp_results = (temp_results
                        .assign(match_index = lambda df: df['match_index'].max() - df['match_index'] + 1)
                        .sort_values(['match_index','raw_score'], ascending = [True, False])
                        .loc[lambda df: df['match_index'].le(final_matches)])
        results.append(temp_results)
    results = pd.concat(results, ignore_index = True)[['company_cleaned', 'admin_name_cleaned', 'raw_score', 'stripped_score', 'match_index']]
    return(results)

############  
### Check for LEI matches 
############ 
french_leis = pd.read_parquet(processed_admin +'LEI_siren_crosswalk.parquet')[['lei', 'lei_siren']].rename(columns={'lei_siren': 'firmid'})
firms_to_match = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet'))
lei_matched = (pd.merge(french_leis, firms_to_match[['rcid','lei']])
              .assign(match_method = 'lei')
              [['rcid', 'firmid', 'match_method']])

############  
### Check for exact matches on cleaned names 
############ 
firms_to_match = (firms_to_match.loc[lambda x: x['french_eligible'] & ~x['rcid'].isin(lei_matched['rcid'])]
                  .assign(count=lambda x: x.groupby('company_cleaned')['company_cleaned'].transform('count'))
                  .loc[lambda x: x['count'] == 1]
                 [['rcid','company_cleaned','company_stripped']])     

sirens_to_match = (pd.read_parquet(processed_admin + 'siren_admin.parquet')
                   .rename(columns = {'siren': 'firmid'})
                   .loc[lambda x: ~x['firmid'].isin(lei_matched['firmid'])]
                   .assign(count=lambda x: x.groupby('admin_name_cleaned')['admin_name_cleaned'].transform('count'))
                   .loc[lambda x: x['count'] == 1]
                   [['admin_name_cleaned', 'admin_name_stripped','firmid']])
clean_matched = (
    pd.merge(firms_to_match, sirens_to_match, how = 'inner',
             left_on = 'company_cleaned', right_on = 'admin_name_cleaned')
    .assign(match_method = 'cleaned')
    [['rcid','firmid', 'match_method']])


############  
### match remaining firms 
############ 
sirens_to_match = (sirens_to_match.loc[lambda x: ~x['firmid'].isin(clean_matched['firmid'])] 
                   .assign(count=lambda x: x.groupby('admin_name_stripped')['admin_name_stripped'].transform('count')))

firms_to_match = (firms_to_match
                  .loc[lambda c: ~c['rcid'].isin(clean_matched['rcid']) 
                  & c['company_stripped'].isin(sirens_to_match.loc[lambda x: x['count'].eq(1), 'admin_name_stripped'])])

firm_chunks = np.array_split(firms_to_match[['company_cleaned', 'company_stripped']], chunks)
matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
matcher.load_and_process_master_data(column='admin_name_cleaned',
                                     df_matching_data =sirens_to_match[['admin_name_cleaned', 'admin_name_stripped']],
                                     transform=True)

matching_output = Parallel(n_jobs=cores, backend='multiprocessing')(delayed(matching_wrapper)(index) for index in range(chunks))
strip_matched = (pd.concat(matching_output, ignore_index = True)
                 .loc[lambda x: x['match_index'].eq(1) & x['stripped_score'].eq(100)]
                 .assign(match_method = 'strip',
                         count=lambda x: x.groupby('company_cleaned')['company_cleaned'].transform('count'))
                 .loc[lambda x: x['count'] == 1]
                 .merge(sirens_to_match[['admin_name_cleaned', 'firmid']])
                 .merge(firms_to_match[['company_cleaned', 'rcid']])
                 [['rcid','firmid', 'match_method']])

all_matches = pd.concat([lei_matched, clean_matched, strip_matched], ignore_index = True)
final_output = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
              .merge(all_matches, how = 'left', on = 'rcid'))
final_output.to_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
    