In [None]:
## SETUP
# Plain import statements replace the former exec()-driven import loop: the same
# names are bound, duplicates are gone, and no code is assembled from strings.
import glob
import itertools
import math
import os
import pickle
import re
import shutil
import sys
import unicodedata

import numpy as np
import pandas as pd
import wrds
from IPython.display import display, HTML, clear_output
from joblib import Parallel, delayed
from langdetect import detect, DetectorFactory
from name_matching.name_matcher import NameMatcher

# Move two levels up unless the working directory already ends in 'Big Data',
# so that the relative paths below resolve.
if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

# The project helper module lives outside the package path; expose it before importing.
sys.path.append('trade_data_code/2_python')
import A_helper_functions as hf

# Input / output directories used throughout the notebook
raw_admin = '1) data/15_revelio_data/1_inputs/a_raw_data/admin/'
processed_linkedin = '1) data/15_revelio_data/1_inputs/b_processed_data/linkedin/'
processed_admin = '1) data/15_revelio_data/1_inputs/b_processed_data/admin/'

In [None]:
## GENERATE THE LIST OF SIREN / FIRM NAMES 
## set parameters and define the wrapper 
# Use all but 10 cores, but never fewer than one chunk / worker: os.cpu_count() can be
# <= 10 (giving a zero or negative count) or None when it cannot be determined.
chunks = max((os.cpu_count() or 1) - 10, 1)
cut_off = .01
def wrapper(index, function):
    """Apply one helper-module cleaning step to the chunk `siren_chunks[index]`.

    `function == "clean"` runs `hf.clean_firm_names` on the 'admin_name' column;
    any other value runs `hf.strip_words` on 'admin_name_cleaned' using the
    module-level `common_words`. Returns whatever the helper returns.
    """
    chunk = siren_chunks[index]
    if function != "clean":
        return hf.strip_words(chunk, 'admin_name_cleaned', common_words)
    return hf.clean_firm_names(chunk, "admin_name", False)

################
### IMPORT THE SIREN NUMBERS 
################
siren_numbers = (
    # import only the columns we need; siren is read as a string
    pd.read_csv(raw_admin + '1_StockUniteLegaleHistorique_utf8.csv',
                usecols=['denominationUniteLegale', 'siren', 'dateDebut', 'dateFin',
                         'activitePrincipaleUniteLegale','nomenclatureActivitePrincipaleUniteLegale', 'etatAdministratifUniteLegale'],
                dtype = {'siren': 'str'})

    # rename columns
    .rename(columns={'denominationUniteLegale': 'admin_name', 'dateDebut': 'start_date', 'dateFin': 'end_date', 'etatAdministratifUniteLegale': 'status',
                     'activitePrincipaleUniteLegale' : 'industry', 'nomenclatureActivitePrincipaleUniteLegale' : 'industry_system'})

    # fix date variables (unparseable dates become NaT)
    .assign(start_date=lambda df: pd.to_datetime(df['start_date'], errors='coerce'),
            end_date=lambda df: pd.to_datetime(df['end_date'], errors='coerce'))
    # filter: drop rows without a name, with the placeholder name '[ND]', or with status 'C'
    .loc[lambda df: df['admin_name'].notna() & ~df['admin_name'].eq('[ND]') & ~df['status'].eq('C')]
)
################
### NOTE THE INDUSTRIES OF EACH SIREN FOR THE PERIOD OF INTEREST 
################
# For every year keep, per siren, the row with the latest end_date among the rows
# whose [start_date, end_date] interval covers that year.
# NOTE(review): rows whose end_date is missing (NaT) never satisfy `.ge(year)` and are
# therefore excluded; confirm that open-ended periods should not count as active.
industry_year_dta = []
for year in range(2008,2024):
    industry_year_dta.append(
        siren_numbers.loc[lambda c: c['start_date'].dt.year.le(year) & c['end_date'].dt.year.ge(year)]
        .sort_values(by = ['siren','end_date'], ascending = [True,False])
        .groupby('siren').head(1)
        .assign(year = year)
        [['siren','year', 'industry', 'industry_system']]
    )
pd.concat(industry_year_dta, ignore_index = True).to_parquet(processed_admin +'siren_industry_year.parquet')

################
### RETRIEVE THE START AND END DATE OF THE SIREN / Name Combo (this method is orders of magnitudes faster than aggregating)
################
# (the original built this key twice in a row; once is enough)
siren_numbers['combo'] = siren_numbers['siren'] + siren_numbers['admin_name']

# earliest non-missing start date per combo
start_dates = (siren_numbers.loc[lambda c: c['start_date'].notna()]
               .sort_values(['combo','start_date'])
               .groupby(['combo']).head(1)[['siren','admin_name','start_date']])

# combos that have at least one row with a missing end date keep a missing end date
na_end_dates = (siren_numbers.loc[lambda c: c['end_date'].isna()]
                .drop_duplicates(subset = 'combo')[['siren','admin_name','combo','end_date']])

# all other combos get their latest end date
end_dates = (siren_numbers.loc[lambda c: ~c['combo'].isin(na_end_dates['combo'])]
            .sort_values(['combo','end_date'], ascending = [True, False])
            .groupby('combo').head(1))

# combine, then keep combos that end after 2007 or have no end date
siren_numbers = (pd.merge(start_dates, pd.concat([na_end_dates,end_dates])[['siren','admin_name','end_date']], how = 'outer')
                .loc[lambda x: x['end_date'].dt.year.gt(2007) | x['end_date'].isna()])

################
## Generate Cleaned Names 
################
# wrapper() reads the module-level siren_chunks, so it must be (re)built before each parallel pass
siren_chunks = np.array_split(siren_numbers,chunks)
siren_numbers = pd.concat(Parallel(n_jobs=chunks, backend='multiprocessing')
                          (delayed(wrapper)(index, 'clean') for index in range(chunks)), ignore_index = True)

#establish / remove the list of common words 
# a word is "common" when its count exceeds cut_off times the count of the most frequent word;
# purely numeric tokens are never treated as common words
word_counts = siren_numbers['admin_name_cleaned'].str.split(expand = True).stack().value_counts()
common_words = set(word_counts[word_counts > np.max(word_counts) * cut_off].index)
common_words = {word for word in common_words if not word.isnumeric()}
with open(processed_admin + 'common_words.txt', 'w') as file: file.write('\n'.join(common_words))


### use the common words to clean the firms: 
siren_chunks = np.array_split(siren_numbers,chunks)
siren_numbers = pd.concat(Parallel(n_jobs=chunks, backend='multiprocessing')
                          (delayed(wrapper)(index, 'strip') for index in range(chunks)), ignore_index = True)
# the closing parenthesis was missing here, which made the whole cell a syntax error
siren_numbers.to_parquet(processed_admin +'siren_admin.parquet')

In [None]:
##Generate a list of all french users
db = wrds.Connection(wrds_username='am0195')
try:
    # user_ids of every Revelio user whose user_country is 'France'
    french_users = db.raw_sql("""
SELECT user_id 
FROM revelio.individual_user 
WHERE user_country = 'France'
""")
    french_users.to_parquet( processed_linkedin + 'all_french_users.parquet')
finally:
    # release the WRDS connection even if the query or the write fails
    db.close()

In [None]:
## Generate OUR ROLE DICTIONARY 
# raw lookup columns that are removed once the indicator columns below are built
to_drop = ['role_k1000', 'role_k500', 'role_k300', 'role_k150', 'role_k50', 'job_category', 'onet_code', 'onet_title','role_id']
# this connection stays open: the FactSet query in the following cell calls db.raw_sql without reconnecting
db = wrds.Connection(wrds_username='am0195')
# pull the role lookup table and lower-case every string cell
roles = (db.raw_sql("select * from revelio.individual_role_lookup")
              .applymap(lambda x: x.lower() if isinstance(x, str) else x))

roles['role_id'] = roles.index
roles['total'] = True
roles['engineer'] = roles['job_category'] == 'engineer'
roles['data'] = (
    (roles['role_k50'] == 'data analyst') |
    (roles['role_k150'].str.contains('data', na=False)) |
    (roles['onet_title'].str.contains('database', na=False)) |
    (roles['role_k1500'].str.contains('data center', na=False))
)
# The parentheses spell out how `&` / `|` already bind (`&` first); the result is unchanged.
# NOTE(review): as written, any role whose role_k1500 contains 'intelligence' is flagged as
# data_analyst even when `data` is False; confirm whether `data & (analyst | intelligence)` was meant.
roles['data_analyst'] = ((roles['data'] &
                          roles['role_k50'].str.contains('analyst', na=False)) |
                         roles['role_k1500'].str.contains('intelligence', na=False))
roles['data_engineer'] = (roles['data'] & ~roles['data_analyst'])


# flag the O*NET codes listed in the R&D and STEM spreadsheets
rnd = pd.read_excel(raw_admin + "ONET_RandD_roles.xlsx").assign(rnd=True)[['Code', 'rnd']]
stem = pd.read_excel(raw_admin + "ONET_stem_roles.xlsx").assign(stem=True)[['Code', 'stem']]
roles = (roles.merge(rnd, left_on ="onet_code", right_on="Code", how="left").drop('Code', axis = 1)
           .merge(stem, left_on ="onet_code", right_on="Code", how="left").drop('Code', axis = 1)
            .assign(rnd=lambda x: x['rnd'].fillna(False))
           .assign(stem=lambda x: x['stem'].fillna(False))
         # store every boolean flag as 0/1
         .applymap(lambda x: int(x) if isinstance(x, bool) else x))
# rnd and data are 0/1 integers at this point; the bitwise expression is 1 only when rnd == 1 and data == 0
roles['non_data_rnd'] = roles['rnd'] & ~roles['data']
# DataFrame.drop returns a new frame; the original discarded it, so nothing was ever dropped
roles = roles.drop(to_drop, axis = 1)
roles.to_csv(processed_linkedin +'revelio_role_dict.csv', index = False)

In [None]:
## GEN LIST OF FRENCH FACTSET IDS 
# uses the `db` connection that is already open when this cell runs
db.raw_sql("""
select factset_entity_id 
from factset.edm_standard_address 
where iso_country = 'FR'
""").to_parquet(processed_admin +'factset_french_domiciled.parquet')
# `db.close` without parentheses only referenced the method; call it so the connection is released
db.close()

In [None]:
## OBSERVE THE NUMBER OF DATA WORKERS IN EACH MARKET 
# parameters: years covered, WRDS connection, scratch directory
year_range = range(2008,2024)
db = wrds.Connection(wrds_username='am0195')
temp_direct = processed_linkedin +'temp_ctry_output'
os.makedirs(temp_direct, exist_ok=True)

# role_k1500 labels of every role flagged data == 1 in our role dictionary
data_roles = (
    pd.read_csv(processed_linkedin +'revelio_role_dict.csv')
    .loc[lambda x: x['data'].eq(1), 'role_k1500']
    .tolist()
)

# crosswalk that is merged onto the LinkedIn positions below; it supplies the 'ctry' column
linkedin_to_iso_cross_walk = pd.read_csv(processed_admin +'linkedin_to_iso_crosswalk.csv')
    

# define helper functions 
def collapse_wrapper(year, data_output):
    """Aggregate weighted head-count and compensation per country for one year.

    Parameters
    ----------
    year : int
        Calendar year to evaluate.
    data_output : pandas.DataFrame
        Needs the columns 'ctry', 'weight', 'total_compensation', 'startdate'
        and 'enddate'. The frame is not modified (the original version
        overwrote its date columns in place on every call).

    Returns
    -------
    pandas.DataFrame
        One row per 'ctry' with 'ctry_data_empl' (sum of weight),
        'ctry_data_comp' (sum of weight * total_compensation) and 'year'.
    """
    # parse into local Series so the caller's frame stays untouched; bad dates become NaT
    start = pd.to_datetime(data_output['startdate'], errors='coerce')
    end = pd.to_datetime(data_output['enddate'], errors='coerce')

    # a position counts for `year` if it started by then and has no end date or ends in/after it
    active = start.dt.year.le(year) & (end.isna() | end.dt.year.ge(year))

    collapsed = (
        data_output.loc[active]
        .assign(comp=lambda x: x['total_compensation'] * x['weight'])
        .groupby('ctry')
        .agg(ctry_data_empl=('weight', 'sum'),
             ctry_data_comp=('comp', 'sum'))
        .reset_index()
        .assign(year=year))
    return collapsed


## GENERATE THE NUMBER / COMPENSATION FOR ALL DATA ROLES IN EACH COUNTRY 
# every (country, year) pair that must appear in the final table
possible_combos = pd.DataFrame(
    itertools.product(linkedin_to_iso_cross_walk['ctry'].unique(), year_range),
    columns=['ctry', 'year'])

# all positions held in a data role, with the crosswalk columns attached
# (the merge uses whatever columns the two frames have in common)
data_roles_output = db.raw_sql(
        """
       SELECT country, weight, total_compensation, startdate, enddate 
       FROM revelio.individual_positions 
       WHERE role_k1500 IN %(data_roles)s
       """, 
        params= {"data_roles": tuple(data_roles)})
data_roles_output = data_roles_output.merge(linkedin_to_iso_cross_walk)

# collapse year by year, then right-merge onto the full grid so that
# country-years without any data worker show up with zeros
yearly_collapses = [collapse_wrapper(year, data_roles_output) for year in year_range]
data_roles_output = (
    pd.concat(yearly_collapses)
    .merge(possible_combos, how = 'right')
    .assign(ctry_data_empl=lambda x: x['ctry_data_empl'].fillna(0),
            ctry_data_comp=lambda x: x['ctry_data_comp'].fillna(0))
)
data_roles_output.to_parquet(processed_linkedin +'data_roles_in_all_countries.parquet')

In [None]:
## Observe the Origin Universities /graduation dates of French Data Scientists
year_range = range(2008,2024)
# Use all but 10 cores, but never fewer than one worker: os.cpu_count() can be <= 10
# (Parallel rejects n_jobs=0 and treats negative values specially) or None.
cores = max((os.cpu_count() or 1) - 10, 1)
data_roles = pd.read_csv(processed_linkedin +'revelio_role_dict.csv').loc[lambda x: x['data'].eq(1), 'role_k1500'].tolist()

# positions held in France in one of the data roles (uses the `db` connection that is already open)
params = {"data_roles": tuple(data_roles)}
role_output = db.raw_sql(
    """
    SELECT user_id, weight, total_compensation, startdate, enddate 
    FROM revelio.individual_positions 
    WHERE role_k1500 IN %(data_roles)s AND country = 'France'
    """, 
    params=params)

# education records of those users; keep only records with an rsid and an end date
params = {"data_ids": tuple(role_output['user_id'].tolist())}
user_output = (
    db.raw_sql(
        """
        SELECT *
        FROM revelio.individual_user_education 
        where user_id IN %(data_ids)s
        """,
        params= params 
        ) 
    .loc[lambda x: ~x['rsid'].isna() & ~x['enddate'].isna()] 
    .assign(startdate =lambda x: pd.to_datetime(x['startdate'], errors='coerce'),
            enddate =lambda x: pd.to_datetime(x['enddate'], errors='coerce')) 
    # graduation year = year of the education end date
    .assign(grad_year=lambda x: x['enddate'].dt.year))

def university_collapse_wrapper(year):
    """Count, for one observation year, active French data positions by university of origin.

    Reads the module-level `role_output` (positions) and `user_output`
    (education records). Returns one row per university / graduation-year
    group with the summed weight ('data_grads'), the summed
    weight * total_compensation ('comp_weighted_data_grads') and
    'observation_year'.
    """
    # parse the position dates, then flag positions active in `year`
    positions = role_output.assign(
        startdate=lambda x: pd.to_datetime(x['startdate'], errors='coerce'),
        enddate=lambda x: pd.to_datetime(x['enddate'], errors='coerce'))
    positions = positions.assign(
        valid=lambda x: x['startdate'].dt.year.le(year) & (x['enddate'].isna() | x['enddate'].dt.year.ge(year)),
        comp=lambda x: x['total_compensation'] * x['weight'])
    active = positions.loc[positions['valid']]

    # attach education records (merged on the columns both frames share)
    education = user_output.rename(columns={'startdate': 'uni_start_date', 'enddate': 'uni_end_date'})
    linked = active.merge(education)

    # only education finished by the time the position started ...
    linked = linked.loc[linked['uni_end_date'].le(linked['startdate'])]
    # ... and per user the single row with the latest education end date
    latest = linked.loc[linked.groupby('user_id')['uni_end_date'].idxmax()]
    french = latest.loc[latest['university_country'].eq('France')]

    collapsed = (
        french
        .groupby(['university_name', 'grad_year', 'rsid', 'university_location', 'university_country'])
        .agg(data_grads=('weight', 'sum'),
             comp_weighted_data_grads=('comp', 'sum'))
        .assign(observation_year=year))
    return collapsed.reset_index()

# Run the per-year collapse in parallel (one task per year) and stack the yearly results.
yr_lvl_dta_uni = pd.concat(Parallel(n_jobs=cores, backend='multiprocessing')(delayed(university_collapse_wrapper)(year) for year in year_range),ignore_index = True)
yr_lvl_dta_uni.to_parquet(processed_linkedin +'data_grads_across_france.parquet')

### GENERATE A LIST OF LOCATIONS SO WE CAN START TRYING TO MATCH TO FIRM DATA 
# Step 1: merging on the shared columns (university_location, data_grads) keeps, for each
#         location, the rows that attain that location's maximum data_grads; only the
#         distinct (university_name, university_location) pairs are retained.
# Step 2: attach num_unis = number of distinct university names observed at that location.
simple_uni_x_location = (
    pd.merge(yr_lvl_dta_uni,
             yr_lvl_dta_uni.groupby('university_location', as_index=False)['data_grads'].max())
    [['university_name', 'university_location']].drop_duplicates()
    .merge(yr_lvl_dta_uni[['university_name','university_location']]
           .drop_duplicates()
           .assign(num_unis = lambda df: df.groupby('university_location')['university_location'].transform('count'))
           [['university_location', 'num_unis']].drop_duplicates())
)
simple_uni_x_location.to_excel(processed_admin +'uni_x_location_raw.xlsx', index=False)

In [None]:
## Generate A CROSS BETWEEN LEI AND SIREN CODES 
#When INSEE is the managing Local Operating Unit (LOU) or the firm is french, it identifies firms with SIREN codes.
##The initial list of LEI codes is provided by the GLEIF
##(https://search.gleif.org/#/search/simpleSearch=France&fulltextFilterId=LEIREC_FULLTEXT&currentPage=1&perPage=15&expertMode=false).

# File path
file_path = raw_admin + '20241105-0000-gleif-goldencopy-lei2-golden-copy.csv'

# Columns of interest and their new names.
# read_csv(usecols=...) returns columns in *file* order, not in the order of the list,
# so the rename is done by name instead of by position.
column_names = {
    'LEI': 'lei',
    'Entity.LegalName': 'lei_name',
    'Entity.LegalAddress.Country': 'lei_country',
    'Entity.RegistrationAuthority.RegistrationAuthorityEntityID': 'lei_siren',
    'Registration.ManagingLOU': 'managing_lou',
}
interest_cols = list(column_names)
new_names = list(column_names.values())
crosswalk = (pd.read_csv(file_path, usecols=interest_cols, low_memory=False)
             .rename(columns=column_names)
             [new_names])

crosswalk = (crosswalk.assign(
                 # True when the managing LOU is the identifier below (INSEE, per the header comment)
                 insee_registered = lambda c: c['managing_lou'] == '969500Q2MA9VBQ8BG884',
                 # keep only letters/digits of the registration id; missing ids become ''
                 lei_siren = lambda c: c['lei_siren'].fillna('').astype(str).apply(lambda x: re.sub(r'[^a-zA-Z0-9]', '',x)))
             # keep entities with a French legal address or an INSEE-managed LEI
             .loc[lambda x: (x['lei_country'].eq("FR") | x['insee_registered'])])

crosswalk.to_parquet(processed_admin +'LEI_siren_crosswalk.parquet')

In [None]:
## FIND ALL FIRMS THAT HAVE HIRED A FRENCH USER 
# The user ids are queried in num_chunks batches; every batch is saved to a scratch
# file so an interrupted run resumes with the batches that are still missing.
num_chunks = 500
temp_direct = processed_linkedin + 'temp_role'
os.makedirs(temp_direct, exist_ok=True)
french_users = pd.read_parquet(processed_linkedin + 'all_french_users.parquet')
chunks = np.array_split(french_users['user_id'].unique(), num_chunks)
db = wrds.Connection(wrds_username='am0195')

for index in range(num_chunks):
    file_path = f"{temp_direct}/temp{index}.parquet"
    if os.path.exists(file_path):
        # batch already downloaded
        continue
    clear_output(wait=True)
    print(f"{round(100*(index+1)/num_chunks,2)}%")
    params = {'user_id_list': tuple(chunks[index].tolist())}
    temp = db.raw_sql(
            """
            SELECT DISTINCT rcid
            FROM revelio.individual_positions 
            WHERE user_id IN %(user_id_list)s AND rcid IS NOT NULL
            """, 
            params= params)
    temp.to_parquet(file_path)

# stack the batches, de-duplicate, drop missing rcids, save, and remove the scratch directory
batch_files = glob.glob(temp_direct + "/*.parquet")
output = pd.concat([pd.read_parquet(file) for file in batch_files], ignore_index = True)
output = output.drop_duplicates().loc[lambda x: x['rcid'].notna()]
output.to_parquet(processed_linkedin + 'all_french_user_rcids.parquet')
shutil.rmtree(temp_direct)

In [None]:
## FIND ALL FIRMS THAT HAVE A POSITION LOCATED IN FRANCE 
# distinct, non-null rcids with at least one position whose country is 'France'
# (runs on the `db` connection that is already open)
french_roles = db.raw_sql(
    """
    SELECT DISTINCT rcid
    FROM revelio.individual_positions
    WHERE country = 'France' AND rcid IS NOT NULL
    """
)
french_roles.to_parquet(os.path.join(processed_linkedin, 'all_french_role_rcids.parquet'))

In [None]:
## FIND ALL FIRMS THAT ARE HEADQUARTERED IN FRANCE 
# distinct, non-null rcids whose hq_country is 'France'
# (runs on the `db` connection that is already open)
french_hqs = db.raw_sql(
            """
            SELECT DISTINCT rcid
            FROM revelio.company_mapping
            WHERE hq_country = 'France' AND rcid IS NOT NULL
            """
        )
# to_parquet returns None, so chaining it onto the query left `french_hqs` bound to None;
# writing in a separate statement keeps the DataFrame available under that name
french_hqs.to_parquet(processed_linkedin + 'all_french_hq_rcids.parquet')

In [None]:
## FIND ALL FIRMS THAT HAVE A FRENCH LEI 
# The LEIs of the crosswalk are looked up in company_mapping in num_chunks batches;
# each batch is written to a scratch file so finished batches are skipped on a re-run.
num_chunks = 50
french_leis = pd.read_parquet(processed_admin +'LEI_siren_crosswalk.parquet')
chunks = np.array_split(french_leis['lei'].dropna().unique(), num_chunks)
temp_direct = os.path.join(processed_linkedin, 'temp')
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')

# batches are processed from the last index down to 0
for index in range(num_chunks - 1, -1, -1):
    file_path = os.path.join(temp_direct, f"temp{index}.parquet")
    if os.path.exists(file_path):
        continue
    clear_output(wait=True)
    lei_list = tuple(chunks[index].tolist())
    print(f"{round(100 * (index + 1) / num_chunks, 2)}%")
    # to_parquet returns None, so `temp` ends up as None here (kept as in the original)
    temp = db.raw_sql(
                    """
                    SELECT  rcid
                    FROM revelio.company_mapping
                    WHERE lei IN %(lei_list)s
                    """,  
            params= {"lei_list": lei_list}).to_parquet(file_path)

# stack the batches, save, and remove the scratch directory
batch_files = glob.glob(os.path.join(temp_direct, "*.parquet"))
output = pd.concat([pd.read_parquet(file) for file in batch_files], ignore_index=True)
output.to_parquet(processed_linkedin + 'all_french_lei_rcids.parquet')
shutil.rmtree(temp_direct)

In [None]:
## GENERATE FIRM LEVEL DATA ON PROSPECTIVE FRENCH COMPANIES 

## SET PARAMETER VALUES 
def safe_detect(x):
    """Return `detect(x)` for a non-missing value, otherwise None.

    None is also returned when detection raises. Only `Exception` is caught:
    the former bare `except:` also swallowed KeyboardInterrupt / SystemExit,
    which made the surrounding long-running loop hard to interrupt.
    """
    try:
        return detect(x) if pd.notnull(x) else None
    except Exception:
        return None
# fixed langdetect seed (the library documents this as the way to get consistent results)
DetectorFactory.seed = 0 

# union of every rcid collected by the four "potentially French" criteria above
all_french_rcids = (pd.concat([
    pd.read_parquet(os.path.join(processed_linkedin, 'all_french_role_rcids.parquet')),
    pd.read_parquet(os.path.join(processed_linkedin, 'all_french_user_rcids.parquet')),
    pd.read_parquet(processed_linkedin + 'all_french_hq_rcids.parquet'),
    pd.read_parquet(processed_linkedin + 'all_french_lei_rcids.parquet')])
                    .drop_duplicates())
# read through a context manager so the file handle is closed
with open(processed_admin +'common_words.txt', 'r') as file:
    common_words = file.read().splitlines()
french_leis = pd.read_parquet(processed_admin +'LEI_siren_crosswalk.parquet')['lei']
french_factset_ids = pd.read_parquet(processed_admin +'factset_french_domiciled.parquet')['factset_entity_id']

# Prepare for chunking
num_chunks = 500
temp_direct = os.path.join(processed_linkedin, 'temp')
# Start from an empty scratch directory. ignore_errors=True because the directory may not
# exist (the LEI cell above deletes the same directory when it finishes), in which case a
# plain rmtree raises FileNotFoundError.
shutil.rmtree(temp_direct, ignore_errors=True)
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(all_french_rcids['rcid'].dropna().unique(), num_chunks)

# SCRAPE / PROCESS DATA FOR EACH CHUNK 
for index in range(num_chunks):
    file_path = os.path.join(temp_direct, f"temp{index}.parquet")
    if not os.path.exists(file_path):
        clear_output(wait=True)
        print(f"{round(100 * (index + 1) / num_chunks, 2)}%")

        rcid_list = tuple(chunks[index].tolist())
        temp = db.raw_sql(
            """
            SELECT *
            FROM revelio.company_mapping
            WHERE rcid IN %(rcid_list)s
            """,
            params={"rcid_list": rcid_list}
        )
        # drop firms without a usable name, then build the cleaned / stripped name columns
        temp = temp.loc[~temp['company'].isna() & ~temp['company'].eq('')]
        temp = hf.clean_firm_names(temp, 'company',True)
        temp = hf.strip_words(temp, 'company_cleaned', common_words)
        temp = (temp
                # slogan or description detected as French
                .assign(description_french = lambda x: (
                    x['slogan'].apply(safe_detect).eq('fr') |
                    x['description'].apply(safe_detect).eq('fr')))
                # any single administrative signal is enough to flag the firm
                # (the hq_country test appeared twice in the original; once is equivalent)
                .assign(admin_french = lambda x: x['hq_country'].eq('France')
                        # text after the last '.' of the url equals 'fr'
                        | x['url'].apply(lambda u: u.split('.')[-1] if isinstance(u, str) else None).eq('fr')
                        | x['factset_entity_id'].isin(french_factset_ids)
                        | x['isin'].str[:2].eq("FR")
                        | x['lei'].isin(french_leis) 
                        | x['cusip'].str[:1].eq("F")
                        | x['firm_type_french_likelihood'].eq('likely french')
                        | x['description_french']))
        temp = temp[['rcid', 'company', 'company_cleaned','company_stripped', 'extracted_terms', 'lei',
                    'child_rcid', 'ultimate_parent_rcid', 'hq_street_address',
                     'hq_zip_code', 'hq_city', 'hq_metro_area', 'hq_state', 'hq_country',
                     'hq_region', 'admin_french']]
        temp.to_parquet(file_path)

# Combine and save full output
output = (
    pd.concat([pd.read_parquet(file) for file in glob.glob(os.path.join(temp_direct, "*.parquet"))],ignore_index=True)
    .assign(has_subsid = lambda x: ~x['child_rcid'].isna(), 
             is_subsid = lambda x: ~x['ultimate_parent_rcid'].isna(),
             has_lei = lambda x: ~x['lei'].isna(),
             # missing when hq_country is missing, otherwise whether it equals 'France'
             french_hq = lambda x: np.where(x['hq_country'].isna(), pd.NA, x['hq_country'] == 'France')))

output.to_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')

# Clean up
shutil.rmtree(temp_direct)

In [None]:
# Check the Roles for Firms that are not admin_french
non_french_admin = (
    pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
    .loc[lambda x: ~x['admin_french']])

num_chunks = 500
temp_direct = os.path.join(processed_linkedin, 'temp')
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(non_french_admin['rcid'].dropna().unique(), num_chunks)
french_users = pd.read_parquet( processed_linkedin + 'all_french_users.parquet')

for index in range(num_chunks):
    file_path = os.path.join(temp_direct, f"temp{index}.parquet")
    if not os.path.exists(file_path):
        clear_output(wait=True)
        
        print(f"{round(100 * (index + 1) / num_chunks, 2)}%")
        params = {'rcid_list': tuple(chunks[index].tolist())}
        temp = db.raw_sql(
            """
            SELECT rcid,user_id,country, weight, total_compensation
            FROM revelio.individual_positions 
            WHERE rcid IN %(rcid_list)s
            """, 
            params= params)
        # positions held by a French user count as French regardless of the position's country
        temp.loc[temp['user_id'].isin(french_users['user_id']),'country' ] = 'France'
        temp = (
            temp.loc[~temp['country'].isna()]
            .assign(french = lambda x: x['country'].eq('France'),
                    comp = lambda x: x['weight'] * x['total_compensation'])
            .groupby('rcid', as_index=False)
            # per firm: French share of weighted compensation and of weighted employment
            .apply(lambda g: pd.Series({
                  'share_comp_french': g.loc[g['french'], 'comp'].sum() / g['comp'].sum(),
                  'share_emp_french': g.loc[g['french'], 'weight'].sum() / g['weight'].sum()})).reset_index()
            # a firm is role_french when either share reaches one half
            .assign(role_french = lambda x: x['share_comp_french'].ge(.5) | x['share_emp_french'].ge(.5))
            )[['rcid', 'role_french']].to_parquet(file_path)
output = pd.concat([pd.read_parquet(file) for file in glob.glob(os.path.join(temp_direct, "*.parquet"))],ignore_index=True)
# This cell overwrites the file it reads. On a re-run the file already contains
# role_french / french_eligible, and a merge without an explicit key would then join on
# ['rcid', 'role_french']. Drop any earlier copies and merge on rcid only; the first run
# is unaffected because rcid is then the only shared column.
full_processed_linkedin = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
                           .drop(columns=['role_french', 'french_eligible'], errors='ignore')
                           .merge(output, how = 'left', on = 'rcid')
                           # eligible = not checked here (role_french missing) or role_french is True
                           .assign(french_eligible = lambda x: x['role_french'].isna() | x['role_french']))

full_processed_linkedin.to_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
shutil.rmtree(temp_direct)

The matching procedure runs in several steps. 

First, we already have LEI-siren matches for a subset of firms; we identify those matches and then remove the corresponding sirens / rcids from the pool of firms to match. 

Second, we remove all sirens / rcids that have the same cleaned name as another siren / rcid. We won't be able to match these uniquely and therefore won't consider them. 

Third, we match all remaining sirens / rcids based on whether they have a unique match based on their cleaned name. We then remove these matches from the list of candidates.

Fourth, we identify which of the remaining rcids have a unique perfect match based on the stripped name; those that don't are discarded. We then match based on the stripped name using the fuzzy matching command.


In [None]:
## Run Matching 
############  
### DEFINE PARAMETERS AND IMPORT DATA / Matching Function
############  
# candidates requested from each matcher / candidate ranks kept per firm
init_matches = 50
final_matches = 5
# use all but two cores, with a floor of one worker
cores = max(os.cpu_count() - 2, 1)
# ten chunks of firms per worker
chunks = cores * 10

def matching_wrapper(index):
    """Fuzzy-match the firms of `firm_chunks[index]` against the siren names.

    Stage 1 matches every 'company_cleaned' against the module-level `matcher`.
    Stage 2 re-scores, per company, only the stage-1 candidates using the
    stripped names, ranks them (rank 1 = highest stripped score) and keeps
    ranks up to `final_matches`. If stage 2 raises for a company, its
    candidates are ranked by the stage-1 score and 'stripped_score' is missing.

    Reads the module-level `firm_chunks`, `matcher`, `sirens_to_match`,
    `chunks`, `init_matches` and `final_matches`.

    Returns a DataFrame with the columns 'company_cleaned',
    'admin_name_cleaned', 'raw_score', 'stripped_score' and 'match_index'.
    """
    #output progress 
    clear_output(wait=True)
    print(f"{round(index/chunks*100, 2)}%")
    
    temp_firms = firm_chunks[index]
    #run first version of matcher on all words 
    matches = matcher.match_names(to_be_matched=temp_firms, column_matching='company_cleaned')
    # reshape from one column set per candidate to one row per (company, candidate);
    # raw string so that `\d` is a regex token rather than an invalid string escape
    results = (pd.wide_to_long(matches,stubnames=["match_name", "score", "match_index"], i="original_name", j="match",suffix=r"_\d+")
               .reset_index()[['original_name', 'match_name', 'score']]
               .rename(columns={'original_name': 'company_cleaned', 'match_name': 'admin_name_cleaned', 'score': 'raw_score'})
               .merge(temp_firms[['company_cleaned', 'company_stripped']], how = 'left', on = 'company_cleaned')
               .merge(sirens_to_match[['admin_name_cleaned', 'admin_name_stripped']], how = 'left', on = 'admin_name_cleaned'))
    company_chunks = [group for _, group in results.groupby('company_cleaned')]

    ### rerun the second version of matcher only on words from initial list 
    results = []
    temp_matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
    temp_matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
    for chunk in company_chunks:
        chunk = chunk.reset_index()
        try:
            # the company's own stage-1 candidates become the master data for stage 2
            temp_matcher.load_and_process_master_data(column='admin_name_stripped', df_matching_data=chunk, transform=True)
            temp_matches = temp_matcher.match_names(to_be_matched=chunk.iloc[0], column_matching='company_stripped')
            temp_results = (pd.wide_to_long(temp_matches,stubnames=["match_name", "score", "match_index"], i="original_name", j="match",suffix=r"_\d+")
                            .reset_index()[['original_name', 'match_name', 'score']]
                            .rename(columns={'original_name': 'company_stripped', 'match_name': 'admin_name_stripped', 'score': 'stripped_score'})
                            .drop_duplicates()
                            .merge(chunk, how = 'right')
                            # one group number per distinct stripped score (ascending)
                            .assign(match_index = lambda df: df.groupby(['stripped_score']).ngroup()))

        except Exception as e:
            # report why stage 2 failed instead of silently discarding the exception
            print(f"Error processing company: {chunk.loc[0,'company_cleaned']} in index {index}: {e}") 
            # Fall back to ranking by the stage-1 score. stripped_score is added explicitly so
            # the final column selection also works when every company takes this path.
            temp_results = chunk.assign(stripped_score = float('nan'),
                                        match_index = lambda df: df.groupby(['raw_score']).ngroup())

        temp_results = (temp_results
                        # flip the numbering so that the highest score gets match_index 1
                        .assign(match_index = lambda df: df['match_index'].max() - df['match_index'] + 1)
                        .sort_values(['match_index','raw_score'], ascending = [True, False])
                        .loc[lambda df: df['match_index'].le(final_matches)])
        results.append(temp_results)
    results = pd.concat(results, ignore_index = True)[['company_cleaned', 'admin_name_cleaned', 'raw_score', 'stripped_score', 'match_index']]
    return results

############  
### Check for LEI matches 
############ 
# LEI -> SIREN crosswalk; the siren column is renamed to `firmid`, the id used for all matches below
french_leis = pd.read_parquet(processed_admin +'LEI_siren_crosswalk.parquet')[['lei', 'lei_siren']].rename(columns={'lei_siren': 'firmid'})
firms_to_match = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet'))
# inner merge on the shared `lei` column
# NOTE(review): the crosswalk step fills missing registration ids with '', so empty
# `firmid` values may be matched here; confirm whether they should be filtered out.
lei_matched = (pd.merge(french_leis, firms_to_match[['rcid','lei']])
              .assign(match_method = 'lei')
              [['rcid', 'firmid', 'match_method']])

############  
### Check for exact matches on cleaned names 
############ 
# remaining firms: french_eligible, not LEI-matched, and with a cleaned name no other remaining firm shares
firms_to_match = (firms_to_match.loc[lambda x: x['french_eligible'] & ~x['rcid'].isin(lei_matched['rcid'])]
                  .assign(count=lambda x: x.groupby('company_cleaned')['company_cleaned'].transform('count'))
                  .loc[lambda x: x['count'] == 1]
                 [['rcid','company_cleaned','company_stripped']])     

# remaining sirens: not LEI-matched, and with a cleaned name no other remaining siren shares
sirens_to_match = (pd.read_parquet(processed_admin + 'siren_admin.parquet')
                   .rename(columns = {'siren': 'firmid'})
                   .loc[lambda x: ~x['firmid'].isin(lei_matched['firmid'])]
                   .assign(count=lambda x: x.groupby('admin_name_cleaned')['admin_name_cleaned'].transform('count'))
                   .loc[lambda x: x['count'] == 1]
                   [['admin_name_cleaned', 'admin_name_stripped','firmid']])
# exact equality of the two cleaned names
clean_matched = (
    pd.merge(firms_to_match, sirens_to_match, how = 'inner',
             left_on = 'company_cleaned', right_on = 'admin_name_cleaned')
    .assign(match_method = 'cleaned')
    [['rcid','firmid', 'match_method']])


############  
### match remaining firms 
############ 
# drop sirens matched on the cleaned name; `count` now holds how many remaining sirens share each stripped name
sirens_to_match = (sirens_to_match.loc[lambda x: ~x['firmid'].isin(clean_matched['firmid'])] 
                   .assign(count=lambda x: x.groupby('admin_name_stripped')['admin_name_stripped'].transform('count')))

# keep unmatched firms whose stripped name equals a stripped name held by exactly one remaining siren
firms_to_match = (firms_to_match
                  .loc[lambda c: ~c['rcid'].isin(clean_matched['rcid']) 
                  & c['company_stripped'].isin(sirens_to_match.loc[lambda x: x['count'].eq(1), 'admin_name_stripped'])])

# matching_wrapper reads firm_chunks, matcher and sirens_to_match as module-level names,
# so all three must exist before the parallel call
firm_chunks = np.array_split(firms_to_match[['company_cleaned', 'company_stripped']], chunks)
print('starting fuzzy matching')
matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
matcher.load_and_process_master_data(column='admin_name_cleaned',
                                     df_matching_data =sirens_to_match[['admin_name_cleaned', 'admin_name_stripped']],
                                     transform=True)
matching_output = Parallel(n_jobs=cores, backend='multiprocessing')(delayed(matching_wrapper)(index) for index in range(chunks))




# keep top-ranked candidates (match_index 1) with a stripped score of exactly 100,
# and only for companies left with a single such candidate
strip_matched = (pd.concat(matching_output, ignore_index = True)
                 .loc[lambda x: x['match_index'].eq(1) & x['stripped_score'].eq(100)]
                 .assign(match_method = 'strip',
                         count=lambda x: x.groupby('company_cleaned')['company_cleaned'].transform('count'))
                 .loc[lambda x: x['count'] == 1]
                 # recover firmid and rcid through the cleaned names (merges use the shared column)
                 .merge(sirens_to_match[['admin_name_cleaned', 'firmid']])
                 .merge(firms_to_match[['company_cleaned', 'rcid']])
                 [['rcid','firmid', 'match_method']])

# stack the three match types and attach them to the firm-level file (inner merge: matched firms only)
all_matches = pd.concat([lei_matched, clean_matched, strip_matched], ignore_index = True)
final_output = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_potential_french_firms.parquet')
              .merge(all_matches, on = 'rcid'))
final_output.to_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet')
    