In [1]:
## SETUP
# Plain import statements (rather than import strings passed to exec) so that
# the notebook's dependencies are visible to readers, linters and IDEs.
import glob
import itertools
import math
import os
import pickle
import re
import shutil
import sys
import unicodedata

import numpy as np
import pandas as pd
import wrds
from IPython.display import display, HTML, clear_output
from joblib import Parallel, delayed
from name_matching.name_matcher import NameMatcher

# All relative paths below assume the working directory is the folder whose
# name ends in 'Big Data'; when it is not, step two levels up.
if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

# Make the project's helper module importable.
sys.path.append('trade_data_code/2_python')
import A_helper_functions as hf

In [None]:
########################################################################################
# Generate the list of SIREN / Firm Names
########################################################################################
## set parameters and define the wrapper 
# Number of worker processes (and data chunks). Ten cores are left free, but the
# value is clamped to at least one: os.cpu_count() can be small, or None, and a
# zero/negative value breaks both np.array_split and joblib.Parallel.
chunks = max(1, (os.cpu_count() or 1) - 10)
# A word counts as "common" when it occurs more than cut_off times as often as
# the single most frequent word (see the common-word step further down).
cut_off = .01


def wrapper(index, function):
    """Run one name-cleaning step on the chunk ``siren_chunks[index]``.

    Parameters
    ----------
    index : int
        Position of the chunk inside the module-level ``siren_chunks`` list.
    function : str
        ``"clean"`` calls ``hf.clean_firm_names`` on the ``admin_name`` column;
        any other value calls ``hf.strip_words`` on ``admin_name_cleaned`` using
        the module-level ``common_words`` set.

    Returns
    -------
    Whatever the selected helper returns (the caller concatenates the results
    with ``pd.concat``).
    """
    if function == "clean":
        return hf.clean_firm_names(siren_chunks[index], "admin_name", False)
    return hf.strip_words(siren_chunks[index], 'admin_name_cleaned', common_words)

################
### IMPORT THE SIREN NUMBERS 
################
siren_numbers = (
    # Read only the columns used below; SIREN identifiers are kept as text.
    pd.read_csv(
        '../1_IWH/data/2_patent_tm_scraping/1_raw/1_StockUniteLegaleHistorique_utf8.csv',
        usecols=['denominationUniteLegale', 'siren', 'dateDebut', 'dateFin',
                 'activitePrincipaleUniteLegale', 'nomenclatureActivitePrincipaleUniteLegale',
                 'etatAdministratifUniteLegale'],
        dtype={'siren': 'str'},
    )
    # Shorter, English column names.
    .rename(columns={
        'denominationUniteLegale': 'admin_name',
        'dateDebut': 'start_date',
        'dateFin': 'end_date',
        'etatAdministratifUniteLegale': 'status',
        'activitePrincipaleUniteLegale': 'industry',
        'nomenclatureActivitePrincipaleUniteLegale': 'industry_system',
    })
    # Parse both date columns; unparseable values become NaT.
    .assign(**{
        date_col: (lambda df, date_col=date_col: pd.to_datetime(df[date_col], errors='coerce'))
        for date_col in ('start_date', 'end_date')
    })
    # Keep rows that have a name, whose name is not the '[ND]' placeholder,
    # and whose status is not 'C'.
    .loc[lambda df: df['admin_name'].notna()
                    & df['admin_name'].ne('[ND]')
                    & df['status'].ne('C')]
)
################
### NOTE THE INDUSTRIES OF EACH SIREN FOR THE PERIOD OF INTEREST 
################
# For every year keep, per SIREN, the record whose validity period covers that
# year. A missing end_date is treated as "still open" (the same convention the
# start/end-date step below uses); previously those rows were dropped because
# NaT never satisfies `.dt.year.ge(year)`, so the current period of every
# firm was missing from the output.
industry_year_dta = [
    siren_numbers
    .loc[lambda c: c['start_date'].dt.year.le(year)
                   & (c['end_date'].isna() | c['end_date'].dt.year.ge(year))]
    # Latest period first; open periods (NaT end_date) rank ahead of closed ones.
    .sort_values(by=['siren', 'end_date'], ascending=[True, False], na_position='first')
    .drop_duplicates(subset='siren', keep='first')
    .assign(year=year)
    [['siren', 'year', 'industry', 'industry_system']]
    for year in range(2008, 2024)
]
pd.concat(industry_year_dta, ignore_index = True).to_parquet('data/2_processed/admin/siren_industry_year.parquet')

################
### RETRIEVE THE START AND END DATE OF EACH SIREN / NAME COMBO (this method is orders of magnitude faster than aggregating)
################
# Key identifying one (SIREN, name) pair. (This assignment used to appear twice
# in a row; the second copy only repeated the work.)
siren_numbers['combo'] = siren_numbers['siren'] + siren_numbers['admin_name']

# Earliest known start date of each combo.
start_dates = (siren_numbers.loc[lambda c: c['start_date'].notna()]
               .sort_values(['combo', 'start_date'])
               .drop_duplicates(subset='combo', keep='first')
               [['siren', 'admin_name', 'start_date']])

# Combos with at least one record lacking an end date: treated as still open,
# so their end date stays missing.
na_end_dates = (siren_numbers.loc[lambda c: c['end_date'].isna()]
                .drop_duplicates(subset = 'combo')[['siren','admin_name','combo','end_date']])

# Every other combo: keep its latest end date.
end_dates = (siren_numbers.loc[lambda c: ~c['combo'].isin(na_end_dates['combo'])]
             .sort_values(['combo', 'end_date'], ascending = [True, False])
             .drop_duplicates(subset='combo', keep='first'))

# One row per combo with its start and end date; keep combos that are still
# open or that ended after 2007.
siren_numbers = (pd.merge(start_dates,
                          pd.concat([na_end_dates, end_dates])[['siren','admin_name','end_date']],
                          how = 'outer')
                 .loc[lambda x: x['end_date'].dt.year.gt(2007) | x['end_date'].isna()])

################
## Generate Cleaned Names 
################
# Split into one chunk per worker and clean the raw names in parallel.
siren_chunks = np.array_split(siren_numbers, chunks)
siren_numbers = pd.concat(Parallel(n_jobs=chunks, backend='multiprocessing')
                          (delayed(wrapper)(index, 'clean') for index in range(chunks)), ignore_index = True)

# Establish the list of common words: every non-numeric word that occurs more
# than cut_off times as often as the most frequent word. explode() counts the
# words without first building a wide one-column-per-word table.
word_counts = siren_numbers['admin_name_cleaned'].str.split().explode().value_counts()
common_words = {word
                for word in word_counts[word_counts > word_counts.max() * cut_off].index
                if not word.isnumeric()}
# Written in sorted order so the file is identical from run to run.
with open('data/2_processed/admin/common_words.txt', 'w') as file:
    file.write('\n'.join(sorted(common_words)))


### use the common words to clean the firms:
siren_chunks = np.array_split(siren_numbers, chunks)
siren_numbers = pd.concat(Parallel(n_jobs=chunks, backend='multiprocessing')
                          (delayed(wrapper)(index, 'strip') for index in range(chunks)), ignore_index = True)
siren_numbers.to_parquet('data/2_processed/admin/siren_admin.parquet')

In [3]:
########################################################################################
# Generate our role dictionary 
########################################################################################
# NOTE(review): this list was passed to `roles.drop(to_drop, axis=1)` whose
# result was discarded, so none of these columns were ever removed from the
# exported dictionary. The no-op call has been removed rather than "fixed":
# later cells read `role_id` back from the CSV, so actually dropping the list
# would break them. Confirm whether a slimmer export is wanted.
to_drop = ['role_k1000', 'role_k500', 'role_k300', 'role_k150', 'role_k50', 'job_category', 'onet_code', 'onet_title','role_id']
db = wrds.Connection(wrds_username='am0195')
# Pull the role lookup and lower-case every string value.
roles = (db.raw_sql("select * from revelio.individual_role_lookup")
              .applymap(lambda x: x.lower() if isinstance(x, str) else x))


roles['role_id'] = roles.index
roles['total'] = True
roles['engineer'] = roles['job_category'] == 'engineer'
roles['data'] = (
    (roles['role_k50'] == 'data analyst') |
    (roles['role_k150'].str.contains('data', na=False)) |
    (roles['onet_title'].str.contains('database', na=False)) |
    (roles['role_k1500'].str.contains('data center', na=False))
)
# `&` binds tighter than `|`, so the unparenthesised original evaluated
# (data & analyst) | intelligence and could flag non-data roles as data
# analysts. Parenthesised so that data_analyst / data_engineer split `data`.
roles['data_analyst'] = roles['data'] & (
    roles['role_k50'].str.contains('analyst', na=False) |
    roles['role_k1500'].str.contains('intelligence', na=False))
roles['data_engineer'] = (roles['data'] & ~roles['data_analyst'])


# O*NET code lists flagging R&D and STEM occupations. Codes are de-duplicated
# so the left merges below cannot multiply rows (role_id must stay unique).
rnd = (pd.read_excel("data/1_raw_data/admin/ONET_RandD_roles.xlsx")
       .assign(rnd=True)[['Code', 'rnd']].drop_duplicates('Code'))
stem = (pd.read_excel("data/1_raw_data/admin/ONET_stem_roles.xlsx")
        .assign(stem=True)[['Code', 'stem']].drop_duplicates('Code'))
roles = (roles.merge(rnd, left_on ="onet_code", right_on="Code", how="left").drop('Code', axis = 1)
              .merge(stem, left_on ="onet_code", right_on="Code", how="left").drop('Code', axis = 1)
              .assign(rnd=lambda x: x['rnd'].fillna(False).astype(bool))
              .assign(stem=lambda x: x['stem'].fillna(False).astype(bool))
              # computed while the flags are still booleans, so `~` is a logical NOT
              .assign(non_data_rnd=lambda x: x['rnd'] & ~x['data'])
              # store every boolean flag as 0/1
              .applymap(lambda x: int(x) if isinstance(x, bool) else x))
roles.to_csv('data/2_processed/linkedin/revelio_role_dict.csv', index = False)

Loading library list...
Done


In [None]:
########################################################################################
# Generate our list of french factset ids
########################################################################################
# Export the FactSet entity ids whose standard address is in France.
(db.raw_sql("select factset_entity_id "
            "from factset.edm_standard_address "
            "where iso_country = 'FR'")
   .to_parquet('data/2_processed/admin/factset_french_domiciled.parquet'))
# `db.close` without parentheses only referenced the method and left the
# connection open; call it.
db.close()

In [7]:
########################################################################################
# Observe the number of data  workers in each market 
########################################################################################
# set parameters  
# Open a WRDS connection for the queries in this section.
db = wrds.Connection(wrds_username='am0195')
year_range = range(2008, 2024)

# role_k1500 labels of every role flagged as a data role in the role dictionary.
data_roles = (
    pd.read_csv('data/2_processed/linkedin/revelio_role_dict.csv')
    .loc[lambda x: x['data'].eq(1), 'role_k1500']
    .tolist()
)

# Scratch directory for per-country output (created if missing).
temp_direct = 'data/2_processed/linkedin/temp_ctry_output'
os.makedirs(temp_direct, exist_ok=True)

linkedin_to_iso_cross_walk = pd.read_csv('data/2_processed/admin/linkedin_to_iso_crosswalk.csv')
    

# define helper functions 
def collapse_wrapper(year, data_output):
    """Total weighted headcount and compensation per country for one year.

    A position counts for ``year`` when it started in or before that year and
    either has no end date or ended in or after that year.

    Parameters
    ----------
    year : int
        Calendar year to evaluate.
    data_output : pandas.DataFrame
        Position records with columns ``ctry``, ``weight``,
        ``total_compensation``, ``startdate`` and ``enddate``.
        Side effect: ``startdate`` and ``enddate`` are converted to datetimes
        in place on this frame (unparseable values become NaT).

    Returns
    -------
    pandas.DataFrame
        One row per ``ctry`` with ``ctry_data_empl`` (sum of weights),
        ``ctry_data_comp`` (sum of weight * total_compensation) and ``year``.
    """
    # Converted on the caller's frame, so repeated calls find datetimes already.
    for date_col in ('startdate', 'enddate'):
        data_output[date_col] = pd.to_datetime(data_output[date_col], errors='coerce')

    began = data_output['startdate'].dt.year.le(year)
    not_ended = data_output['enddate'].isna() | data_output['enddate'].dt.year.ge(year)

    flagged = data_output.assign(
        valid=began & not_ended,
        comp=data_output['total_compensation'] * data_output['weight'],
    )
    active = flagged.loc[flagged['valid']]

    totals = active.groupby('ctry').agg(
        ctry_data_empl=('weight', 'sum'),
        ctry_data_comp=('comp', 'sum'),
    ).reset_index()
    totals['year'] = year
    return totals


## GENERATE THE NUMBER / COMPENSATION FOR ALL DATA ROLES IN EACH COUNTRY 
# Every (country, year) pair, so that pairs without any data worker still
# appear in the output (with zeros).
possible_combos = pd.DataFrame(
    list(itertools.product(linkedin_to_iso_cross_walk['ctry'].unique(), year_range)),
    columns=['ctry', 'year'],
)

# All positions in a data role, joined to the crosswalk on their shared column(s).
data_roles_output = db.raw_sql(
    """
       SELECT country, weight, total_compensation, startdate, enddate 
       FROM revelio.individual_positions 
       WHERE role_k1500 IN %(data_roles)s
       """,
    params={"data_roles": tuple(data_roles)},
).merge(linkedin_to_iso_cross_walk)

# Collapse to country x year, then right-join onto the full grid and fill the
# pairs with no observations with zero.
data_roles_output = (
    pd.concat([collapse_wrapper(year, data_roles_output) for year in year_range])
    .merge(possible_combos, how='right')
    .fillna({'ctry_data_empl': 0, 'ctry_data_comp': 0})
)
data_roles_output.to_parquet('data/2_processed/linkedin/data_roles_in_all_countries.parquet')

Loading library list...
Done


In [None]:
########################################################################################
# Observe the Origin Universities /graduation dates of French Data Scientists 
########################################################################################

year_range = range(2008,2024)
# Worker count: leave ten cores free but never go below one (a zero or negative
# n_jobs is not a valid "use N workers" request for joblib).
cores = max(1, (os.cpu_count() or 1) - 10)
data_roles = pd.read_csv('data/2_processed/linkedin/revelio_role_dict.csv').loc[lambda x: x['data'].eq(1), 'role_k1500'].tolist()

# Every position held in France in one of the data roles.
params = {"data_roles": tuple(data_roles)}
role_output = db.raw_sql(
    """
    SELECT user_id, weight, total_compensation, startdate, enddate 
    FROM revelio.individual_positions 
    WHERE role_k1500 IN %(data_roles)s AND country = 'France'
    """,
    params=params)

# Education records of those users. A user appears once per position in
# role_output, so the ids are de-duplicated before building the IN list.
params = {"data_ids": tuple(role_output['user_id'].drop_duplicates().tolist())}
user_output = (
    db.raw_sql(
        """
        SELECT *
        FROM revelio.individual_user_education 
        where user_id IN %(data_ids)s
        """,
        params=params
        )
    # keep records that have both an rsid and an end date
    .loc[lambda x: x['rsid'].notna() & x['enddate'].notna()]
    .assign(startdate=lambda x: pd.to_datetime(x['startdate'], errors='coerce'),
            enddate=lambda x: pd.to_datetime(x['enddate'], errors='coerce'))
    # graduation year = year of the education record's end date
    .assign(grad_year=lambda x: x['enddate'].dt.year))

def university_collapse_wrapper(year):
    """Count, per French university and graduation year, the data workers active in ``year``.

    Reads the module-level ``role_output`` (positions) and ``user_output``
    (education records). For each user with a position active in ``year`` it
    keeps the education record with the latest end date among those ending on
    or before the position's start, then keeps records whose
    ``university_country`` is 'France'.

    Parameters
    ----------
    year : int
        Observation year.

    Returns
    -------
    pandas.DataFrame
        One row per (university_name, grad_year, rsid, university_location,
        university_country) with ``data_grads`` (sum of weights),
        ``comp_weighted_data_grads`` (sum of weight * total_compensation) and
        ``observation_year``.
    """
    # Positions with parsed dates, an activity flag for `year`, and weighted pay.
    positions = role_output.assign(
        startdate=lambda x: pd.to_datetime(x['startdate'], errors='coerce'),
        enddate=lambda x: pd.to_datetime(x['enddate'], errors='coerce'),
    )
    positions = positions.assign(
        valid=lambda x: x['startdate'].dt.year.le(year)
                        & (x['enddate'].isna() | x['enddate'].dt.year.ge(year)),
        comp=lambda x: x['total_compensation'] * x['weight'],
    )
    active_positions = positions.loc[positions['valid']]

    # Education dates are renamed so they do not clash with the position dates;
    # the merge joins on whatever column names the two frames still share.
    education = user_output.rename(columns={'startdate': 'uni_start_date',
                                            'enddate': 'uni_end_date'})
    matched = active_positions.merge(education)

    # Only degrees finished on or before the position started ...
    finished_before = matched.loc[matched['uni_end_date'].le(matched['startdate'])]
    # ... and, per user, the single row with the latest such end date.
    latest_rows = finished_before.groupby('user_id')['uni_end_date'].idxmax()
    latest_degree = finished_before.loc[latest_rows]
    french_degree = latest_degree.loc[latest_degree['university_country'].eq('France')]

    collapsed = (
        french_degree
        .groupby(['university_name', 'grad_year', 'rsid', 'university_location', 'university_country'])
        .agg(data_grads=('weight', 'sum'),
             comp_weighted_data_grads=('comp', 'sum'))
        .assign(observation_year=year)
    )
    return collapsed.reset_index()

# One collapse per observation year, run in parallel and stacked.
yr_lvl_dta_uni = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(university_collapse_wrapper)(year) for year in year_range
    ),
    ignore_index=True,
)
yr_lvl_dta_uni.to_parquet('data/2_processed/linkedin/data_grads_across_france.parquet')

### GENERATE A LIST OF LOCATIONS SO WE CAN START TRYING TO MATCH TO FIRM DATA
simple_uni_x_location = (
    # Rows that attain their location's maximum data_grads value (the merge
    # joins on the shared columns), reduced to distinct name / location pairs.
    pd.merge(
        yr_lvl_dta_uni,
        yr_lvl_dta_uni.groupby('university_location', as_index=False)['data_grads'].max(),
    )
    [['university_name', 'university_location']]
    .drop_duplicates()
    # Attach the number of distinct universities observed at each location.
    .merge(
        yr_lvl_dta_uni[['university_name', 'university_location']]
        .drop_duplicates()
        .assign(num_unis=lambda df: df.groupby('university_location')['university_location'].transform('count'))
        [['university_location', 'num_unis']]
        .drop_duplicates()
    )
)
simple_uni_x_location.to_excel('data/2_processed/admin/uni_x_location_raw.xlsx', index=False)

In [None]:
########################################################################################
# Generate A CROSS BETWEEN LEI AND SIREN CODES 
########################################################################################
# When INSEE is the managing Local Operating Unit (LOU) or the firm is French, it identifies firms with SIREN codes.
## The initial list of LEI codes is provided by GLEIF
##(https://search.gleif.org/#/search/simpleSearch=France&fulltextFilterId=LEIREC_FULLTEXT&currentPage=1&perPage=15&expertMode=false).

# File path
file_path = 'data/1_raw_data/admin/20241105-0000-gleif-goldencopy-lei2-golden-copy.csv'

# Columns of interest and their new names
interest_cols = ['LEI','Entity.LegalName','Entity.LegalAddress.Country', 'Entity.RegistrationAuthority.RegistrationAuthorityEntityID','Registration.ManagingLOU']
new_names = ["lei", "lei_name", 'lei_country', 'lei_siren',"managing_lou"]

crosswalk = (
    pd.read_csv(file_path, usecols=interest_cols, low_memory=False,
                # registration ids are identifiers, not numbers: read them as text
                dtype={'Entity.RegistrationAuthority.RegistrationAuthorityEntityID': str})
    # Rename by name, not by position: read_csv returns the columns in file
    # order, which need not match the order of `interest_cols`.
    .rename(columns=dict(zip(interest_cols, new_names)))
)

crosswalk = (crosswalk.assign(
                 # LEI records whose managing LOU is this specific LOU id
                 insee_registered = lambda c: c['managing_lou'] == '969500Q2MA9VBQ8BG884',
                 # keep only letters and digits of the registration id
                 lei_siren = lambda c: (c['lei_siren'].fillna('').astype(str)
                                        .str.replace(r'[^a-zA-Z0-9]', '', regex=True)))
             # French legal address or INSEE-managed
             .loc[lambda x: (x['lei_country'].eq("FR") | x['insee_registered'])])

crosswalk.to_parquet('data/2_processed/admin/LEI_siren_crosswalk.parquet')

In [None]:
########################################################################################
# FIND ALL ROLES LOCATED IN FRANCE OR ASSIGNED TO A FRENCH PERSON 
########################################################################################

####
# FIND THE ROLES OF ALL FRENCH USERS 
####
# The country is a string literal and must be quoted; unquoted, the database
# reads `France` as a column name and the query fails.
command = "select user_id from revelio.individual_user where user_country = 'France'"
french_users = db.raw_sql(command)

all_roles = []

# Define the batch size
batch_size = 100000
# .tolist() yields plain Python values for the SQL parameters.
user_ids = tuple(french_users['user_id'].unique().tolist())

# Split user_ids into smaller batches and execute the query for each batch
for i in range(0, len(user_ids), batch_size):
    batch_user_ids = user_ids[i:i + batch_size]
    params = {"user_ids": batch_user_ids}

    # Execute the query for the current batch
    roles_batch = db.raw_sql(
        "SELECT rcid, ultimate_parent_rcid "
        "FROM revelio.individual_positions "
        "WHERE user_id IN %(user_ids)s "
        ,
        params=params,
    )

    # Append the result of this batch to the list
    all_roles.append(roles_batch)

# Positions located in France, whoever holds them. This query has no
# placeholders, so no parameters are passed (the original forwarded the last
# batch's leftover `params`).
french_roles = db.raw_sql(
        "SELECT rcid, ultimate_parent_rcid "
        "FROM revelio.individual_positions "
        "WHERE country = 'France' "
    )
all_roles.append(french_roles)

# Concatenate all batches into a single DataFrame
roles = pd.concat(all_roles, ignore_index=True)


########################################################################################
# FIND ALL COMPANIES THAT HAVE HIRED AT LEAST ONE OF THESE ROLES OR HAD A RELATION DO SO
########################################################################################

rcids = tuple(roles['rcid'].unique())
# Parent ids as integer strings with missing values removed (otherwise the list
# would contain 'nan', and float formatting such as '123.0').
# NOTE(review): assumes ultimate_parent_rcid holds whole-number ids - confirm.
parent_rcids = tuple(map(str, roles['ultimate_parent_rcid'].dropna().astype('int64').unique()))

all_companies = []
## FIND ALL COMPANIES BASED ON THEIR RCID
for i in range(0, len(rcids), batch_size):
    batch_rcids = rcids[i:i + batch_size]
    params = {"rcids": batch_rcids}

    # Execute the query for the current batch
    companies_batch = db.raw_sql("SELECT * "
                      "FROM  revelio.company_mapping "
                       "WHERE rcid IN %(rcids)s ",
                     params = params)

    # Append the result of this batch to the list
    all_companies.append(companies_batch)

#### FIND ALL COMPANIES BASED ON THEIR PARENT RCID
for i in range(0, len(parent_rcids), batch_size):
    # Slice the parent ids: the original sliced `rcids` here, so this loop
    # never actually looked companies up by their parent.
    batch_rcids = parent_rcids[i:i + batch_size]
    params = {"rcids": batch_rcids}

    # Execute the query for the current batch
    companies_batch = db.raw_sql("SELECT * "
                      "FROM  revelio.company_mapping "
                       "WHERE ultimate_parent_rcid IN %(rcids)s ",
                     params = params)

    # Append the result of this batch to the list
    all_companies.append(companies_batch)

# A company found by both routes is kept once.
companies = pd.concat(all_companies, ignore_index=True).drop_duplicates()
companies['year_founded'] = pd.to_numeric(companies['year_founded'], errors='coerce')


### CLEAN THE NAMES OF THOSE FIRMS 
# Common words written by the SIREN name-cleaning section, one per line.
with open('data/2_processed/admin/common_words.txt', 'r') as file:
    common_words = set(file.read().splitlines())

# Leave ten cores free but always use at least one worker / chunk; a zero or
# negative value breaks np.array_split and joblib.Parallel.
chunks = max(1, (os.cpu_count() or 1) - 10)
company_chunks = np.array_split(companies, chunks)


def wrapper(index):
    """Clean, then strip common words from, the chunk ``company_chunks[index]``.

    Applies ``hf.clean_firm_names`` to the ``company`` column and then
    ``hf.strip_words`` to ``company_cleaned`` with the module-level
    ``common_words`` set; returns the result of ``hf.strip_words``.
    """
    temp = hf.clean_firm_names(company_chunks[index], "company", True)
    return hf.strip_words(temp, 'company_cleaned', common_words)


companies = pd.concat(Parallel(n_jobs=chunks, backend='multiprocessing')
                          (delayed(wrapper)(index) for index in range(chunks)), ignore_index = True)
# Drop firms whose cleaned name is empty.
companies = companies.loc[~companies['company_cleaned'].eq("")]
companies.to_parquet('data/1_raw_data/linkedin/revelio/france_affiliated_firms.parquet')

In [None]:
########################################################################################
# FIND ALL ROLES TIED TO A COMPANY WITH SOME CONNECTION TO FRANCE
########################################################################################
year_range = range(2008,2024)
# Flags aggregated per firm; 'french' is derived per position, the rest come
# from the role dictionary.
int_vars = ['french', 'total', 'engineer', 'data','rnd','stem']
role_dict_vars = ['role_id'] + int_vars[1:]
# Number of company batches pulled from the database one at a time.
chunks = 1000
# Worker count: leave ten cores free but never go below one.
cores = max(1, (os.cpu_count() or 1) - 10)
export_path = 'data/2_processed/linkedin/temp_collapsed_roles'
os.makedirs(export_path, exist_ok=True)


# The role dictionary is read once and reused for the role_k1500 -> role_id map.
role_dict = pd.read_csv('data/2_processed/linkedin/revelio_role_dict.csv')
role_to_id = role_dict.set_index('role_k1500')['role_id'].to_dict()
country_dict = pd.read_csv('data/2_processed/linkedin/revelio_country_dict.csv')
france_id = country_dict.loc[country_dict['country'] == 'France', 'country_id'].values[0]
# user_ids stored in the per-country user file for France.
french_users = pd.read_parquet('data/2_processed/linkedin/user_components_by_country/users_'+ str(france_id)+".parquet")['user_id']

companies = pd.read_parquet('data/1_raw_data/linkedin/revelio/france_affiliated_firms.parquet')[['rcid']].drop_duplicates()
company_chunks = np.array_split(companies, chunks)

def collapse_and_clean(chunk):
    """Collapse a slice of position records to firm x year totals.

    Reads the module-level ``role_to_id``, ``french_users``, ``role_dict``,
    ``role_dict_vars``, ``int_vars`` and ``year_range``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Position records with the columns selected by the query in the loop
        below (``user_id``, ``role_country``, ``startdate``, ``enddate``,
        ``role_k1500``, ``weight``, ``total_compensation``, ``rcid``, ...).

    Returns
    -------
    pandas.DataFrame
        One row per ``rcid`` and year with ``emp_<flag>`` (rounded weighted
        headcount) and ``comp_<flag>`` (rounded weighted compensation / 1000)
        for every flag in ``int_vars``, plus ``share_emp_french`` and
        ``share_comp_french``; sorted by ``rcid``.
    """
    # Dates are parsed column-wise (unparseable values become NaT, as in the
    # other collapse helpers). Unlike an element-wise apply this also yields a
    # datetime column for an empty slice, so `.dt` below cannot fail on it.
    chunk = (chunk
             .assign(role_id=chunk['role_k1500'].map(role_to_id),
                     french=chunk['role_country'].eq('France') | chunk['user_id'].isin(french_users),
                     startdate=pd.to_datetime(chunk['startdate'], errors='coerce'),
                     enddate=pd.to_datetime(chunk['enddate'], errors='coerce'))
             .merge(role_dict[role_dict_vars], on='role_id', how='left'))

    emp_cols = [f'emp_{col}' for col in int_vars]
    comp_cols = [f'comp_{col}' for col in int_vars]

    output_list = []
    for year in year_range:
        temp = chunk.copy()
        # 1 when the position is active in `year` (started by then and either
        # still open or ended in/after that year), else 0.
        temp['valid'] = ((temp['startdate'].dt.year <= year) &
                         ((temp['enddate'].dt.year >= year) | temp['enddate'].isna())).astype(int)

        # Weighted headcount and compensation contribution of each position.
        for col in int_vars:
            temp[f'emp_{col}'] = temp[col] * temp['valid'] * temp['weight']
            temp[f'comp_{col}'] = temp[col] * temp['valid'] * temp['weight'] * temp['total_compensation'] / 1000

        output = temp.groupby('rcid').agg({
            **{col: 'sum' for col in emp_cols},
            **{col: 'sum' for col in comp_cols}
        }).reset_index()

        # Headcounts are rounded before the shares are computed; compensation
        # is rounded only afterwards.
        output[emp_cols] = output[emp_cols].round()
        output['share_emp_french'] = output['emp_french'] / output['emp_total']
        output['share_comp_french'] = output['comp_french'] / output['comp_total']
        output['year'] = year
        output[comp_cols] = output[comp_cols].round(0)

        output_list.append(output)
    return pd.concat(output_list).sort_values(by=['rcid'])


# Process the company batches one at a time; a batch whose output file already
# exists is skipped, so an interrupted run can be resumed.
for index in reversed(range(chunks)):
    export_file = export_path + f'/chunk_{index}.parquet'
    if not os.path.isfile(export_file):
        rcid_list = tuple(map(str, company_chunks[index]['rcid']))
        print(f'starting import {index}')
        df = db.raw_sql(
                "SELECT user_id, position_id, country AS role_country, startdate, enddate, role_k1500, "
                "weight, seniority, total_compensation, rcid "
                "FROM revelio.individual_positions WHERE rcid IN %(rcids)s ",
                params={"rcids": rcid_list})

        if df.empty:
            # Nothing to collapse for this batch.
            print(f'no positions found for chunk {index}')
            continue

        print(f'starting processing {index}')
        # Split by firm so that all positions of one rcid land in the same
        # slice; slices left empty (fewer firms than workers) are dropped.
        df_rcid_chunks = np.array_split(df[['rcid']].drop_duplicates(), cores)
        df_chunks = [df.loc[df['rcid'].isin(rcid_chunk['rcid'])] for rcid_chunk in df_rcid_chunks]
        df_chunks = [chunk for chunk in df_chunks if not chunk.empty]

        roles_yr_level = pd.concat(Parallel(n_jobs=cores, backend='multiprocessing')(delayed(collapse_and_clean)(chunk) for chunk in df_chunks),ignore_index = True)

        roles_yr_level.to_parquet(export_file)
        clear_output(wait=True)

# Stack every per-batch file into one table, then remove the scratch directory.
file_list = sorted(glob.glob(export_path + '/*'))
output_list = []
for file in file_list:
    # show only the file currently being read
    clear_output(wait=True)
    print(file)
    output_list.append(pd.read_parquet(file))

output_list = pd.concat(output_list)
output_list.to_parquet('data/2_processed/linkedin/french_affiliated_firm_roles_collapsed_raw.parquet')
shutil.rmtree(export_path)

In [None]:
########################################################################################
# USE ROLE DATA TO DETERMINE WHICH FIRMS ARE LIKELY FRENCH 
########################################################################################

# Reference lists used as evidence for or against a firm being French.
# LEIs present in the LEI / SIREN crosswalk file.
french_leis = pd.read_parquet('data/2_processed/admin/LEI_siren_crosswalk.parquet')['lei'].unique()
# Domain endings (dots removed) whose `include` cell is empty:
# `include != include` is true only for NaN.
non_french_country_domains = (pd.read_excel('data/2_processed/admin/domain_names_by_country.xlsx')
                              .assign(name=lambda x: x['name'].str.replace('.', '', regex=False))
                              .query('include != include')['name'])
# FactSet entity ids exported by the French-domiciled FactSet query.
french_factset_ids = pd.read_parquet('data/2_processed/admin/factset_french_domiciled.parquet')['factset_entity_id'].unique()
# Firm x year employment / compensation totals.
roles_data = pd.read_parquet('data/2_processed/linkedin/french_affiliated_firm_roles_collapsed_raw.parquet')

companies = (
    ## determine the firm's max total / french / data values and shares 
    ## (each statistic is the maximum over all years, taken independently)
    roles_data
    .assign(cost_per_worker = lambda df: df['comp_total'] / df['emp_total'])
    .groupby('rcid', as_index=False)
    .agg({'cost_per_worker':'max','emp_total': 'max','emp_french': 'max','emp_data': 'max',
          'comp_total': 'max','comp_data': 'max','share_emp_french': 'max', 'share_comp_french': 'max'})
    # a firm is eligible only if it had French employment in at least one year
    .assign(french_eligible = lambda c: c['emp_french'].gt(0))
    [['rcid', 'french_eligible','cost_per_worker', 'share_emp_french', 'share_comp_french','emp_total', 'emp_data' ,'comp_total', 'comp_data']].
    # attach firm-level attributes; the merge joins on the shared column name(s)
    merge(pd.read_parquet('data/1_raw_data/linkedin/revelio/france_affiliated_firms.parquet'))
    
    ### determine whether the firm is likely french or not 
    # text after the last '.' of the url (None when the url is not a string)
    .assign(url_ending = lambda c: c['url'].apply(lambda x: x.split('.')[-1] if isinstance(x, str) else None))
    # admin_score: each signal adds 1 when it points to France and subtracts 1
    # when it points elsewhere; identifier signals subtract only when the
    # identifier is present.
    .assign(
        admin_score=lambda c: 0
        ## TOP LEVEL DOMAIN 
        + c['url_ending'].eq('fr')  
        - c['url_ending'].isin(non_french_country_domains)  

        ## FACTSET
        + c['factset_entity_id'].isin(french_factset_ids)
        - (~c['factset_entity_id'].isin(french_factset_ids) & c['factset_entity_id'].notna())

        # LEI CHECK 
        +  c['lei'].isin(french_leis) 
        -  (~c['lei'].isin(french_leis) & c['lei'].notna()) 

        # ISIN 
        + c['isin'].str[:2].eq("FR") # add if french isin
        - (~c['isin'].str[:2].eq("FR") & c['isin'].notna())

        # CUSIP 
        + c['cusip'].str[:1].eq("F") # add if french cusip
        - (~c['cusip'].str[:1].eq("F") & c['cusip'].notna())

        # Firm type 
        # NOTE(review): firm_type_french_likelihood comes from the merged firm
        # file; presumably produced by the name-cleaning helper - confirm.
        + c['firm_type_french_likelihood'].eq("likely french") 
        - c['firm_type_french_likelihood'].eq("unlikely french"))
     # likely French = eligible AND (positive score, OR a score of exactly zero
     # with more than half of employment or compensation being French)
     .assign(
         likely_french = lambda c: 
         c['french_eligible'] & (
         c['admin_score'].gt(0) | 
         (c['admin_score'].eq(0) & (c['share_emp_french'].gt(.5) | c['share_comp_french'].gt(.5)))))
    
    #### mark whether it's a subsidiary (rcid differs from its ultimate parent),
    #### public (has a ticker) and whether it has an LEI
    .assign(subsidiary = lambda c: c['rcid'] != c['ultimate_parent_rcid'],
            public = lambda df: ~df['ticker'].isna(),
            has_lei = lambda df: ~df['lei'].isna())
    [['rcid','lei','company','company_cleaned', 'company_stripped', 'year_founded', 'ultimate_parent_rcid',
      'likely_french', 'subsidiary', 'public', 'has_lei','share_emp_french', 'share_comp_french',
      'emp_total', 'emp_data' ,'comp_total', 'comp_data','cost_per_worker']]
)
companies.to_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')