In [2]:
## SETUP
# Plain import statements (instead of exec'ing strings) so that linters, IDEs and
# readers can see the notebook's dependencies directly.
import os
import pickle
import re
import shutil  # used by the firm-year cell to delete the intermediate directory
import sys
import unicodedata

import numpy as np
import pandas as pd
import wrds
from IPython.display import display, HTML, clear_output
from joblib import Parallel, delayed
from name_matching.name_matcher import NameMatcher

# All paths below are relative; if the working directory does not already end in
# 'Big Data', step two levels up.
# NOTE(review): presumably the notebook lives two levels below that folder - confirm.
if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

# Make the helper module importable; skip the append when the cell is re-run so
# sys.path does not accumulate duplicate entries.
helper_dir = 'trade_data_code/2_python'
if helper_dir not in sys.path:
    sys.path.append(helper_dir)
import A_helper_functions as hf



In [None]:
########################################################################################
# Generate Firm-Year Dataset 
########################################################################################
import shutil  # required by the final rmtree; the setup cell's import list does not include it

role_dir = 'data/2_processed/linkedin/role_components/'
collapsed_dir = 'data/2_processed/linkedin/role_components_collapsed/'
country_dict = pd.read_csv('data/2_processed/linkedin/revelio_country_dict.csv')
france_id = country_dict.loc[country_dict['country'] == 'France', 'country_id'].values[0]
french_users = pd.read_parquet('data/2_processed/linkedin/user_components_by_country/users_'+ str(france_id)+".parquet" )
# Hoisted out of the per-file loop: the set of French user ids never changes.
french_user_ids = french_users['user_id'].unique()

role_dict = pd.read_csv('data/2_processed/linkedin/revelio_role_dict.csv')
# Sorted so that processing order (and the row order of the final file) is deterministic.
role_files = sorted(os.listdir(role_dir))
year_range = range(2008,2024)
int_vars = ['french', 'total', 'engineer', 'data','rnd','stem']
# 'french' is computed per role below; the remaining indicators are read from role_dict.
role_dict_vars = ['role_id'] + int_vars[1:]


def collapse_roles_to_firm_year(roles, years, measures):
    """Collapse role-level rows to one row per firm (`rcid`) and year.

    Parameters
    ----------
    roles : DataFrame with columns `rcid`, `startdate`, `enddate` (datetime),
        `weight`, `total_compensation` and one column per entry of `measures`.
    years : iterable of int calendar years to evaluate.
    measures : list of column names; must include 'french' and 'total', which
        are used for the share columns.

    Returns
    -------
    DataFrame with `rcid`, `emp_<m>` (rounded weighted head count),
    `comp_<m>` (weighted compensation / 1000), `share_emp_french`,
    `share_comp_french` and `year`, sorted by `rcid` then `year`.
    """
    # Year-invariant pieces are computed once instead of once per year.
    start_year = roles['startdate'].dt.year
    end_year = roles['enddate'].dt.year
    no_end_date = roles['enddate'].isna()

    emp_weights = roles[measures].mul(roles['weight'], axis=0)
    comp_weights = emp_weights.mul(roles['total_compensation'], axis=0) / 1000
    emp_weights = emp_weights.add_prefix('emp_')
    comp_weights = comp_weights.add_prefix('comp_')
    emp_cols = list(emp_weights.columns)

    yearly = []
    for year in years:
        # 1 if the role covers `year`: started on/before it and either ended
        # on/after it or has no end date; 0 otherwise.
        active = ((start_year <= year) & ((end_year >= year) | no_end_date)).astype(int)

        firm_year = (
            pd.concat([emp_weights.mul(active, axis=0),
                       comp_weights.mul(active, axis=0)], axis=1)
            .groupby(roles['rcid'])
            .sum()
            .reset_index()
        )
        firm_year[emp_cols] = firm_year[emp_cols].round()

        # Firms with a zero denominator get NaN/inf here; this is left as-is.
        firm_year['share_emp_french'] = firm_year['emp_french'] / firm_year['emp_total']
        firm_year['share_comp_french'] = firm_year['comp_french'] / firm_year['comp_total']
        firm_year['year'] = year
        yearly.append(firm_year)

    return pd.concat(yearly).sort_values(by=['rcid', 'year'])


### COLLAPSE ALL THE ROLE DATA TO THE FIRM-YEAR LEVEL 
# exist_ok=True keeps the step resumable: a plain os.mkdir raises FileExistsError
# on re-run, before the per-file "already done?" check can ever be reached.
os.makedirs(collapsed_dir, exist_ok=True)
for i, role_file in enumerate(role_files):
    output_file = collapsed_dir + role_file
    if os.path.exists(output_file):
        continue  # already collapsed in a previous run
    print(i / len(role_files))
    roles = (pd.read_parquet(role_dir + role_file)
             .rename(columns={'role_k1500': 'role_id'}))
    roles = pd.merge(roles, role_dict[role_dict_vars], on='role_id', how='left')
    # A role counts as French if it is located in France or held by a French user.
    roles['french'] = ((roles['role_country'] == 'France') | (roles['user_id'].isin(french_user_ids))).astype(int)
    roles[['startdate', 'enddate']] = roles[['startdate', 'enddate']].apply(pd.to_datetime)

    collapse_roles_to_firm_year(roles, year_range, int_vars).to_parquet(output_file)

### CONCATENATE ALL THE CLEANED FILES AND THEN DELETE COMPONENTS 
collapsed_parts = []
for i, role_file in enumerate(role_files):
    print(i / len(role_files))
    collapsed_parts.append(pd.read_parquet(collapsed_dir + role_file))

firm_year_panel = pd.concat(collapsed_parts)
firm_year_panel.to_parquet('data/2_processed/linkedin/french_affiliated_firm_roles_collapsed_raw.parquet')
# Only remove the intermediates once the combined file has been written.
shutil.rmtree(collapsed_dir)

In [3]:
########################################################################################
# USE ROLE DATA TO DETERMINE WHICH FIRMS ARE LIKELY FRENCH 
########################################################################################

# ---- reference lists used as administrative evidence ----
french_leis = pd.read_parquet('data/2_processed/admin/LEI_siren_crosswalk.parquet')['lei'].unique()

domain_table = pd.read_excel('data/2_processed/admin/domain_names_by_country.xlsx')
domain_table = domain_table.assign(name=domain_table['name'].str.replace('.', '', regex=False))
# NaN is the only value unequal to itself, so this keeps rows whose `include` is missing.
non_french_country_domains = domain_table.query('include != include')['name']

french_factset_ids = pd.read_parquet('data/2_processed/admin/factset_french_domiciled.parquet')['factset_entity_id'].unique()

# ---- per-firm maxima over all years of the collapsed role data ----
max_cols = ['emp_total', 'emp_french', 'share_emp_french', 'share_comp_french', 'emp_data']
role_maxima = (
    pd.read_parquet('data/2_processed/linkedin/french_affiliated_firm_roles_collapsed_raw.parquet')
    .groupby('rcid', as_index=False)
    .agg(dict.fromkeys(max_cols, 'max'))
)
role_maxima['french_eligible'] = role_maxima['emp_french'].gt(0)
role_maxima['data_eligible'] = role_maxima['emp_data'].gt(0)
role_maxima['role_eligible'] = role_maxima['data_eligible'] & role_maxima['french_eligible']

# ---- attach firm-level identifiers (inner merge on the shared columns) ----
firm_info = pd.read_parquet('data/1_raw_data/linkedin/revelio/france_affiliated_firms.parquet')
companies = role_maxima[
    ['rcid', 'french_eligible', 'data_eligible', 'role_eligible', 'share_emp_french', 'share_comp_french']
].merge(firm_info)


def _last_url_segment(url):
    """Return the text after the final '.' of a string url, or None for non-strings."""
    if not isinstance(url, str):
        return None
    return url.split('.')[-1]


def _net_evidence(supports, contradicts):
    """+1 where `supports` is True, -1 where `contradicts` is True, per row."""
    return supports.astype(int) - contradicts.astype(int)


companies['url_ending'] = companies['url'].apply(_last_url_segment)

# Each identifier votes +1 when it points to France, -1 when it is present but
# points elsewhere, and 0 when it is missing.
factset_is_french = companies['factset_entity_id'].isin(french_factset_ids)
lei_is_french = companies['lei'].isin(french_leis)
isin_is_french = companies['isin'].str[:2].eq("FR")
cusip_is_french = companies['cusip'].str[:1].eq("F")

companies['admin_score'] = (
    # top-level domain
    _net_evidence(companies['url_ending'].eq('fr'),
                  companies['url_ending'].isin(non_french_country_domains))
    # FactSet
    + _net_evidence(factset_is_french,
                    ~factset_is_french & companies['factset_entity_id'].notna())
    # LEI
    + _net_evidence(lei_is_french,
                    ~lei_is_french & companies['lei'].notna())
    # ISIN prefix
    + _net_evidence(isin_is_french,
                    ~isin_is_french & companies['isin'].notna())
    # CUSIP prefix
    + _net_evidence(cusip_is_french,
                    ~cusip_is_french & companies['cusip'].notna())
    # firm type
    + _net_evidence(companies['firm_type_french_likelihood'].eq("likely french"),
                    companies['firm_type_french_likelihood'].eq("unlikely french"))
)

# A firm is likely French when it has French employment AND either the
# administrative evidence is net positive, or it is neutral and more than half
# of (peak) employment or compensation is French.
majority_french = companies['share_emp_french'].gt(.5) | companies['share_comp_french'].gt(.5)
companies['likely_french'] = companies['french_eligible'] & (
    companies['admin_score'].gt(0)
    | (companies['admin_score'].eq(0) & majority_french)
)

# A firm whose rcid differs from its ultimate parent's rcid is flagged as a subsidiary.
companies['subsidiary'] = companies['rcid'].ne(companies['ultimate_parent_rcid'])

companies = companies[
    ['rcid', 'lei', 'company', 'company_cleaned', 'company_stripped', 'year_founded', 'ultimate_parent_rcid',
     'french_eligible', 'data_eligible', 'role_eligible', 'likely_french', 'subsidiary']
]
companies.to_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')
