In [2]:
## SETUP
# Plain import statements bind exactly the same names as exec()-ing a list of strings,
# while keeping the notebook's dependencies visible to readers and linters.
import wrds
import pandas as pd
import os
import re
import pickle
import numpy as np
from name_matching.name_matcher import NameMatcher
from joblib import Parallel, delayed
from IPython.display import display, HTML, clear_output
import unicodedata
import sys
import matplotlib.pyplot as plt
import glob
import shutil
from sklearn.decomposition import PCA

# All paths below are relative: step up two directories unless the working
# directory already ends with 'Big Data'.
if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

# Make the helper module importable; the membership check keeps sys.path free of
# duplicate entries when this cell is re-run.
helper_dir = 'trade_data_code/2_python'
if helper_dir not in sys.path:
    sys.path.append(helper_dir)
import A_helper_functions as hf

In [None]:
########################################################################################
# Find ancillary information associated with our matched companies
########################################################################################
matching_output = pd.read_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')
num_chunks = 50
# Checkpoint folder used only by this cell, so chunk files left behind by an interrupted
# run of a different cell are never mistaken for finished chunks of this one.
temp_direct = 'data/2_processed/linkedin/temp_base_info'
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(matching_output['rcid'].unique(), num_chunks)

for index in range(num_chunks):
    file_path = temp_direct + "/temp" + str(index) + ".parquet"
    # A chunk whose checkpoint file already exists is skipped, so the loop can be resumed.
    if not os.path.exists(file_path):
        clear_output(wait=True)
        print(str(round(100*(index+1)/num_chunks,2))+ '%')
        temp = (
            db.raw_sql(
                """
                SELECT rcid, child_rcid, ultimate_parent_rcid, ticker, lei,  hq_zip_code, hq_metro_area, hq_state, hq_country
                FROM  revelio.company_mapping 
                WHERE rcid IN %(rcid_list)s
                """,
                params = {'rcid_list': tuple(chunks[index].tolist())})
            .assign(has_subsid = lambda x: ~x['rcid'].eq(x['child_rcid']),
                    is_subsid = lambda x: ~x['rcid'].eq(x['ultimate_parent_rcid']),
                    is_public = lambda x: ~x['ticker'].isna(),
                    has_lei = lambda x: ~x['lei'].isna(),
                    # True only when the headquarters country is France (previously this was
                    # True for any non-missing country); mirrors the 'France' comparison
                    # used for the parent company below.
                    french_hq = lambda x: x['hq_country'].eq('France'),
                    hq_metro_area =lambda x: x['hq_metro_area']
                    .str.replace('france nonmetropolitan area', 'non_metro', regex=False)
                    .str.replace('metropolitan area', '', regex=False)
                    .str.strip()))

        # Parents of the subsidiaries in this chunk; missing ids are dropped before querying.
        parent_ids = temp.loc[temp['is_subsid'], 'ultimate_parent_rcid'].dropna().unique().tolist()
        if parent_ids:
            parent_temp = (
                db.raw_sql(
                """
                SELECT ultimate_parent_rcid, hq_country
                FROM  revelio.company_mapping 
                WHERE rcid IN %(rcid_list)s
                """,
                params = {'rcid_list':  tuple(parent_ids)})
                # Left missing (not False) when the parent's country is unknown.
                .assign(parent_non_french_hq = lambda x: (~x['hq_country'].eq('France')).where(x['hq_country'].notna()))
                .rename(columns={"hq_country": "parent_hq_country"})
                # Identical parent rows would otherwise multiply firm rows in the merge.
                .drop_duplicates())
            temp = pd.merge(temp, parent_temp, how = 'left', on = 'ultimate_parent_rcid')
        else:
            # No subsidiaries in this chunk: an empty IN () list is not valid SQL, so skip
            # the query and add the parent columns as missing.
            temp = temp.assign(parent_hq_country = np.nan, parent_non_french_hq = np.nan)
        temp.to_parquet(file_path)

# Sorting the glob makes the row order of the combined file reproducible.
chunk_files = sorted(glob.glob(temp_direct + "/*.parquet"))
(pd.concat([pd.read_parquet(file) for file in chunk_files],ignore_index = True)
 .to_parquet('data/2_processed/linkedin/matched_firm_base_info.parquet'))
shutil.rmtree(temp_direct)

In [None]:
########################################################################################
# FIND ALL ROLES ASSOCIATED WITH OUR MATCHED COMPANIES
########################################################################################
matching_output = pd.read_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')
num_chunks = 50
temp_direct = 'data/2_processed/linkedin/temp_role'
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(matching_output['rcid'].unique(), num_chunks)

# Pull positions one chunk of company ids at a time; each chunk is checkpointed to its
# own parquet file, and chunks with an existing file are skipped.
for index in range(num_chunks):
    file_path = f"{temp_direct}/temp{index}.parquet"
    if not os.path.exists(file_path):
        clear_output(wait=True)
        print(f"{round(100*(index+1)/num_chunks,2)}%")
        params = {'rcid_list': tuple(chunks[index].tolist())}
        role_output = db.raw_sql(
            """
            SELECT rcid,user_id,country, weight, total_compensation, startdate, enddate, role_k1500 
            FROM revelio.individual_positions 
            WHERE rcid IN %(rcid_list)s
            """, 
            params= params)
        temp = role_output
        # Unparseable dates become NaT instead of raising.
        for date_col in ('startdate', 'enddate'):
            temp[date_col] = pd.to_datetime(temp[date_col], errors='coerce')
        temp.to_parquet(file_path)

# Stack every chunk file into the final role table, then remove the checkpoints.
chunk_frames = [pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")]
pd.concat(chunk_frames, ignore_index = True).to_parquet('data/2_processed/linkedin/matched_firm_role_output.parquet')
shutil.rmtree(temp_direct)

In [None]:
########################################################################################
# Perform PCA to generate metrics of differentiation
########################################################################################
# Scratch folder used only by the PCA step. Everything matching *.parquet in this folder
# is concatenated into the PCA output and the folder is deleted afterwards, so it must
# not hold checkpoint files that belong to another step.
temp_direct = 'data/2_processed/linkedin/temp_pca'
os.makedirs(temp_direct, exist_ok=True)

def collapse_year_level(year, making_pca, pca_model=None):
    """Build firm-level role compensation shares for `year`, then fit or apply a PCA.

    Reads the notebook-level `output` frame. A position is counted for `year` when it
    started in or before that year and has no end date or ends in or after it. Weighted
    compensation (total_compensation * weight) is summed per firm and role, pivoted to
    one row per firm with one column per role, and each row is divided by its total so
    the columns are shares. Rows containing inf/NaN after the division are dropped.

    Parameters
    ----------
    year : int
        Calendar year to collapse.
    making_pca : bool
        True: fit a 10-component PCA on this year's shares and return it.
        False: project this year's shares with `pca_model` and write them to disk.
    pca_model : fitted PCA, required when `making_pca` is False.

    Returns
    -------
    The fitted PCA when `making_pca` is True. Otherwise None; the scores are written to
    `<temp_direct>/temp<year>.parquet` with columns firmid, year, PC1..PCk.

    Raises
    ------
    ValueError
        If `making_pca` is False and no `pca_model` is supplied.
    """
    temp = (
        output.assign(
            valid=lambda x: x['startdate'].dt.year.le(year) &
                            (x['enddate'].isna() | x['enddate'].dt.year.ge(year)),
            wgted_comp=lambda x: x['total_compensation'] * x['weight']
        )
        .loc[lambda x: x['valid']]
        .groupby(['firmid', 'role_k1500'], as_index=False)
        .agg(comp=('wgted_comp', 'sum'))
        .assign(year=year)
        .pivot_table(index=['firmid', 'year'], columns='role_k1500', values='comp', aggfunc='sum', fill_value=0)
        .pipe(lambda df: df.div(df.sum(axis=1), axis=0))
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    if making_pca:
        pca_model = PCA(n_components=10)
        pca_model.fit(temp)
        return pca_model

    if pca_model is None:
        raise ValueError("pca_model is required when making_pca is False")

    # The pivot only creates columns for roles observed in `year`, so the column set can
    # differ from the one the model was fit on. Align to the fit-time columns: roles
    # missing this year get a share of 0, roles unseen at fit time are dropped.
    # feature_names_in_ is only present when the model was fit on string-named columns.
    fitted_roles = getattr(pca_model, 'feature_names_in_', None)
    if fitted_roles is not None:
        temp = temp.reindex(columns=fitted_roles, fill_value=0)

    scores = pca_model.transform(temp)
    file_path = temp_direct + "/temp" + str(year) + ".parquet"
    pd.concat([
        temp.reset_index()[['firmid', 'year']],
        # Name the components from the actual width of the score matrix.
        pd.DataFrame(scores, columns=[f'PC{i+1}' for i in range(scores.shape[1])])
    ], axis=1).to_parquet(file_path)
    print(year)
        
#set param values
years = range(2008, 2024)
sample_year = 2015

# Load and merge data
long_data = pd.read_parquet('data/2_processed/linkedin/matched_firm_role_output.parquet')
matching_output = pd.read_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')[['rcid', 'siren']].rename(columns={'siren': 'firmid'})
output = pd.merge(long_data, matching_output)

# run pca analysis: fit on the sample year, then project every year in `years`
pca_model = collapse_year_level(sample_year, making_pca=True)
# A plain loop driven by `years` (the calls are made for their side effect of writing
# one parquet file per year), so the year range is defined in exactly one place.
for year in years:
    collapse_year_level(year, False, pca_model)

# Sorting the glob makes the row order of the combined file reproducible.
yearly_files = sorted(glob.glob(temp_direct + "/*.parquet"))
(pd.concat([pd.read_parquet(file) for file in yearly_files],ignore_index = True)
 .to_parquet('data/2_processed/linkedin/matched_firm_pca_output.parquet'))
shutil.rmtree(temp_direct)


In [None]:
########################################################################################
# FIND THE PRESTIGE / education OF ALL EMPLOYEES ASSOCIATED WITH OUR MATCHED COMPANIES 
########################################################################################
num_chunks = 50
temp_direct = 'data/2_processed/linkedin/temp_user_prestige'
os.makedirs(temp_direct, exist_ok=True)
role_output = pd.read_parquet('data/2_processed/linkedin/matched_firm_role_output.parquet')
chunks = np.array_split(role_output['user_id'].unique(), num_chunks)
db = wrds.Connection(wrds_username='am0195')

# Query user attributes one chunk of user ids at a time; a chunk whose checkpoint file
# already exists is skipped.
for index in range(num_chunks):
    file_path = f"{temp_direct}/temp{index}.parquet"
    if not os.path.exists(file_path):
        clear_output(wait=True)
        print(f"{round(100*(index+1)/num_chunks,2)}%")
        user_ids = tuple(chunks[index].tolist())
        temp = db.raw_sql(
            """
            select user_id, prestige, highest_degree 
            from revelio.individual_user 
            where user_id IN %(user_ids)s
            """,
            params= {'user_ids': user_ids})
        temp.to_parquet(file_path)

# Stack every chunk file into the final table, then remove the checkpoints.
chunk_frames = [pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")]
pd.concat(chunk_frames, ignore_index = True).to_parquet('data/2_processed/linkedin/matched_firm_user_prestige.parquet')
shutil.rmtree(temp_direct)

9.8%


In [None]:
########################################################################################
# FIND THE AMOUNT OF WORKERS CURRENTLY WORKING OR WITH EXPERIENCE ABROAD PER COMPANY 
########################################################################################

### SET PARAMETERS
num_chunks = 500
temp_direct = 'data/2_processed/linkedin/temp_user_output'
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')

# rcid -> firmid lookup (siren renamed to firmid)
matching_output = (
    pd.read_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')
    .loc[:, ['rcid', 'siren']]
    .rename(columns={'siren':'firmid'})
)

# Positions at matched firms, tagged with firmid; the work is chunked by firm.
role_output = pd.merge(
    pd.read_parquet('data/2_processed/linkedin/matched_firm_role_output.parquet'),
    matching_output)
chunks = np.array_split(role_output['firmid'].unique(), num_chunks)

# Crosswalk rows whose 'ctry' value appears more than once are flagged for collapsing.
linkedin_to_iso_cross_walk = pd.read_csv('data/2_processed/admin/linkedin_to_iso_crosswalk.csv')
linkedin_to_iso_cross_walk['needs_collapse'] = (
    linkedin_to_iso_cross_walk.groupby('ctry')['ctry'].transform('count').gt(1))

### DEFINE FUNCTIONS 
def run_subsection(index):
    """Scrape and collapse one chunk of firms, writing the result to a parquet file.

    Takes the positions in `role_output` whose firmid falls in `chunks[index]`, queries
    every position ever held by the users in those rows, collapses the result with
    `collapse_year_level` for each year from 2008 to 2023, and writes the stacked output
    to `<temp_direct>/temp<index>.parquet`.

    Parameters
    ----------
    index : int
        Position of the firm chunk in `chunks`.
    """
    clear_output(wait=True)
    print(str(round(100*(index+1)/num_chunks,2))+ '%')

    role_subset = role_output.loc[lambda x: x['firmid'].isin(chunks[index])]
    # .tolist() turns the numpy scalars into plain Python numbers before they are bound
    # as SQL parameters (numpy integer scalars are not reliably adapted by the driver).
    params = {"user_ids":  tuple(role_subset['user_id'].unique().tolist())}
    ever_role_subset = db.raw_sql(
        """
        SELECT user_id, rcid, country, startdate, enddate, weight, total_compensation, seniority
        FROM revelio.individual_positions 
        where user_id IN %(user_ids)s
        """,
        params= params 
        )
    print('finished scraping')
    subset_output = pd.concat([collapse_year_level(year, role_subset, ever_role_subset) for year in range(2008, 2024)],
                            ignore_index=True)
    subset_output.to_parquet(temp_direct + "/temp"+str(index)+".parquet")

    
def collapse_year_level(year, role_subset, ever_role_subset):
    """Collapse position histories to firm x country totals for `year`.

    Users active at a firm in `year` (per `role_subset`) are joined to every position
    they hold in `ever_role_subset`. Positions starting after `year` are dropped. For
    three windows -- 'now' (overlaps `year`), 'l5' (ended no earlier than `year` - 5)
    and 'ever' -- only the position(s) with the highest weighted compensation per
    user/firm/country are kept, and their weight ('empl') and weighted compensation
    ('comp') are summed by firm and country.

    Returns a DataFrame with columns firmid, country, empl_<window>, comp_<window>
    (for the windows that have at least one position) and year.
    """
    flag_names = ['valid_now', 'valid_ever', 'valid_l5']
    person_keys = ['user_id', 'firmid', 'country']

    def _parse_dates(frame):
        # Unparseable dates become NaT.
        return frame.assign(startdate=pd.to_datetime(frame['startdate'], errors='coerce'),
                            enddate=pd.to_datetime(frame['enddate'], errors='coerce'))

    def _spans(frame, earliest_end_year):
        # Started by `year` and either open-ended or ended in/after `earliest_end_year`.
        started = frame['startdate'].dt.year.le(year)
        not_over = frame['enddate'].isna() | frame['enddate'].dt.year.ge(earliest_end_year)
        return started & not_over

    ## Determine which users are active in a given year for a given firm
    firm_roles = _parse_dates(role_subset)
    active_pairs = firm_roles.loc[_spans(firm_roles, year), ['firmid', 'user_id']].drop_duplicates()

    ## Add all the roles those users have ever held, then drop roles starting after `year`
    history = _parse_dates(active_pairs.merge(ever_role_subset).drop_duplicates())
    history = history.assign(comp=history['total_compensation'] * history['weight'])
    history = history.loc[history['startdate'].dt.year.le(year)]

    ## Flag positions per window, then keep only the highest-compensation position(s)
    ## within each window so promotions etc. are not double counted
    history = history.assign(
        valid_ever=history['comp'] == history.groupby(person_keys)['comp'].transform('max'),
        valid_now=_spans(history, year),
        valid_l5=_spans(history, year - 5))
    top_now = history['comp'].eq(history.groupby(person_keys + ['valid_now'])['comp'].transform('max'))
    top_l5 = history['comp'].eq(history.groupby(person_keys + ['valid_l5'])['comp'].transform('max'))
    history = history.assign(valid_now=history['valid_now'] & top_now,
                             valid_l5=history['valid_l5'] & top_l5)

    ## One row per (position, window); keep only the flagged ones
    long_flags = history.melt(
        id_vars=['country', 'firmid', 'comp', 'weight'],
        value_vars=flag_names,
        var_name='valid_type',
        value_name='valid_flag')
    long_flags = long_flags.loc[long_flags['valid_flag']]

    ## Collapse by country / firmid / window
    totals = (long_flags
              .groupby(['country', 'firmid', 'valid_type'])
              .agg(empl=('weight', 'sum'), comp=('comp', 'sum'))
              .reset_index())

    ## Reshape back to wide and flatten the (stat, window) columns to e.g. 'empl_now'
    wide = totals.pivot(index=['firmid', 'country'], columns='valid_type', values=['empl', 'comp'])
    wide.columns = [f'{stat}_{vtype}'.replace('valid_', '') for stat, vtype in wide.columns]
    wide = wide.reset_index()
    wide['year'] = year
    return wide

### EXECUTE SCRAPING AND INITIAL COLLAPSE TO YEAR-firmid-ctry LEVEL 
# Plain loop: run_subsection is called for its side effect of writing one file per chunk.
for index in range(num_chunks):
    run_subsection(index)

### Match to ISO-2 CODES AND EXPORT 
# Read exactly the files written for this run's chunks. A *.parquet glob would also pick
# up files left in the folder by an earlier run (e.g. one with a larger num_chunks).
chunk_files = [temp_direct + "/temp" + str(index) + ".parquet" for index in range(num_chunks)]
combined_output = (
    pd.concat([pd.read_parquet(file) for file in chunk_files],ignore_index = True)
    .merge(linkedin_to_iso_cross_walk, how = 'left'))

# After the left merge, rows without a crosswalk match have a missing needs_collapse
# flag; treat them as "no collapse needed" so the boolean masks below stay valid.
needs_collapse = combined_output['needs_collapse'].eq(True)

columns_to_sum = [col for col in combined_output.columns if 'comp' in col or 'empl' in col]
columns_to_keep = ['firmid', 'year','ctry'] + columns_to_sum

# Rows flagged needs_collapse are summed within firmid/year/ctry; all other rows pass
# through unchanged.
combined_output = pd.concat(
    [combined_output.loc[~needs_collapse, columns_to_keep],
    combined_output.loc[needs_collapse].groupby(['firmid', 'year', 'ctry'])[columns_to_sum].sum().reset_index()]
    , axis=0, ignore_index=True)

combined_output.to_parquet('data/2_processed/linkedin/matched_firm_foreign_employment.parquet')

In [1]:
########################################################################################
# Generate our compensation / employment datasets at the year level 
########################################################################################
role_dict = pd.read_csv('data/2_processed/linkedin/revelio_role_dict.csv')
# Flag columns to aggregate: every role_dict column except the key and the CSV index
# column, plus 'french_data'. Built as a list in role_dict column order (not from a
# set) so the order is the same on every run.
int_cols = [col for col in role_dict.columns if col not in ('role_k1500', 'Unnamed: 0')]
if 'french_data' not in int_cols:
    int_cols.append('french_data')
matching_output = pd.read_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')[['rcid', 'siren']].rename(columns={'siren':'firmid'})

### Get the firm info at the siren level 
base_vars = ['has_subsid', 'is_subsid', 'is_public', 'has_lei', 'french_hq', 'parent_non_french_hq']
matched_firm_base_info = (
    pd.read_parquet('data/2_processed/linkedin/matched_firm_base_info.parquet').merge(matching_output)
    .assign(needs_collapse = lambda df: df.groupby('firmid')['firmid'].transform('count') > 1)
)
# Firms that appear on several rows are collapsed with max, i.e. a flag is set for the
# firm when it is set on any of its rows; single-row firms pass through unchanged.
matched_firm_base_info = pd.concat(
    [matched_firm_base_info.loc[lambda x: ~x['needs_collapse'], ['firmid'] + base_vars],
     matched_firm_base_info.loc[lambda x: x['needs_collapse']].groupby('firmid', as_index=False)[base_vars].max()]
    , axis=0, ignore_index=True)

### generate the base for the role data collapse 
role_data = (
    ## merge together all the component datasets 
    pd.read_parquet('data/2_processed/linkedin/matched_firm_role_output.parquet')
    .merge(pd.read_parquet('data/2_processed/linkedin/matched_firm_user_prestige.parquet'), how = 'left')
    .merge(matching_output)
    .merge(role_dict, how = 'left')
    
    ## generate necessary variables  
     .assign(french= lambda x: x['country'].eq("France"),
         comp =  lambda x: x['total_compensation']*x['weight'])
    .assign(french_data = lambda x: x['data'] & x['french'])
)

### Carry out the collapse 
def collapse_wrapper(year):
    """Collapse `role_data` to one row per firm for `year`.

    A position counts for `year` when it started in or before that year and has no end
    date or ends in or after it; within each user/firm/country only the position(s) with
    the highest weighted compensation are kept. For every flag in `int_cols`, the weight
    (`empl_<flag>`) and weighted compensation (`comp_<flag>`) of the positions whose
    flag equals 1 are summed per firm.

    Prestige is summarised per firm as two weighted averages over the workers whose
    prestige is known: `comp_weighted_prestige` (weights = weighted compensation) and
    `weighted_prestige` (weights = position weight). Firms with no known prestige get a
    missing value. (Taking the max of the per-worker terms, as before, returned the
    largest single term rather than an average.)

    Parameters
    ----------
    year : int
        Calendar year to collapse.

    Returns
    -------
    DataFrame with firmid, the empl_*/comp_* columns, the two prestige averages and year.
    """
    clear_output(wait=True) 
    print(year)
    temp = (
        role_data
        .assign(valid = lambda x: x['startdate'].dt.year.le(year) & (x['enddate'].isna() | x['enddate'].dt.year.ge(year)))
        .assign(valid = lambda x: x['valid'] & x['comp'].eq(x.groupby(['user_id', 'firmid', 'country','valid'])['comp'].transform('max')))
        .loc[lambda x: x['valid']]
        # Averaging weights: blank where prestige is missing, so those rows count in
        # neither the numerator nor the denominator.
        .assign(_prestige_comp = lambda x: x['comp'].where(x['prestige'].notna()),
                _prestige_weight = lambda x: x['weight'].where(x['prestige'].notna()))
        # Per-row terms whose per-firm sum is the weighted average.
        .assign(comp_weighted_prestige = lambda x: x['prestige']*x['_prestige_comp'] / x.groupby('firmid')['_prestige_comp'].transform('sum'),
                weighted_prestige = lambda x: x['prestige']*x['_prestige_weight'] / x.groupby('firmid')['_prestige_weight'].transform('sum'))
        .drop(columns=['_prestige_comp', '_prestige_weight'])
    )
    ## collapse the prestige variables since we don't need those broken out
    ## (min_count=1 keeps a firm missing when none of its workers has a prestige value)
    temp_prestige = (temp.groupby('firmid', as_index=False)[['comp_weighted_prestige', 'weighted_prestige']]
                     .sum(min_count=1))

    temp = (temp
     .melt(id_vars=['firmid', 'comp', 'weight'],
            value_vars=int_cols,
            var_name='type',
            value_name='valid_flag')
     .loc[lambda x: x['valid_flag'].eq(1)]
     .groupby(['firmid', 'type'])
     .agg(empl=('weight', 'sum'), comp=('comp', 'sum'))
     .reset_index()
     .pivot(index=['firmid'], columns='type', values=['empl', 'comp']))

    # Flatten the (stat, flag) column pairs to names such as 'empl_total'.
    temp.columns = [f'{stat}_{vtype}' for stat, vtype in temp.columns]
    temp = temp.reset_index()
    temp = pd.merge(temp, temp_prestige,  how='outer')
    temp = temp.assign(year = year)
    return temp
role_annual_collapsed = pd.concat([collapse_wrapper(year) for year in range(2008, 2024)], ignore_index=True)

# Missing employment / compensation totals mean "no such positions", so they become 0.
# The prestige columns are excluded: 'comp_weighted_prestige' also starts with 'comp',
# but a missing prestige is unknown, not zero.
prestige_cols = ['comp_weighted_prestige', 'weighted_prestige']
cols_to_fill = [col for col in role_annual_collapsed.columns
                if col.startswith(('empl', 'comp')) and col not in prestige_cols]
role_annual_collapsed[cols_to_fill] = role_annual_collapsed[cols_to_fill].fillna(0)

final_output = pd.merge(role_annual_collapsed, matched_firm_base_info, how = 'left')
# Compute a compensation share for every flag rather than slicing int_cols by position,
# so the result does not depend on the order of int_cols. A flag that never occurred
# has no comp_<flag> column and is skipped.
for var in int_cols:
    if f'comp_{var}' not in final_output.columns:
        continue
    final_output[f'share_comp_{var}'] = final_output[f'comp_{var}'] / final_output['comp_total']
final_output.to_parquet('data/2_processed/linkedin/matched_firm_empl_and_linkedin_characteristics.parquet')

NameError: name 'pd' is not defined