In [75]:
## SETUP 
imports = ['wrds', 'pandas as pd', 'os', 're', 'pickle', 'numpy as np', 'from name_matching.name_matcher import NameMatcher',
          'from joblib import Parallel, delayed', 'from IPython.display import display, HTML, clear_output',
          'unicodedata','sys', 'matplotlib.pyplot as plt', 'glob', 'shutil','from sklearn.decomposition import PCA',
          'from geopy  import Nominatim', 'time', 'geopandas as gpd', 'requests', 'from io import BytesIO']
for command in imports:
    if command.startswith('from'): exec(command)
    else: exec('import ' + command)

if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')
sys.path.append('trade_data_code/2_python')


cluster = 'Google' not in os.getcwd()
if ~cluster:
    raw_admin = '1) data/15_revelio_data/1_inputs/a_raw_data/admin/'
    processed_linkedin = '1) data/15_revelio_data/1_inputs/b_processed_data/linkedin/'
    processed_admin = '1) data/15_revelio_data/1_inputs/b_processed_data/admin/'
if cluster:
    raw_admin = 'data/1_raw_data/admin/'
    processed_linkedin = 'data/2_processed/linkedin/'
    processed_admin = 'data/2_processed/admin/'
import A_helper_functions as hf

In [None]:
## ADD PARENT HQ LOCATION 
subsids =pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet').loc[lambda x: x['is_subsid']]
num_chunks = 50
temp_direct = processed_linkedin + 'temp_5'
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(subsids['ultimate_parent_rcid'].unique(), num_chunks)

# SCRAPE / PROCESS DATA FOR EACH CHUNK 
for index in range(num_chunks):
    file_path = os.path.join(temp_direct, f"temp{index}.parquet")
    if not os.path.exists(file_path):
        clear_output(wait=True)
        print(f"{round(100 * (index + 1) / num_chunks, 2)}%")
        rcid_list = tuple(chunks[index].tolist())
        temp = db.raw_sql(
            """
            SELECT rcid, hq_country
            FROM revelio.company_mapping
            WHERE rcid IN %(rcid_list)s
            """,
            params={"rcid_list": rcid_list}
        )
        temp.to_parquet(file_path)

output = (pd.concat([pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")],ignore_index = True)
          .drop_duplicates()
          .rename(columns={'rcid': 'ultimate_parent_rcid', 'hq_country' :  'ultimate_parent_hq_country'}))

output = pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet').merge(output, how = 'left')
output.to_parquet('firm_lvl_info_all_matched_firms.parquet')
shutil.rmtree(temp_direct)

In [6]:
## FIND ALL ROLES ASSOCIATED WITH OUR MATCHED COMPANIES
remaining_to_find = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet'))

num_chunks = 500
temp_direct = processed_linkedin + 'temp'
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
chunks = np.array_split(remaining_to_find['rcid'].unique(), num_chunks)

for index in range(num_chunks):
    file_path = temp_direct + "/temp" + str(index) + ".parquet"
    if not os.path.exists(file_path):    
        clear_output(wait=True)
        print(str(round(100*(index+1)/num_chunks,2))+ '%')
        params = {'rcid_list': tuple(chunks[index].tolist())}
        temp = db.raw_sql(
            """
            SELECT rcid,user_id,position_id, weight, total_compensation,position_number, startdate, enddate, role_k1500,country, state, metro_area, city, seniority 
            FROM revelio.individual_positions 
            WHERE rcid IN %(rcid_list)s
            """, 
            params= params)
        for col in ['startdate', 'enddate']:
            temp[col] = pd.to_datetime(temp[col], errors='coerce')
        temp.to_parquet(file_path)

output = pd.concat([pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")],ignore_index = True)
output.to_parquet(processed_linkedin + 'matched_firm_role_output.parquet')
shutil.rmtree(temp_direct)

100.0%


In [102]:
## Match Cities from our data to NUTS 3 regions 
from geopy.geocoders import Nominatim
import time

## import and clean admin data 
french_cities = (pd.read_csv(raw_admin + 'insee_french_cities.csv',
                          usecols = ['city_code', 'latitude', 'longitude'])
                 .rename(columns={'city_code':'city'})
                 .loc[lambda x: ~x['latitude'].isna()])
french_cities = hf.clean_firm_names(french_cities, "city", False).drop_duplicates(subset = 'city_cleaned')

## import and clean linkedin data 
role_cities = (pd.read_parquet(processed_linkedin + 'matched_firm_role_output.parquet',
                               columns=["city", "country"])
               .loc[lambda x: x['country'].eq('France') & ~x['city'].isna()].drop_duplicates())

firm_cities = (pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet',
                               columns=['hq_city', 'hq_country'])
               .loc[lambda x: x['hq_country'].eq('France') & ~x['hq_city'].isna()]
               .drop_duplicates())
uni_cities = (pd.read_parquet(processed_linkedin +'data_grads_across_france.parquet')
              [['university_city']].drop_duplicates)
linkedin_cities = pd.Series(pd.concat([firm_cities['hq_city'],role_cities['city'],
                                      uni_cities['university_city']]).unique())
linkedin_cities = hf.clean_firm_names(pd.DataFrame({'city': linkedin_cities}), 'city', False)

## match based on cleaned names 
linkedin_cities = pd.merge(linkedin_cities, french_cities.drop('city', axis = 1), how = 'left')


## use openstreetmap to match 
geolocator = Nominatim(user_agent="your_app_name_here")
for i in range(len(linkedin_cities)):
    if pd.isna(linkedin_cities.loc[i, 'latitude']):
        clear_output(wait=True)
        print(str(round(100*(i+1)/len(linkedin_cities),2))+ '%')
        print(i)
        city_string = re.sub(r"Arrondissement d(?:e|u|es|')\s*", "", linkedin_cities['city'][i])
        location = geolocator.geocode(city_string + ", France", timeout = 10)
        if location:
            linkedin_cities.loc[i, ['latitude', 'longitude']] = [
                location.latitude, location.longitude]
        time.sleep(1)

## finally use the matches generated by chatgpt for the final set of matches 
chat_matched_cities =  (pd.merge(linkedin_cities.loc[linkedin_cities['latitude'].isna()][['city', 'city_cleaned']],
                    pd.read_csv(processed_admin + 'chat_gpt_matches_for_final_cities.csv',  encoding='latin1')
                    , on = 'city_cleaned', how = 'outer'))
linkedin_cities = pd.concat([linkedin_cities.loc[~linkedin_cities['latitude'].isna()],chat_matched_cities],
                            ignore_index = True)

## match to NUTS 3 regions 
gisco_nuts = (gpd.read_file(BytesIO(
    requests.get("https://gisco-services.ec.europa.eu/distribution/v2/nuts/geojson/NUTS_RG_60M_2021_4326_LEVL_3.geojson")
    .content)).loc[lambda x: x['CNTR_CODE'] == 'FR'].to_crs("EPSG:4326")[['NUTS_ID', 'NUTS_NAME', 'geometry']])

linkedin_cities = (linkedin_cities
    .drop(['department_name', 'department_number'],axis = 1)
    .dropna(subset=["latitude", "longitude"]))

linkedin_cities = (
    gpd.GeoDataFrame(linkedin_cities,geometry=gpd.points_from_xy(
        linkedin_cities["longitude"], linkedin_cities["latitude"]),crs="EPSG:4326")
    .sjoin(gisco_nuts.to_crs("EPSG:4326"), 
          predicate="intersects")
    .loc[lambda x: ~x["NUTS_ID"].str.startswith("FRY")]
    .reset_index()[['city', 'NUTS_ID', 'NUTS_NAME']])

## export the file 
linkedin_cities.to_parquet(processed_linkedin + 'linkedin_city_coords.parquet')

In [5]:
## FIND THE PRESTIGE / EDUCATION OF ALL EMPLOYEES ASSOCIATED WITH OUR MATCHED COMPANIES 
num_chunks = 500
temp_direct =  processed_linkedin + 'temp'                              
shutil.rmtree(temp_direct)
os.makedirs(temp_direct, exist_ok=True)
role_output = pd.read_parquet(processed_linkedin + 'matched_firm_role_output.parquet')

existing_prestige = pd.read_parquet(processed_linkedin + 'matched_firm_user_prestige.parquet') ## REMOVE
role_output = role_output.loc[~role_output['user_id'].isin(existing_prestige['user_id'])] ## REMOVE 
                              
chunks = np.array_split(role_output['user_id'].unique(), num_chunks)
db = wrds.Connection(wrds_username='am0195')

for index in range(num_chunks):
    file_path = temp_direct + "/temp" + str(index) + ".parquet"
    if not os.path.exists(file_path): 
        clear_output(wait=True)
        print(str(round(100*(index+1)/num_chunks,2))+ '%')
        temp = db.raw_sql(
            """
            select user_id, prestige, highest_degree 
            from revelio.individual_user 
            where user_id IN %(user_ids)s
            """,
            params= {'user_ids': tuple(chunks[index].tolist())})
        temp.to_parquet(file_path)
    
output = pd.concat([pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")],ignore_index = True) 
output = pd.concat([output, existing_prestige], ignore_index = True) # REMOVE 
output.to_parquet(processed_linkedin + 'matched_firm_user_prestige.parquet')                              
shutil.rmtree(temp_direct)

FileNotFoundError: [Errno 2] No such file or directory: 'data/2_processed/linkedin/temp'

In [None]:
## Perform PCA to generate metrics of differentiation 
temp_direct = processed_linkedin + 'temp_role'
os.makedirs(temp_direct, exist_ok=True)

def collapse_year_level(year, making_pca, weight_var, pca_model=None):
    temp = (
        output.assign(
            valid=lambda x: x['startdate'].dt.year.le(year) & 
                            (x['enddate'].isna() | x['enddate'].dt.year.ge(year)),
            wgted_comp=lambda x: x['total_compensation'] * x['weight']
        )
        .loc[lambda x: x['valid']]
        .groupby(['firmid', 'role_k1500'], as_index=False)
        .agg(comp=(weight_var, 'sum'))
        .assign(year=year)
        .pivot_table(index=['firmid', 'year'], columns='role_k1500', values='comp', aggfunc='sum', fill_value=0)
        .pipe(lambda df: df.div(df.sum(axis=1), axis=0))
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    if making_pca:
        pca_model = PCA(n_components=10)
        pca_model.fit(temp)
        return pca_model
    else:
        file_path = temp_direct + "/temp" + str(year) + ".parquet"
        pd.concat([
            temp.reset_index()[['firmid', 'year']],
            pd.DataFrame(pca_model.transform(temp), columns=[f'{weight_var}_PC{i+1}' for i in range(10)])
        ], axis=1).to_parquet(file_path)
        print(year)
        
#set param values 
years = range(2008, 2024)
sample_year = 2015
   
# Load and merge data
long_data = pd.read_parquet(processed_linkedin + 'matched_firm_role_output.parquet')
matching_output = pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet')[['rcid', 'firmid']]
output = pd.merge(long_data, matching_output)

# run pca analysis
for weight_var in ['wgted_comp','weight']:
    print('starting pca gen')
    pca_model = collapse_year_level(sample_year, True, weight_var)
    print('finished pca gen')
    [collapse_year_level(year,False,weight_var,pca_model) for year in range(2008, 2024)]
    (pd.concat([pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")],ignore_index = True)
     .to_parquet(processed_linkedin + 'matched_firm_pca_'+ weight_var + '_output.parquet'))
shutil.rmtree(temp_direct)

In [None]:
########################################################################################
# FIND THE AMOUNT OF WORKERS CURRENTLY WORKING OR WITH EXPERIENCE ABROAD PER COMPANY 
########################################################################################

### SET PARAMETERS
num_chunks = 10000
temp_direct = processed_linkedin + 'temp_4'
os.makedirs(temp_direct, exist_ok=True)
db = wrds.Connection(wrds_username='am0195')
matching_output = pd.read_parquet(processed_linkedin + 'firm_lvl_info_all_matched_firms.parquet')[['rcid', 'firmid']]

role_output = pd.read_parquet(processed_linkedin + 'matched_firm_role_output.parquet').merge(matching_output)
chunks = np.array_split(role_output['firmid'].unique(), num_chunks)
linkedin_to_iso_cross_walk = (pd.read_csv(processed_admin +'linkedin_to_iso_crosswalk.csv')
                              .assign(needs_collapse = lambda df: df.groupby('ctry')['ctry'].transform('count') >1))

### DEFINE FUNCTIONS 
def run_subsection(index):
    clear_output(wait=True)
    print(str(round(100*(index+1)/num_chunks,2))+ '%')
    
    role_subset = role_output.loc[lambda x: x['firmid'].isin(chunks[index])]
    params = {"user_ids":  tuple(role_subset['user_id'].unique())}
    ever_role_subset = db.raw_sql(
        """
        SELECT user_id, country, startdate, enddate
        FROM revelio.individual_positions 
        where user_id IN %(user_ids)s
        """,
        params= params 
    )
    print('done scraping')
    subset_output = pd.concat([collapse_year_level(year, role_subset, ever_role_subset) for year in range(2009, 2021)],
                            ignore_index=True)
    subset_output.to_parquet(temp_direct + "/temp"+str(index)+".parquet")

    
def collapse_year_level(year, role_subset, ever_role_subset):
    print(year)
    cutoff_date = pd.Timestamp(f'{year}-01-01')
    temp = (
        ## determine which users are active in a given year for a given firm
        role_subset
        .rename(columns={'country':'current_ctry'})
        .assign(
                startdate =lambda x: pd.to_datetime(x['startdate'], errors='coerce'),
                enddate =lambda x: pd.to_datetime(x['enddate'], errors='coerce')) 
        .assign( valid = lambda x: (x['startdate'] <= cutoff_date) & ((x['enddate'].isna()) | (x['enddate'] >= cutoff_date)))
        .loc[lambda x: x['valid'], ['firmid','user_id', 'current_ctry', 'weight', 'total_compensation']]
        .assign(priority = lambda x: (x['current_ctry'] == 'France').astype(int))
        .sort_values(by = 'priority', ascending = False)
        .drop_duplicates(subset = ['firmid', 'user_id'], keep = 'first')
    
        ## add all the roles those users have ever held 
         .merge(ever_role_subset).drop_duplicates()
    
        # drop all roles that start after the year of interest
        .assign(startdate =lambda x: pd.to_datetime(x['startdate'], errors='coerce'),
                enddate =lambda x: pd.to_datetime(x['enddate'], errors='coerce'),
               years_since_active = lambda x: (year - x['enddate'].dt.year.fillna(year)).clip(lower=0))
         .loc[lambda x: (x['startdate'] <= cutoff_date) & ~x['country'].eq('France')]
    
        # compute the employee's effective tenure in the foreign market 
        .assign(effective_end_date = lambda x: x['enddate'].where(x['enddate'].notna() 
                                                                  & x['enddate'].lt(cutoff_date), cutoff_date))
        .assign(duration = lambda x: (x['effective_end_date'] - x['startdate']).dt.days/ 365.25)
    
        # collapse down (note this will double count tenure if they held multiple roles 
        .groupby(['firmid', 'user_id', 'current_ctry', 'country'], as_index=False)
        .agg({ 'duration': 'sum','years_since_active': 'min', 'total_compensation' : 'min', 'weight' : 'min'})
        .rename(columns={'country':'ctry'})
        .assign(year = year) 
    )
    return(temp)

### EXECUTE SCRAPING AND INITIAL COLLAPSE TO YEAR-firmid-ctry LEVEL 
[run_subsection(index) for index in range(num_chunks)]
output = pd.concat([pd.read_parquet(file) for file in glob.glob(temp_direct + "/*.parquet")],ignore_index = True)

### Match to ISO-2 CODES AND EXPORT 
output.merge(linkedin_to_iso_cross_walk, how = 'left').to_parquet(processed_linkedin + 'matched_firm_foreign_emp_history.parquet')

shutil.rmtree(temp_direct)