In [2]:
## SETUP
# Explicit import statements (rather than exec-ing strings) so that the
# notebook's dependencies are visible to readers and static tooling.
import os
import pickle
import re
import sys
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wrds
from IPython.display import display, HTML, clear_output
from joblib import Parallel, delayed
from name_matching.name_matcher import NameMatcher

# Move two directory levels up unless the working directory already ends
# with 'Big Data'; every relative path below is resolved from there.
if not os.getcwd().endswith('Big Data'):
    os.chdir('../..')

# Make the project's helper module importable; the membership check keeps
# sys.path free of duplicates when this cell is re-run.
if 'trade_data_code/2_python' not in sys.path:
    sys.path.append('trade_data_code/2_python')
import A_helper_functions as hf

In [3]:
########################################################################################
# Identify firms we will try to match
########################################################################################
# Load the collapsed role data and the cleaned list of France-affiliated firms.
role_info = pd.read_parquet('data/2_processed/linkedin/french_affiliated_firm_roles_collapsed_raw.parquet')
companies = pd.read_parquet('data/2_processed/linkedin/france_affiliated_firms_cleaned.parquet')

# Keep only the role rows whose rcid belongs to a firm flagged `likely_french`.
french_role_info = role_info[
    role_info['rcid'].isin(companies.loc[companies['likely_french'], 'rcid'])
]

In [3]:
# Re-reads the same parquet file that the "Identify firms" cell loads into
# `role_info`.
# NOTE(review): the table printed beneath this cell suggests it originally ended
# by displaying `role_info` — confirm whether this duplicate cell is still needed.
role_info = pd.read_parquet('data/2_processed/linkedin/french_affiliated_firm_roles_collapsed_raw.parquet')

Unnamed: 0,rcid,emp_french,emp_total,emp_engineer,emp_data,emp_rnd,emp_stem,comp_french,comp_total,comp_engineer,comp_data,comp_rnd,comp_stem,share_emp_french,share_comp_french,year
0,31,0.0,203.0,4.0,0.0,3.0,3.0,0.0,13109.0,259.0,0.0,137.0,137.0,0.00000,0.00000,2008
1,31,0.0,252.0,2.0,0.0,3.0,3.0,0.0,14363.0,109.0,0.0,146.0,146.0,0.00000,0.00000,2012
2,31,0.0,236.0,1.0,0.0,0.0,1.0,0.0,12754.0,53.0,0.0,0.0,53.0,0.00000,0.00000,2023
3,31,0.0,350.0,2.0,0.0,2.0,2.0,0.0,15820.0,139.0,0.0,139.0,139.0,0.00000,0.00000,2018
4,31,1.0,271.0,3.0,0.0,5.0,5.0,29.0,14110.0,133.0,0.0,148.0,148.0,0.00369,0.00203,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29547,97916116,0.0,6.0,6.0,0.0,4.0,6.0,0.0,592.0,592.0,0.0,411.0,592.0,0.00000,0.00000,2010
29548,97916116,0.0,6.0,6.0,0.0,4.0,6.0,0.0,592.0,592.0,0.0,411.0,592.0,0.00000,0.00000,2009
29549,97916116,0.0,6.0,6.0,0.0,4.0,6.0,0.0,592.0,592.0,0.0,411.0,592.0,0.00000,0.00000,2008
29550,97916116,0.0,6.0,6.0,0.0,4.0,6.0,0.0,592.0,592.0,0.0,411.0,592.0,0.00000,0.00000,2022


In [30]:
# Add a `size` code derived from `emp_total`:
#   4 -> emp_total < 10
#   3 -> 10 <= emp_total < 50
#   2 -> 50 <= emp_total < 200
#   1 -> everything else (200 and above; rows where no comparison holds,
#        e.g. a missing emp_total, also land here)
french_role_info = french_role_info.assign(
    size=lambda df: np.select(
        condlist=[
            df['emp_total'] < 10,
            (df['emp_total'] >= 10) & (df['emp_total'] < 50),
            (df['emp_total'] >= 50) & (df['emp_total'] < 200),
        ],
        choicelist=[4, 3, 2],
        default=1,
    )
)

In [150]:
####################################
## GENERATE SUMMARY STATS BY SIZE
####################################

############
# TIME SERIES
############
# One row per (size, year) with two versions of the data cost share:
#   share_data_unwgted : mean of the row-level comp_data / comp_total ratios
#   share_data         : summed comp_data divided by summed comp_total
# Each size's 2008 values are then attached as a baseline (inner merge on
# `size`, so a size without a 2008 row is dropped) and the shares are
# expressed as percent change relative to that baseline.
summary_data = (
    french_role_info
    .assign(share_data_unwgted=lambda df: df['comp_data'] / df['comp_total'])
    .groupby(['size', 'year'], as_index=False)
    .agg({'comp_total': 'sum', 'comp_data': 'sum', 'share_data_unwgted': 'mean'})
    .assign(share_data=lambda df: df['comp_data'] / df['comp_total'])
    .pipe(lambda df: df.merge(
        df.loc[df['year'] == 2008, ['size', 'share_data', 'share_data_unwgted']]
          .rename(columns={'share_data': 'base_share',
                           'share_data_unwgted': 'base_share_unwgted'}),
        on='size',
    ))
    .assign(
        relative_share=lambda df: (df['share_data'] / df['base_share'] - 1) * 100,
        relative_share_unwgted=lambda df: (
            df['share_data_unwgted'] / df['base_share_unwgted'] - 1
        ) * 100,
    )
)


def plot_time_series(df, group_var, x_var, y_var, y_label, x_label, title,
                     group_labels=None, ylim=(-10, 100)):
    """Plot one line per group of ``df`` on a single figure and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``group_var``, ``x_var`` and ``y_var``.
    group_var : str
        Column to group by; each distinct value becomes one line.
    x_var, y_var : str
        Columns plotted on the x and y axes.
    y_label, x_label, title : str
        Axis labels and figure title.
    group_labels : mapping, optional
        Maps a group key to its legend text. Defaults to the firm-size codes
        used in this notebook (1 = large ... 4 = micro). A key missing from
        the mapping is labelled with ``str(key)``.
    ylim : tuple or None, optional
        ``(low, high)`` limits for the y axis; ``None`` leaves matplotlib's
        autoscaling in place. Defaults to ``(-10, 100)``.
    """
    if group_labels is None:
        group_labels = {1: 'large (200+)', 2: 'medium (50-200)',
                        3: 'small (10-50)', 4: 'micro (<10)'}

    for key, data in df.groupby(group_var):
        # Label each line from its own group key. A positional label list
        # passed to legend() would mislabel lines whenever a group is absent
        # from ``df``.
        plt.plot(data[x_var], data[y_var], label=group_labels.get(key, str(key)))

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    # Legend sits outside the axes, to the right of the plot.
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.grid(True)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.show()
# Level plots of the shares themselves (currently disabled):
#plot_time_series(summary_data,'size','year', 'share_data', 'share data', '', 'Weighted Data Cost Share by Firm Size')
#plot_time_series(summary_data,'size','year', 'share_data_unwgted', 'share data', '', 'Unweighted Data Cost Share by Firm Size')

# Percent change in the data cost share relative to 2008, weighted then unweighted.
plot_time_series(df=summary_data, group_var='size', x_var='year',
                 y_var='relative_share', y_label='% Growth', x_label='',
                 title='Weighted Growth in Data Cost Share since 2008')
plot_time_series(df=summary_data, group_var='size', x_var='year',
                 y_var='relative_share_unwgted', y_label='% Growth', x_label='',
                 title='Unweighted Growth in Data Cost Share since 2008')
    

     
###########
# Distribution
###########
# (size code, printed prefix) pairs; the codes are the values of the `size`
# column. The prefixes carry their own padding so the printed lines align.
size_prefixes = [(1, 'large -->  '), (2, 'medium --> '),
                 (3, 'small -->  '), (4, 'micro -->  ')]

for yr in [2008, 2023]:
    # NOTE: this rebinds `summary_data`, replacing the time-series summary
    # built earlier in the cell.
    summary_data = (
        french_role_info[french_role_info['year'] == yr]  # keep one year
        .assign(share_data=lambda df: (df['comp_data'] / df['comp_total']).round(3))  # share rounded to 3 decimals
        .groupby(['size', 'share_data'])
        .size()  # number of rows at each (size, share) value
        .reset_index(name='count')
        # Percent of each size's rows at this share value. transform() keeps
        # the `size` column in the frame, whereas groupby(...).apply(...) on
        # the grouping column is deprecated and drops it once grouping
        # columns are excluded.
        .assign(percent=lambda df: df['count'] / df.groupby('size')['count'].transform('sum') * 100)
    )

    # Plot only shares in (0, 0.1]; the mass at exactly 0 and above 0.1 is
    # reported in text below instead.
    plot_time_series(summary_data.loc[lambda df: ~df['share_data'].eq(0) & df['share_data'].le(.1)],
                     'size', 'share_data', 'percent', '% of firms', '', f'Distribution of Data Cost Share by Size ({yr})')

    for size_code, prefix in size_prefixes:
        in_size = summary_data.loc[summary_data['size'].eq(size_code)]
        pct_zero = in_size.loc[in_size['share_data'].eq(0), 'percent'].sum().round(1)
        pct_high = in_size.loc[in_size['share_data'].gt(.1), 'percent'].sum().round(1)
        print(f"{prefix}equals 0 = {pct_zero}%; greater than .1 = {pct_high}%")

In [150]:
# Write the completed matching dictionary to disk.
dictionary_complete.to_parquet('data/2_processed/admin/fuzzy_matching_output_final.parquet')

# Build the legal-unit history table for the sirens present in
# `dictionary_complete`.
siren_numbers = (
    # Read only the needed columns; `siren` is read as a string.
    pd.read_csv(
        '../1_IWH/data/2_patent_tm_scraping/1_raw/1_StockUniteLegaleHistorique_utf8.csv',
        usecols=['denominationUniteLegale', 'siren', 'dateDebut', 'dateFin', 'etatAdministratifUniteLegale'],
        dtype={'siren': 'str'},
    )
    # Shorter column names.
    .rename(columns={
        'denominationUniteLegale': 'admin_name',
        'dateDebut': 'start_year',
        'dateFin': 'end_year',
        'etatAdministratifUniteLegale': 'status',
    })
    # Keep matched sirens and drop rows whose status is 'C'
    # (presumably ceased units — confirm against the source documentation).
    .loc[lambda df: df['siren'].isin(dictionary_complete['siren']) & df['status'].ne('C')]
    # Reduce both dates to their calendar year; unparseable dates become missing.
    .assign(
        start_year=lambda df: pd.to_datetime(df['start_year'], errors='coerce').dt.year,
        end_year=lambda df: pd.to_datetime(df['end_year'], errors='coerce').dt.year,
    )
)
### MAKE END YEAR 2025, if end year 