In [1]:
## SETUP
# Explicit imports (previously built from strings and run through exec(), which
# hides the notebook's dependencies from readers, linters and IDEs).
# Standard library
import glob
import os
import pickle
import random
import re
import sys
import unicodedata
from datetime import datetime
from itertools import chain

# Third-party
import numpy as np
import pandas as pd
import wrds
from IPython.display import display, HTML, clear_output
from joblib import Parallel, delayed
from name_matching.name_matcher import NameMatcher

# Move up three directories unless the working directory already ends in
# 'Big Data' (presumably the project root that the relative 'data/...' paths
# used throughout this notebook assume -- confirm).
if not os.getcwd().endswith('Big Data'):
    os.chdir('../../..')

# Make the helper module importable, then load it.
sys.path.append('trade_data_code/2_python')
import A_helper_functions as hf

In [None]:
#######################################################################################
# IMPORT AND CLEAN SCRAPED DATA
########################################################################################
###############
# Set parameters / Functions 
###############
# Use all but 10 CPUs, with a floor of one worker. Without the floor, machines
# with <= 10 CPUs give 0 (rejected by joblib and np.array_split) or a negative
# value (which joblib reinterprets), and cpu_count() may also return None.
cores = max(1, (os.cpu_count() or 1) - 10)
def import_wrapper(index):
    """Read one scraped CSV and tag it with how it was scraped.

    The file path is taken from the module-level list ``raw_files`` at
    position ``index``.

    * Paths containing ``'_time'`` get ``scrape_method = 'time'``. If the path
      also contains ``'tm_'``, the first 8-digit run in the path is parsed as
      ``%Y%m%d`` and its year is stored in ``application_year``.
    * All other paths get ``scrape_method = 'siren'`` and a ``siren`` column
      holding the digits captured from ``results<digits>.csv`` in the path
      (``None`` when the path does not match).

    Returns the resulting DataFrame.
    """
    path = raw_files[index]
    frame = pd.read_csv(path)

    if '_time' in path:
        frame = frame.assign(scrape_method='time')
        if 'tm_' in path:
            eight_digit_runs = re.findall(r'\d{8}', path)
            year = datetime.strptime(str(eight_digit_runs[0]), "%Y%m%d").year
            frame = frame.assign(application_year=year)
        return frame

    siren_match = re.search(r'results(\d+)\.csv', path)
    siren_value = siren_match.group(1) if siren_match else None
    return frame.assign(siren=siren_value, scrape_method='siren')

def clean_with_commas(chunks, index, name_column):
    """Clean a comma-separated name column one piece at a time.

    ``chunks[index][name_column]`` is split on commas; every non-null piece is
    replaced by the ``<col>_cleaned`` output of ``hf.clean_firm_names``; the
    distinct, non-empty cleaned pieces of each row are then sorted and
    re-joined with commas into ``f"{name_column}_cleaned"``.

    Parameters
    ----------
    chunks : sequence of pandas.DataFrame
        Pieces of the frame to clean; only ``chunks[index]`` is used.
    index : int
        Position of the piece to process.
    name_column : str
        Column holding the (possibly comma-separated) names.

    Returns
    -------
    pandas.DataFrame
        The columns whose label contains ``name_column`` (i.e. the original
        column and its ``_cleaned`` counterpart).
    """
    # Work on a copy so the caller's chunk is not modified (the helper column
    # 'name_col' used to be written straight into chunks[index]).
    df = chunks[index].copy()
    df['name_col'] = df[name_column]
    split_names = df['name_col'].str.split(',', expand=True)
    split_names.columns = [f'name_col_{i}' for i in range(split_names.shape[1])]
    df = pd.concat([df, split_names], axis=1)

    # Clean each split column in place, touching only the rows where that
    # piece exists. Iterating the split columns directly avoids shadowing the
    # `index` parameter with a loop counter.
    for col in split_names.columns:
        has_piece = df[col].notna()
        # third argument passed as False, as before -- meaning defined in hf
        cleaned = hf.clean_firm_names(df.loc[has_piece], col, False)[col + "_cleaned"]
        df.loc[has_piece, col] = cleaned

    # filter(None, ...) drops pieces that cleaned down to an empty string.
    df = df.assign(**{f"{name_column}_cleaned": df.filter(like='name_col_').apply(
            lambda row: ",".join(sorted(set(filter(None, row.dropna().astype(str))))), axis=1)})
    return df.filter(like=name_column)

###############
# Initial Import and Clean Trademarks 
###############
# Trademark scrapes: time-based files first, then the SIREN-based files.
# import_wrapper looks files up in this module-level list by position.
raw_files = [*glob.glob('data/3_IP_data/2_working/tm_time/*'),
             *glob.glob('data/3_IP_data/2_working/tm_siren/*')]

# Read every file in parallel and stack the results into a single frame.
tm_raw = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(import_wrapper)(file_idx) for file_idx in range(len(raw_files))),
    ignore_index=True)

# Derive the standardised columns from the raw scrape fields and keep only those.
tm_raw = tm_raw.assign(
    trademark_type=tm_raw['ukey'].str.split('|').str[0],
    trademark_name=tm_raw['Mark'].astype(str),
    applicant_name=tm_raw['DEPOSANT'].astype(str),
    application_number=tm_raw['ApplicationNumber'].astype(str),
    trademark_status=tm_raw['MarkCurrentStatusCode'],
).loc[:, ['application_number', 'trademark_name', 'trademark_type', 'applicant_name',
          'trademark_status', 'scrape_method', 'application_year', 'siren']]
 
###############
# Initial Import / Clean Patents
###############
# Patent scrapes: time-based files first, then the SIREN-based files.
# import_wrapper looks files up in this module-level list by position.
raw_files = [*glob.glob('data/3_IP_data/2_working/patent_time/*'),
             *glob.glob('data/3_IP_data/2_working/patent_siren/*')]

# Read every file in parallel and stack the results into a single frame.
patent_raw = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(import_wrapper)(file_idx) for file_idx in range(len(raw_files))),
    ignore_index=True)

# Build the cleaned columns from the raw fields, then keep the final column set.
patent_raw = (
    patent_raw
    .assign(
        # unparseable DEPD values become NaT, hence a missing year
        application_year=pd.to_datetime(patent_raw['DEPD'].astype(str), format='%Y%m%d', errors='coerce').dt.year,
        applicant_name=patent_raw['DENE'].apply(str),
        # blank (whitespace-only) NAT values are turned into NaN
        type=np.where(patent_raw['NAT'].str.strip() != "", patent_raw['NAT'], np.nan),
        collection=patent_raw['PUBN'].str[9:11],
        publication_number=patent_raw['PUBN'].str.extract(r'<doc-number>([0-9]+(?:\.[0-9]+)?)</doc-number>')[0],
        application_number=patent_raw['DEPN'].str.extract(r'<doc-number>([0-9]+(?:\.[0-9]+)?)</doc-number>')[0])
    .rename(columns={'IPCR': 'ipcr', 'TIT': 'title'})
    .loc[:, ['application_number', 'publication_number', 'type', 'collection',
             'siren', 'application_year', 'applicant_name', 'title', 'ipcr', 'scrape_method']])

###############
# Generate Cleaned Versions of the Names  
###############
# Every distinct applicant name across patents and trademarks, cleaned once.
names = pd.concat([patent_raw[['applicant_name']], tm_raw[['applicant_name']]], ignore_index = True).drop_duplicates()

# Split once, outside the generator (it used to be re-split for every task),
# and hand each worker a one-element list holding only its own chunk so the
# full set of names is not pickled to every process.
name_chunks = np.array_split(names, cores)
names_cleaned = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(clean_with_commas)([chunk], 0, 'applicant_name') for chunk in name_chunks),
    ignore_index = True)

# names_cleaned carries applicant_name plus applicant_name_cleaned; join on the
# raw name explicitly rather than on whatever columns happen to be shared.
patent_raw = pd.merge(patent_raw, names_cleaned, on='applicant_name')
tm_raw = pd.merge(tm_raw, names_cleaned, on='applicant_name')

###############
# Merge and export  
###############
wd = 'data/3_IP_data/2_working/'

# Patents scraped by time window: saved without the siren column.
patent_time = patent_raw[patent_raw['scrape_method'] == 'time'].drop(columns='siren')
patent_time.to_parquet(wd + "patent_time_init.parquet")

# Patents scraped by SIREN: all columns kept.
patent_siren = patent_raw[patent_raw['scrape_method'] == 'siren']
patent_siren.to_parquet(wd + "patent_siren_init.parquet")

# Trademarks scraped by time window: saved without the siren column.
tm_time = tm_raw[tm_raw['scrape_method'] == 'time'].drop(columns='siren')
tm_time.to_parquet(wd + "tm_time_init.parquet")

# Trademarks scraped by SIREN: saved without application_year.
tm_siren = tm_raw[tm_raw['scrape_method'] == 'siren'].drop(columns='application_year')
tm_siren.to_parquet(wd + "tm_siren_init.parquet")

In [None]:
##############################################################
## Retrieve names from Trademarks / Patents for use
##############################################################
###############
# Import / set parameters 
###############
# Use all but 10 CPUs, with a floor of one worker. Without the floor, machines
# with <= 10 CPUs give 0 (rejected by joblib and np.array_split) or a negative
# value (which joblib reinterprets), and cpu_count() may also return None.
cores = max(1, (os.cpu_count() or 1) - 10)
wd = 'data/3_IP_data/2_working/'
# Load the four "*_init" tables from the working directory.
tm_siren, tm_time, patent_siren, patent_time = (
    pd.read_parquet(wd + file_name)
    for file_name in ('tm_siren_init.parquet', 'tm_time_init.parquet',
                      'patent_siren_init.parquet', 'patent_time_init.parquet'))

###############
# Import and Process The Sirens 
###############
#######
### prepare the siren numbers from the admin data 
#######
# Import the administrative name history (siren read as text).
siren_numbers = pd.read_csv(
    'data/3_IP_data/1_raw/1_StockUniteLegaleHistorique_utf8.csv',
    usecols=['denominationUniteLegale', 'siren', 'dateDebut', 'dateFin', 'etatAdministratifUniteLegale'],
    dtype={'siren': 'str'})

# Rename columns.
siren_numbers = siren_numbers.rename(columns={
    'denominationUniteLegale': 'admin_name',
    'dateDebut': 'start_year',
    'dateFin': 'end_year',
    'etatAdministratifUniteLegale': 'status'})

# Reduce both dates to their year; unparseable dates become missing.
siren_numbers = siren_numbers.assign(
    start_year=pd.to_datetime(siren_numbers['start_year'], errors='coerce').dt.year,
    end_year=pd.to_datetime(siren_numbers['end_year'], errors='coerce').dt.year)

# Keep rows with a name that is not the '[ND]' placeholder and whose status is not 'C'.
siren_numbers = siren_numbers[
    siren_numbers['admin_name'].notna()
    & siren_numbers['admin_name'].ne('[ND]')
    & siren_numbers['status'].ne('C')]

# A missing end year is filled with 2024.
siren_numbers = siren_numbers.assign(end_year=siren_numbers['end_year'].fillna(2024))

# One chunk per worker; cleaning_wrapper reads this module-level list.
siren_chunks = np.array_split(siren_numbers, cores)


def cleaning_wrapper(index):
    """Apply ``hf.clean_firm_names`` to the 'admin_name' column of ``siren_chunks[index]``."""
    chunk = siren_chunks[index]
    return hf.clean_firm_names(chunk, 'admin_name', False)


# Clean all chunks in parallel, stack them and keep the columns used downstream.
siren_numbers = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(cleaning_wrapper)(chunk_idx) for chunk_idx in range(cores)),
    ignore_index=True,
).loc[:, ['siren', 'admin_name_cleaned', 'start_year', 'end_year']]


#######
### prepare the siren numbers from scraped data
#######
# Trademarks: attach the SIREN-scraped rows to the time-scraped ones (left join
# on every shared column) and keep the distinct year / siren / name triples.
tm_combined = (
    pd.read_parquet(wd + "tm_time_init.parquet").drop(columns='scrape_method')
    .merge(pd.read_parquet(wd + "tm_siren_init.parquet").drop(columns='scrape_method'),
           how='left')
    .loc[:, ['application_year', 'siren', 'applicant_name_cleaned']]
    .drop_duplicates())

# Patents: only SIREN-scraped rows whose cleaned applicant name has no comma.
patent_combined = (
    patent_siren[~patent_siren['applicant_name_cleaned'].str.contains(",", na=False)]
    .loc[:, ['siren', 'application_year', 'applicant_name_cleaned']]
    .drop_duplicates())

#######
## MERGE AND GENERATE COVERAGE YEARS
#######
# Append the scraped (siren, name, year) observations to the admin data; each
# scraped row covers exactly its application year (start_year == end_year).
siren_numbers = pd.concat(
    [siren_numbers,
     pd.concat([tm_combined, patent_combined], ignore_index=True)
     .rename(columns={'application_year': 'end_year',
                      'applicant_name_cleaned': 'admin_name_cleaned'})
     .assign(start_year=lambda scraped: scraped['end_year'])],
    ignore_index=True)

# Key identifying one siren / name pair.
siren_numbers = siren_numbers.assign(
    combo=siren_numbers['siren'] + siren_numbers['admin_name_cleaned'])

# Earliest known start year per pair.
start_dates = (
    siren_numbers[siren_numbers['start_year'].notna()]
    .sort_values(by=['combo', 'start_year'])
    .groupby('combo').head(1)
    .loc[:, ['siren', 'admin_name_cleaned', 'start_year']])

# Latest known end year per pair.
end_dates = (
    siren_numbers[siren_numbers['end_year'].notna()]
    .sort_values(by=['combo', 'end_year'], ascending=[True, False])
    .groupby('combo').head(1)
    .loc[:, ['siren', 'admin_name_cleaned', 'end_year']])

# One row per pair with its coverage window; keep windows ending after 1989.
siren_numbers = start_dates.merge(end_dates)
siren_numbers = siren_numbers[siren_numbers['end_year'] > 1989]



###############
# generate list of names to match
###############
# Distinct (year, cleaned name) pairs from the three sources ...
names_to_match = pd.concat(
    [source[['application_year', 'applicant_name_cleaned']]
     for source in (patent_siren, patent_time, tm_time)],
    ignore_index=True).drop_duplicates()

# ... with comma-separated names broken out into one row per individual name.
names_to_match = (
    names_to_match
    .assign(applicant_name_cleaned=names_to_match['applicant_name_cleaned'].str.split(','))
    .explode('applicant_name_cleaned')
    .drop_duplicates())

###############
# use admin_names to generate a list of common words to strip 
###############
def strip_wrapper(df, index, name):
    """Run ``hf.strip_words`` on ``df[index]`` for column ``name``.

    ``df`` is a sequence of DataFrames; the set of words to strip is read from
    the module-level ``common_words``.
    """
    return hf.strip_words(df[index], name, common_words)


# First names that are never treated as common words, even when frequent.
names_vec = ['andre', 'bernard', 'claude', 'jacques', 'jean','louis', 'marie', 'martin', 'michel','paul', 'pierre', 'philippe']
# A word is "common" when its count exceeds this share of the top word's count.
cut_off = .01
#siren_numbers = pd.read_parquet(wd + 'matching_dictionary.parquet')
#names_to_match = pd.read_parquet(wd + 'names_to_match.parquet')

# Word frequencies over all admin names. explode() gives the same counts as
# split(expand=True).stack() without materialising a rows x max-words frame
# (missing values are ignored by value_counts either way).
word_counts = siren_numbers['admin_name_cleaned'].str.split().explode().value_counts()
common_words = set(word_counts[word_counts > np.max(word_counts) * cut_off].index)
common_words = {word for word in common_words if not word.isnumeric() and word not in names_vec}

# Split each table once (previously re-split inside the generator for every
# task) and send each worker a one-element list holding only its own chunk.
names_to_match_chunks = np.array_split(names_to_match, cores)
names_to_match = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(strip_wrapper)([chunk], 0, 'applicant_name_cleaned')
        for chunk in names_to_match_chunks),
    ignore_index = True)

siren_number_chunks = np.array_split(siren_numbers, cores)
siren_numbers = pd.concat(
    Parallel(n_jobs=cores, backend='multiprocessing')(
        delayed(strip_wrapper)([chunk], 0, 'admin_name_cleaned')
        for chunk in siren_number_chunks),
    ignore_index = True)

names_to_match.to_parquet(wd + 'names_to_match.parquet')
siren_numbers.to_parquet(wd + 'matching_dictionary.parquet')

In [None]:
##############################################################
## Match Firms Name to Administrative Data 
##############################################################
###############
# Import / set parameters /functions 
###############
def _matches_to_long(matches, renames):
    """Reshape a ``match_names`` result from wide to long.

    The ``match_name`` / ``score`` / ``match_index`` stub columns (suffixed
    ``_<n>``) are unpivoted with ``pd.wide_to_long`` so every candidate gets
    its own row; only ``original_name``, ``match_name`` and ``score`` are
    kept and then renamed according to ``renames``.
    """
    return (pd.wide_to_long(matches, stubnames=["match_name", "score", "match_index"],
                            i="original_name", j="match", suffix=r"_\d+")
            .reset_index()[['original_name', 'match_name', 'score']]
            .rename(columns=renames))


def matching_wrapper(index):
    """Fuzzy-match one chunk of applicant names against the year's dictionary.

    Reads these module-level names: ``yr``, ``chunks``, ``firm_chunks``,
    ``matcher``, ``dictionary``, ``init_matches``, ``final_matches`` and
    ``min_score``.

    Two passes are made for ``firm_chunks[index]``:

    1. ``matcher`` proposes candidates for each ``applicant_name_cleaned``
       (stored as ``raw_score``).
    2. For every applicant, a second ``NameMatcher`` is loaded with only that
       applicant's candidates and matched on the stripped names
       (``stripped_score``); the ``final_matches`` best rows are kept.

    Returns a DataFrame of the rows whose rounded ``stripped_score`` is at
    least ``min_score``, or an empty DataFrame when the chunk is empty.
    """
    clear_output(wait=True)
    print(f"{yr}: {round((index+1)/chunks*100, 2)}%")
    temp_firms = firm_chunks[index]
    if temp_firms.empty:
        return pd.DataFrame()

    # run first version of matcher on all words
    matches = matcher.match_names(to_be_matched=temp_firms, column_matching='applicant_name_cleaned')
    results = (_matches_to_long(matches, {'original_name': 'applicant_name_cleaned',
                                          'match_name': 'admin_name_cleaned',
                                          'score': 'raw_score'})
               .merge(temp_firms[['applicant_name_cleaned', 'applicant_name_stripped']], how = 'left', on = 'applicant_name_cleaned')
               .merge(dictionary[['admin_name_cleaned', 'admin_name_stripped']], how = 'left', on = 'admin_name_cleaned'))
    company_chunks = [group for _, group in results.groupby('applicant_name_cleaned')]

    ### run the second version of matcher only on words from initial list
    results = []
    temp_matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
    temp_matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])

    for chunk in company_chunks:
        chunk = chunk.reset_index()
        try:
            # the candidates of this one applicant become the master data
            temp_matcher.load_and_process_master_data(column='admin_name_stripped', df_matching_data=chunk, transform=True)
            temp_matches = temp_matcher.match_names(to_be_matched=chunk.iloc[0], column_matching='applicant_name_stripped')
            temp_results = (_matches_to_long(temp_matches, {'original_name': 'applicant_name_stripped',
                                                            'match_name': 'admin_name_stripped',
                                                            'score': 'stripped_score'})
                            .drop_duplicates()
                            .merge(chunk, how = 'right').sort_values(by = 'stripped_score', ascending = False)
                            .head(final_matches)
                            [['applicant_name_cleaned', 'admin_name_cleaned','raw_score', 'stripped_score']]
                            .assign(application_year = yr)
                           )
        except Exception as e:
            # Best effort: report the failure (including its cause) and fall back
            # to the raw-score ranking. NOTE(review): these fallback rows carry no
            # stripped_score, so the min_score filter below always removes them.
            print(f"Error processing company: {chunk.loc[0,'applicant_name_cleaned']} in index {index}: {e!r}")
            temp_results = (chunk.sort_values(by = 'raw_score', ascending = False)
                            .head(final_matches)
                            [['applicant_name_cleaned', 'admin_name_cleaned','raw_score']])
        results.append(temp_results)

    results = (pd.concat(results, ignore_index = True)
               # stripped_score is absent only when every applicant hit the fallback
               .assign(stripped_score = lambda df: round(df['stripped_score']) if 'stripped_score' in df.columns else None)
               .loc[lambda df: df['stripped_score'].ge(min_score)])
    return results


wd = 'data/3_IP_data/2_working/'
# Use all but 10 CPUs, with a floor of one worker. Without the floor, machines
# with <= 10 CPUs give 0 (rejected by joblib and np.array_split) or a negative
# value (which joblib reinterprets), and cpu_count() may also return None.
cores = max(1, (os.cpu_count() or 1) - 10)
cut_off = .01
siren_numbers = pd.read_parquet(wd + 'matching_dictionary.parquet')
names_to_match = pd.read_parquet(wd + 'names_to_match.parquet')
init_matches = 50    # candidates requested from each NameMatcher pass
final_matches = 1    # rows kept per applicant name after the second pass
min_score = 90       # minimum rounded stripped_score for a fuzzy match to be returned
chunks = cores*10    # number of pieces each year's names are split into

###############
# Carry out fuzzy matching  
###############
# One pass per application year; matching_wrapper reads yr, dictionary,
# firm_chunks and matcher from module scope, so these names must stay as-is.
for yr in range(1990,2024):
    print(f'preparing matcher for {yr}')
    ## prepare list of words
    matching = names_to_match.loc[names_to_match['application_year'].eq(yr)].copy()
    # dictionary entries whose coverage window includes this year
    dictionary = siren_numbers.loc[lambda df: df['start_year'].le(yr) & df['end_year'].ge(yr)].copy()

    # exact matches on the cleaned name get a score of 100 on both measures
    direct_matches = (dictionary[['admin_name_cleaned', 'siren']]
                      .loc[dictionary['admin_name_cleaned'].isin(matching['applicant_name_cleaned'])]
                      .assign(applicant_name_cleaned = lambda df: df['admin_name_cleaned'],
                            raw_score = 100, stripped_score = 100, application_year = yr))

    # fuzzy-match only names without an exact match whose (non-empty) stripped
    # form appears among the dictionary's stripped names
    remaining_to_match = matching.loc[lambda df: ~df['applicant_name_cleaned'].isin(direct_matches['admin_name_cleaned']) &
                                          df['applicant_name_stripped'].isin(dictionary['admin_name_stripped']) &
                                          ~df['applicant_name_stripped'].eq("")]

    ### Run Fuzzy Matching
    firm_chunks = np.array_split(remaining_to_match, chunks)
    matcher = NameMatcher(number_of_matches=init_matches, legal_suffixes=False, common_words= False, top_n= init_matches, verbose=False)
    matcher.set_distance_metrics(['bag', 'typo', 'refined_soundex'])
    matcher.load_and_process_master_data(column='admin_name_cleaned', df_matching_data=dictionary, transform=True)
    matching_output = Parallel(n_jobs=cores, backend='multiprocessing')(delayed(matching_wrapper)(index) for index in range(chunks))


    ### Perform Necessary Cleaning
    # Drop empty worker results first: if every chunk came back as a column-less
    # empty frame, merging the concatenation on admin_name_cleaned would raise.
    fuzzy_frames = [frame for frame in matching_output if not frame.empty]
    year_frames = []
    if fuzzy_frames:
        year_frames.append(pd.concat(fuzzy_frames, ignore_index = True)
                           .merge(dictionary[['siren','admin_name_cleaned']], on = 'admin_name_cleaned'))
    ## add back in the direct matches
    year_frames.append(direct_matches)

    matching_output = (
        pd.concat(year_frames, ignore_index = True)

        ## note the number of distinct sirens each applicant name is matched to.
        ## (Grouping by applicant AND siren only counts rows per pair, which is
        ## 1 for every row here, so the uniqueness filter below never removed anything.)
        .assign(num_matches=lambda c: c.groupby('applicant_name_cleaned')['siren'].transform('nunique'))

        ## only keep the best stripped score performance for each applicant name
        .sort_values(by='stripped_score', ascending=False).groupby('applicant_name_cleaned').head(1)

        ## only keep the match if it was unique (only matched to one siren)
        .loc[lambda df: df['num_matches'].eq(1)]
    )
    matching_output.to_parquet(wd + f'dictionary_complete_{yr}.parquet')

# Stack the per-year match files into one frame. The concat has to run once,
# after the loop: inside the loop it turned `dictionary` into a DataFrame after
# the first file, so the next `.append` / `pd.concat` call failed.
dictionary = []
for file in glob.glob('data/3_IP_data/2_working/dictionary_complete_*'):
    dictionary.append(pd.read_parquet(file))
dictionary = pd.concat(dictionary, ignore_index = True)

###############################
## Do quality Assurance to determine cutoffs  
###############################
## Rule: a score group is only admitted if a hand-checked sample reaches a 75% likely-match rate.
## We first required a 100% match on the stripped names and then relaxed the raw-score cutoff.
## The rule we ended up with is a raw score of at least 90,
## as checks at a cutoff of 85 yielded only a 67% likely-match rate.
# The interactive hand-check below only runs when this flag is True.
checking_quality = False 
if checking_quality:
    def matching_performance_check(raw_score_cutoff,sample_size):
        """Interactively hand-rate a random sample of fuzzy matches.

        Works on the module-level ``dictionary``. The candidate pool is every
        row with ``stripped_score == 100``, ``raw_score != 100`` and
        ``raw_score >= raw_score_cutoff``. ``sample_size`` distinct applicant
        names are drawn from it (seeded with 42) and one row per name is shown
        to the user, who answers '1', '2', '3' or 'break'.

        The answer is stored in ``match_likelihood`` as ``int(answer) - 2``
        (-1, 0 or 1). 'break' stops the review early; rows not yet rated keep
        the placeholder -10.

        Returns a one-row DataFrame with:
        ``raw_score_cutoff``; ``num_obs`` (size of the candidate pool);
        ``match_rate`` (rows rated 1, divided by ``sample_size``);
        ``accepted_average`` (mean raw_score of rows rated 1);
        ``failed_average`` (mean raw_score of all other sampled rows,
        including unrated ones).

        ``random.sample`` raises ValueError when the pool has fewer than
        ``sample_size`` distinct names.
        """
        random.seed(42)
        # candidate pool: perfect stripped match, imperfect raw match above the cutoff
        test_sample = dictionary.loc[lambda df: ~df['raw_score'].eq(100) & df['stripped_score'].eq(100) & df['raw_score'].ge(raw_score_cutoff)]
        num_obs = len(test_sample)
        # sample names, keep one row per sampled name, mark every row as unrated (-10)
        test_sample = (test_sample
                       .loc[test_sample['applicant_name_cleaned'].isin(random.sample(list(test_sample['applicant_name_cleaned'].unique()), sample_size))]
                       .drop_duplicates(subset = 'applicant_name_cleaned')
                       .assign(match_likelihood = -10)).reset_index()
        for i in range(len(test_sample)):
            print(f"{i +1})")
            display(HTML(f"<span style='font-size:20px;'>Name to match: {test_sample['applicant_name_cleaned'].iloc[i]}</span>"))
            print("")
            display(HTML(f"<span style='font-size:20px;'>Proposed Match: {test_sample['admin_name_cleaned'].iloc[i]}</span>"))
            user_input = input("rating = ")
            # re-prompt until one of the accepted answers is given
            while user_input not in ['1','2','3','break']:
                user_input = input("best fit = ")
            clear_output(wait=True)
            if user_input == 'break':
                break
            else:
                # '1'/'2'/'3' -> -1/0/1
                test_sample.loc[i, 'match_likelihood'] = int(user_input)-2
        # NOTE(review): match_rate divides by sample_size even after an early 'break'
        output = pd.DataFrame({'raw_score_cutoff': [raw_score_cutoff],
                               'num_obs': num_obs,
                               'match_rate': len(test_sample.loc[test_sample['match_likelihood'].eq(1)])/sample_size,
                              'accepted_average': [np.mean(test_sample.loc[test_sample['match_likelihood'].eq(1)]['raw_score'])],
                              'failed_average': [np.mean(test_sample.loc[~test_sample['match_likelihood'].eq(1)]['raw_score'])]})
        return(output)
    # hand-check 100 names at each candidate cutoff
    ninety_cutoff = matching_performance_check(90, 100)
    eighty_five_cutoff = matching_performance_check(85,100)
    
    
############ 
### Use Results of Quality Assurance, export data
############
# Stack every per-year match file from the working directory.
dictionary = pd.concat(
    [pd.read_parquet(year_file)
     for year_file in glob.glob('data/3_IP_data/2_working/dictionary_complete_*')],
    ignore_index=True)

# Keep matches with a raw score of at least 90 and a stripped score of exactly 100.
dictionary = dictionary[(dictionary['raw_score'] >= 90) & (dictionary['stripped_score'] == 100)]
dictionary.to_parquet('data/3_IP_data/3_final/dictionary_complete.parquet')

In [2]:
##############################################################
## USE NEW FIRM NAME - SIREN DICTIONARY TO GENERATE FINAL DATASETS 
##############################################################
wd = 'data/3_IP_data/2_working/'
outd = 'data/3_IP_data/3_final/'
dictionary = pd.read_parquet(outd + 'dictionary_complete.parquet')

siren_final = []
for version in ['patent', 'tm']:
    print(version)
    # patents: outer join of the two scrapes; trademarks: time scrape on the left
    merge_method = 'outer' if version == 'patent' else 'left'

    # Combine both scrapes on their shared columns and number the distinct records.
    df_combined = (
        pd.read_parquet(wd + f"{version}_time_init.parquet").drop(columns='scrape_method')
        .merge(pd.read_parquet(wd + f"{version}_siren_init.parquet").drop(columns='scrape_method'),
               how=merge_method)
        .drop_duplicates())
    df_combined = df_combined.assign(index_num=range(1, len(df_combined) + 1))

    # Records settled by scraping: a siren is present and the name has no comma.
    def_matched = df_combined[
        df_combined['siren'].notna()
        & ~df_combined['applicant_name_cleaned'].str.contains(",", na=False)
    ].assign(method='scraping')

    # Everything else goes through the name dictionary (its scraped siren is discarded).
    not_scrape_matched = (
        df_combined[~df_combined['index_num'].isin(def_matched['index_num'])]
        .drop(columns='siren'))

    # One row per individual name -> look up sirens -> collapse back to one row
    # per record with its sirens comma-joined; unmatched records are kept via
    # the outer merge.
    record_grouped = (
        not_scrape_matched
        .assign(applicant_name_cleaned=lambda df: df['applicant_name_cleaned'].str.split(','))
        .explode('applicant_name_cleaned')
        .merge(dictionary[['applicant_name_cleaned', 'siren', 'application_year']])
        .groupby('index_num')
        .agg(siren=('siren', lambda sirens: ','.join(sirens)))
        .reset_index()
        .assign(method='matching')
        .merge(not_scrape_matched, how='outer'))
    record_grouped = pd.concat([record_grouped, def_matched], ignore_index=True)
    record_grouped.to_parquet(outd + version + "_record_level_final.parquet")

    # Count records per siren and application year.
    siren_grouped = (
        record_grouped.dropna(subset=['siren'])
        .assign(siren=lambda df: df['siren'].str.split(','))
        .explode('siren')
        .groupby(['siren', 'application_year'])
        .size()
        .reset_index(name=f"num_{version}"))
    siren_final.append(siren_grouped)

# siren_final holds [patent counts, trademark counts]; join them on the shared columns.
pd.merge(*siren_final, how='outer').to_parquet(outd + "siren_level_patent_and_tm_final.parquet")

########
## Generate Summaries of Match Rate by patent vs. tm and "collection" 
########
def _method_shares(record_file, group_col):
    """Share of each ``method`` value (missing included) within every ``group_col`` group.

    Reads ``record_file`` from ``outd`` and returns a frame with the group
    column, ``method`` and ``share``.
    """
    records = pd.read_parquet(outd + record_file)
    shares = records.groupby(group_col)['method'].value_counts(normalize=True, dropna=False)
    return shares.reset_index(name='share')


patent_match_shares = _method_shares('patent_record_level_final.parquet', 'collection')
tm_match_shares = _method_shares('tm_record_level_final.parquet', 'trademark_type')

patent_match_shares.to_csv(outd + "patent_match_shares.csv")
tm_match_shares.to_csv(outd + "tm_match_shares.csv")