Read SDC Platinum data

In [34]:
import pandas as pd
from  path_utils import get_base_path
import os

# Set path
base = get_base_path()
relative =  r"05 Analysis\01 Main\01 Stata\raw\sdc_gafam\SDC Platinum sdc_gafam_usonly_completed.xlsx"
file_path = os.path.join(base, relative)

# Load the Excel file
df = pd.read_excel(file_path, sheet_name="Request 7", header=2)


Read Target assignees file (JSON)

In [56]:
import json

# Load existing target_assignees_dict from a file if it exists
target_assignees_path = os.path.join(base, r"05 Analysis\01 Main\00 Python data\target_assignees.json")
if os.path.exists(target_assignees_path):
    with open(target_assignees_path, "r") as file:
        try:
            target_assignees_dict = json.load(file)
        except json.JSONDecodeError:
            # In case the file is empty or not properly formatted
            target_assignees_dict = {}
else:
    # Initialize an empty dictionary if the file doesn't exist
    target_assignees_dict = {}


Clean to get create a GAFAM deals only 

In [36]:
# Define GAFAM companies
gafam_companies = ["Google", "Microsoft", "Facebook", "Apple", "Amazon"]

# Create GAFAM variable
df['GAFAM'] = df['Acquiror Full Name'].apply(lambda x: 1 if any(company.lower() in x.lower() for company in gafam_companies) else 0)

# Get TargetFullName for GAFAM == 1
gafam_deals = df[(df['GAFAM'] == 1) & (df['Acquiror Full Name'] != df['Target Full Name'])]['Target Full Name']
gafam_deals = gafam_deals

# Initialize an empty dictionary to store the results
target_assignees_dict = {}


Get best matches from PatensView API

In [57]:
import api.get_closest_assignees as gca
import importlib
importlib.reload(gca)

import re

# Total number of targets (counting those already in the dictionary)
total_targets = len(gafam_deals)

# Initialize the counter for processed targets
processed_count = len(target_assignees_dict)


# Iterate over each target and apply the get_closest_assignees function
for i, target in enumerate(gafam_deals):
    # Clean target name by removing anything in parentheses (including parentheses)
    cleaned_target = re.sub(r"\(.*?\)", "", target).strip()
    cleaned_target = re.sub(r"\{.*?\}", "", cleaned_target).strip()

    # Check if the cleaned target already exists in the saved dictionary
    if cleaned_target in target_assignees_dict:
        print(f"Skipping {cleaned_target} as it already exists in the saved file.")
        processed_count += 1
        continue

    # Get the closest assignees for the cleaned target
    best_matches = gca.get_closest_assignees(cleaned_target, threshold=75)
    
    # Store the result in the dictionary
    target_assignees_dict[cleaned_target] = {}
    target_assignees_dict[cleaned_target]['str_matched'] = best_matches if best_matches else []
    
    # Update the processed count
    processed_count += 1

    # Print progress every 25 targets
    if processed_count % 25 == 0:
        print(f"Have completed {processed_count}/{total_targets} targets.")


Skipping Fox Software Inc as it already exists in the saved file.
Skipping Stac Electronics Inc as it already exists in the saved file.
Skipping Altamira Software Corp as it already exists in the saved file.
Skipping One Tree Software as it already exists in the saved file.
Skipping UUNet Technologies Inc as it already exists in the saved file.
Skipping Wang Laboratories Inc as it already exists in the saved file.
Skipping Individual Inc as it already exists in the saved file.
Skipping Blue Ribbon SoundWorks Ltd as it already exists in the saved file.
Skipping Netwise Inc as it already exists in the saved file.
Skipping Bruce Artwick Organization as it already exists in the saved file.
Skipping Vermeer Technologies as it already exists in the saved file.
Skipping Aspect Software Engineering as it already exists in the saved file.
Skipping Colusa Software Inc as it already exists in the saved file.
Skipping Mobile Telecommunications Technologies Corp as it already exists in the saved fi

Save the best matches as dict.

In [55]:
# Save the updated target_assignees_dict back to the JSON file
with open(target_assignees_path, "w") as outfile:
    json.dump(target_assignees_dict, outfile, indent=4)

In [65]:
empty_match_count = 0
for key, value in target_assignees_dict.items():
    if 'str_matched' in value and isinstance(value['str_matched'], list) and value['str_matched']:
        empty_match_count += 1

print(f"Number of keys where 'str_matched' is an empty list: {empty_match_count}")


Number of keys where 'str_matched' is an empty list: 197
