In [0]:
%pip install lxml
%pip install graphviz -U --quiet
%pip install networkx -U --quiet
%pip install matplotlib -U --quiet
%pip install pydot -U --quiet
%pip install beautifulsoup4 -U --quiet



%restart_python

In [0]:
# List the managed volume that holds the input file referenced by
# input_xml_path below (sanity check before any processing).
dbutils.fs.ls('/Volumes/nokia_assignment_472210556114873/default/nokia_assignment_managed_volume/')

In [0]:
from bs4 import BeautifulSoup
import os
from pyspark.sql import SparkSession

# Names of the markup elements whose wrappers are stripped by
# clean_html_tags(); whatever they enclose is kept in place.
html_tags = {
    # inline text markup
    "b", "i", "u", "o", "strong", "em", "sub", "sup", "span", "br",
    # headings and block containers
    "h1", "h2", "h3", "h4", "h5", "h6", "p", "div",
    # lists
    "ol", "ul", "li",
    # tables
    "table", "thead", "tbody", "tfoot", "tr", "th", "td",
    # figures and figure references
    "drawings", "figure", "img", "figref",
    # embedded math / chemistry / sequence listings
    "us-math", "us-chemistry", "us-sequence-list-doc", "sequence-list",
    # description sub-section wrappers
    "description-of-drawings", "summary-of-invention",
    "brief-description-of-drawings",
}

def clean_html_tags(xml_content):
    """
    Strip the wrappers of every tag listed in ``html_tags`` from an XML string.

    The text and child elements of a removed tag stay in place; the removed
    tag's own attributes are discarded together with the tag.

    Args:
        xml_content: XML document or fragment as a string.

    Returns:
        The cleaned XML as a string. A leading ``<?xml ...?>`` declaration in
        the input is carried over unchanged; no declaration is added when the
        input had none.
    """
    xml_declaration = ''
    main_content = xml_content

    # Only a declaration at the very start (ignoring leading whitespace) is a
    # real XML declaration; a '<?xml' further down must not split the input.
    leading = len(xml_content) - len(xml_content.lstrip())
    if xml_content.startswith('<?xml', leading):
        decl_end = xml_content.find('?>', leading)
        if decl_end != -1:
            xml_declaration = xml_content[:decl_end + 2]
            main_content = xml_content[decl_end + 2:]

    soup = BeautifulSoup(main_content, 'xml')

    # Always unwrap: replacing a tag with ``tag.string`` would also discard a
    # single child element (e.g. <p><claim-ref>..</claim-ref></p>) that is not
    # in html_tags, because ``.string`` looks through such a child.
    for tag in soup.find_all(html_tags):
        tag.unwrap()

    cleaned = str(soup)
    # The 'xml' builder serialises with its own '<?xml ...?>' prefix. Drop it,
    # otherwise every cleaned fragment carries a stray declaration (doubled
    # when the input already had one).
    if cleaned.startswith('<?xml'):
        cleaned = cleaned[cleaned.find('?>') + 2:].lstrip('\n')

    return xml_declaration + cleaned

def process_xml_file(input_path, output_path):
    """
    Clean every ``<us-patent-application>`` record of an XML file and write
    the result to a new file.

    The input is read fully into memory. Everything before the root element
    (or before the first record when there is no ``<us-patent-applications>``
    wrapper) is copied verbatim; each record is passed through
    ``clean_html_tags`` and written on its own line. Text between records is
    not copied. If the file contains neither the wrapper nor any record, an
    empty output file is produced.

    Args:
        input_path: Path of the UTF-8 XML file to read.
        output_path: Path of the cleaned file to write; its parent directory
            is created when missing.
    """
    import re  # local import: only this function needs it

    root_open = '<us-patent-applications>'
    end_tag = '</us-patent-application>'
    # A record start is the tag name followed by whitespace or '>'. A plain
    # prefix search would also hit the '<us-patent-applications>' wrapper.
    record_start = re.compile(r'<us-patent-application(?=[\s>])')

    # os.makedirs('') raises, so skip it when the output is a bare filename.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    has_root = root_open in content
    match = record_start.search(content)

    # Where the prolog (XML declaration, DOCTYPE, ...) ends.
    if has_root:
        prolog_end = content.find(root_open)
    elif match is not None:
        prolog_end = match.start()
    else:
        prolog_end = 0  # nothing recognisable: do not copy arbitrary content

    with open(output_path, 'w', encoding='utf-8') as outfile:
        outfile.write(content[:prolog_end])
        if has_root:
            outfile.write(root_open + '\n')

        while match is not None:
            record_begin = match.start()
            close_at = content.find(end_tag, record_begin)
            if close_at == -1:
                # Unterminated record: stop instead of looping forever.
                break
            record_end = close_at + len(end_tag)

            cleaned_patent = clean_html_tags(content[record_begin:record_end])
            outfile.write(cleaned_patent + '\n')

            match = record_start.search(content, record_end)

        if has_root:
            outfile.write('</us-patent-applications>')

def read_with_spark(cleaned_xml_path):
    """
    Load a cleaned patent XML file into a Spark DataFrame.

    Builds (or reuses) a SparkSession named "XML Patent Reader" and reads the
    file with the "xml" data source, one row per ``us-patent-application``
    element.

    Args:
        cleaned_xml_path: Path of the cleaned XML file.

    Returns:
        The DataFrame produced by the XML reader.
    """
    session_settings = (
        ("spark.jars.packages", "com.databricks:spark-xml_2.12:0.15.0"),
        ("spark.driver.memory", "4g"),
        ("spark.executor.memory", "4g"),
        ("spark.sql.legacy.allowUntypedScalaUDF", "true"),
    )
    builder = SparkSession.builder.appName("XML Patent Reader")
    for key, value in session_settings:
        builder = builder.config(key, value)
    spark = builder.getOrCreate()

    reader_options = (
        ("rowTag", "us-patent-application"),
        ("charset", "UTF-8"),
        ("ignoreSurroundingSpaces", "true"),
        ("mode", "PERMISSIVE"),
        ("excludeAttribute", "true"),
        ("valueTag", "_VALUE"),
    )
    reader = spark.read.format("xml")
    for name, value in reader_options:
        reader = reader.option(name, value)

    return reader.load(cleaned_xml_path)

# Usage example: location of the raw batch file and the destination of its
# cleaned copy. Both are passed to process_xml_file(); the output path is
# later passed to read_with_spark().
input_xml_path = "/Volumes/nokia_assignment_472210556114873/default/nokia_assignment_managed_volume/batch1.xml"
output_xml_path = "/Volumes/cleaned/default/cleaned_batches/batch1_cleaned.xml"


In [0]:

# Clean the XML file: strip the tags in html_tags from input_xml_path and
# write the result to output_xml_path.
process_xml_file(input_xml_path, output_xml_path)


In [0]:

# Read the cleaned XML file using Spark (one row per us-patent-application)
df = read_with_spark(output_xml_path)

# Show information about the DataFrame: row count (runs a Spark job) and the
# inferred schema.
print(f"Total number of records: {df.count()}")
print("\nSchema:")
df.printSchema()


In [0]:
# Preview the first 10 parsed records.
df.limit(10).display()

In [0]:
from pyspark.sql.functions import col, count, lit
import pandas as pd

def get_nested_fields(schema, prefix=""):
    """
    Return the dotted paths of all leaf fields of a (possibly nested) schema.

    A field whose data type exposes a ``fields`` attribute is treated as a
    struct and descended into; every other field is a leaf. Paths are listed
    in depth-first declaration order.

    Args:
        schema: Object with a ``fields`` iterable whose items have ``name``
            and ``dataType`` attributes.
        prefix: Dotted path of the parent, prepended to every result.

    Returns:
        List of dotted field paths.
    """
    def walk(struct, parent):
        for item in struct.fields:
            path = item.name if not parent else f"{parent}.{item.name}"
            if hasattr(item.dataType, "fields"):
                yield from walk(item.dataType, path)
            else:
                yield path

    return list(walk(schema, prefix))

# Dotted paths of every leaf field in the parsed schema
nested_fields = get_nested_fields(df.schema)

# Row count, used as the denominator of the percentages
total_count = df.count()

# One aggregate per leaf field: share of rows (in %) where it is non-null.
# The single-row result is pulled into pandas and turned into one row per field.
col_counts = (
    df.select([
        (count(col(path)) / total_count * 100).alias(path)
        for path in nested_fields
    ])
    .toPandas()
    .transpose()
    .reset_index()
)
col_counts.columns = ["Tag", "Occurrence (%)"]

# Show the hierarchy with arrows instead of dots, then order alphabetically
col_counts["Tag"] = col_counts["Tag"].str.replace(".", " → ", regex=False)
col_counts = col_counts.sort_values("Tag")

# Back to Spark for display
hierarchical_df = spark.createDataFrame(col_counts)
hierarchical_df.show(truncate=False)


In [0]:
# Render the per-field occurrence table.
hierarchical_df.display()

In [0]:
from pyspark.sql.functions import col, count

# Calculate total number of records
total_count = df.count()

# Get hierarchical field paths
nested_fields = get_nested_fields(df.schema)

# Compute occurrence percentage of each column.
# col_counts is rebuilt here with the raw dotted paths and in schema order
# (no arrow formatting, no sorting).
col_counts = (
    df.select([(count(col(c)) / total_count * 100).alias(c) for c in nested_fields])
    .toPandas()
    .transpose()
    .reset_index()
)

# Rename columns
col_counts.columns = ["Tag", "Occurrence (%)"]

# Select columns with at least 80% occurrence (the comparison is >=)
selected_tags = col_counts[col_counts["Occurrence (%)"] >= 80]["Tag"].tolist()

# Replace back the formatted tags (in case needed); the tags built in this
# cell contain no " → ", so this leaves them unchanged.
selected_columns = [tag.replace(" → ", ".") for tag in selected_tags]

# Select these columns from the original DataFrame.
# NOTE(review): the order of selected_columns is relied on by the positional
# rename in a later cell — keep the two in sync.
selected_df = df.select(*selected_columns)



In [0]:
# Inspect the dotted paths that passed the occurrence threshold.
selected_columns

In [0]:
# Render the DataFrame restricted to the selected fields.
selected_df.display()

In [0]:
# Column names of selected_df, in the order used by the rename that follows.
selected_df.columns

In [0]:
import warnings

# Ordered (column name in selected_df, new unique name) pairs.
# selected_df has repeated column names ('_VALUE', 'country', 'date',
# 'doc-number'), so a dict keyed by the old name cannot express this mapping:
# later entries would silently overwrite earlier ones. The rename is
# therefore positional, and this list is the single source of truth for it.
_column_renames = [
    # Abstract, claims, and description
    ('_VALUE', 'abstract_text'),
    ('claim', 'claims'),
    ('_VALUE', 'description_text'),
    ('heading', 'description_sections'),

    # Application reference
    ('country', 'application_country'),
    ('date', 'application_date'),
    ('doc-number', 'application_number'),

    # CPC classifications
    ('classification-cpc', 'cpc_secondary'),
    ('date', 'cpc_action_date'),
    ('class', 'cpc_class'),
    ('classification-data-source', 'cpc_data_source'),
    ('classification-status', 'cpc_status'),
    ('classification-value', 'cpc_value'),
    ('date', 'cpc_version_date'),
    ('country', 'cpc_office_country'),
    ('main-group', 'cpc_main_group'),
    ('scheme-origination-code', 'cpc_scheme_origin'),
    ('section', 'cpc_section'),
    ('subclass', 'cpc_subclass'),
    ('subgroup', 'cpc_subgroup'),
    ('symbol-position', 'cpc_symbol_position'),

    # IPCR classification
    ('classification-ipcr', 'ipc_classification'),

    # Publication reference
    ('invention-title', 'invention_title'),
    ('country', 'publication_country'),
    ('date', 'publication_date'),
    ('doc-number', 'publication_number'),
    ('kind', 'publication_kind'),

    # Other fields
    ('us-application-series-code', 'application_series_code'),
    ('inventor', 'inventors'),
    ('us-applicant', 'applicants'),
]

# Kept under its original name and with its original contents: for a
# repeated key only the last pair survives. Do not use it for the rename.
column_mapping = dict(_column_renames)

# New names in positional order
new_column_names = [new_name for _, new_name in _column_renames]

# The selection feeding selected_df is data dependent (occurrence threshold),
# so check that it still lines up with the hard-coded list before renaming
# by position.
_actual_names = list(selected_df.columns)
_expected_names = [old_name for old_name, _ in _column_renames]
if len(_actual_names) != len(new_column_names):
    raise ValueError(
        f"selected_df has {len(_actual_names)} columns but "
        f"{len(new_column_names)} new names are defined; "
        "update the rename list to match the current selection."
    )
if _actual_names != _expected_names:
    _mismatches = [
        (position, actual, expected)
        for position, (actual, expected) in enumerate(zip(_actual_names, _expected_names))
        if actual != expected
    ]
    warnings.warn(
        "selected_df columns differ from the expected layout at "
        f"(position, actual, expected): {_mismatches}; "
        "the positional rename may mislabel columns."
    )

# Create the renamed DataFrame
renamed_df = selected_df.toDF(*new_column_names)

In [0]:
# Preview the first 20 rows under the new column names.
renamed_df.limit(20).display()

In [0]:
# Print the schema of the renamed DataFrame.
renamed_df.printSchema()

In [0]:
from pyspark.sql.functions import col, lit, concat_ws, array_join, from_unixtime, to_date
from pyspark.sql.functions import regexp_replace, lpad, when, length, expr

def clean_date(col_obj):
    """
    Parse a ``yyyyMMdd`` value into a date column.

    The input is cast to string and any ``[`` / ``]`` characters are removed
    before parsing.
    """
    as_text = col_obj.cast("string")
    without_brackets = regexp_replace(as_text, r"[\[\]]", "")
    return to_date(without_brackets, "yyyyMMdd")

# Helpers that format the individual CPC components
def format_class(class_col):
    """Cast the class to string and left-pad it with zeros to width 2."""
    class_text = class_col.cast("string")
    return lpad(class_text, 2, "0")

def format_group(group_col):
    """Return the main group as a string, or an empty string when it is null."""
    has_group = group_col.isNotNull()
    group_text = group_col.cast("string")
    return when(has_group, group_text).otherwise(lit(""))

def format_subgroup(subgroup_col):
    """Return the subgroup as a string, zero-padded to 2 characters when shorter."""
    subgroup_text = subgroup_col.cast("string")
    is_short = length(subgroup_text) < 2
    return when(is_short, lpad(subgroup_text, 2, "0")).otherwise(subgroup_text)

# Improved patent dataframe with properly formatted CPC.
# One output row per row of renamed_df; the order of the expressions below is
# the column order of patent_df.
patent_df = renamed_df.select(
    col("invention_title"),
    # abstract_text / description_text are joined with single spaces
    array_join(col("abstract_text"), " ").alias("abstract"),

    array_join(col("description_text"), " ").alias("description"),
    col("description_sections"),
    
    # Additional classification
    col("ipc_classification"),
    
    # CPC metadata
    clean_date(col("cpc_action_date")).alias("cpc_action_date"),
    col("cpc_data_source"),
    col("cpc_status"),
    col("cpc_value"),
    clean_date(col("cpc_version_date")).alias("cpc_version_date"),
    col("cpc_office_country"),
    col("cpc_scheme_origin"),
    col("cpc_symbol_position"),
    
    # Publication reference
    col("publication_country"),
    clean_date(col("publication_date")).alias("publication_date"),
    col("publication_number"),
    col("publication_kind"),
    
    # Application reference
    col("application_country"),
    clean_date(col("application_date")).alias("application_date"),
    col("application_number"),
    col("application_series_code"),
    
    # CPC components individually for later analysis
    # (note: cpc_main_group is exposed under the name cpc_group)
    col("cpc_section").alias("cpc_section"),
    format_class(col("cpc_class")).alias("cpc_class"),
    col("cpc_subclass").alias("cpc_subclass"),
    format_group(col("cpc_main_group")).alias("cpc_group"),
    format_subgroup(col("cpc_subgroup")).alias("cpc_subgroup"),
    
    # Full CPC code with proper structure (e.g., A01B33/00)
    # NOTE(review): lit("/") is always part of the concatenation, so a row
    # without any CPC parts presumably yields just "/" — confirm.
    concat_ws("", 
        col("cpc_section"),                                    # A
        format_class(col("cpc_class")),                       # 01
        col("cpc_subclass"),                                  # B
        format_group(col("cpc_main_group")),                  # 33
        lit("/"),                                             # /
        format_subgroup(col("cpc_subgroup"))                  # 00
    ).alias("cpc_main"),
    
    # Hierarchical CPC codes for tiered analysis: section + class ...
    concat_ws("", 
        col("cpc_section"),                                  
        format_class(col("cpc_class"))                      
    ).alias("cpc_class_level"),
    
    # ... and section + class + subclass
    concat_ws("", 
        col("cpc_section"),                                  
        format_class(col("cpc_class")),                     
        col("cpc_subclass")                                 
    ).alias("cpc_subclass_level")
)



In [0]:
# Preview the first 10 rows of the flattened patent table.
patent_df.limit(10).display()

In [0]:
from pyspark.sql.functions import explode_outer
# Inventors: one row per (publication, inventor). explode_outer keeps
# publications whose inventors array is null or empty.
_inventor_fields = {
    "first_name": "inventor.addressbook.first-name",
    "last_name": "inventor.addressbook.last-name",
    "city": "inventor.addressbook.address.city",
    "state": "inventor.addressbook.address.state",
    "country": "inventor.addressbook.address.country",
}
inventors_df = (
    renamed_df
    .select(col("publication_number"), explode_outer(col("inventors")).alias("inventor"))
    .select(
        col("publication_number"),
        *[col(path).alias(name) for name, path in _inventor_fields.items()],
    )
)

In [0]:
# Preview the first 10 inventor rows.
inventors_df.limit(10).display()

In [0]:

# Applicants (from the us-applicant element): one row per
# (publication, applicant). explode_outer keeps publications whose
# applicants array is null or empty.
_applicant_fields = {
    "first_name": "applicant.addressbook.first-name",
    "last_name": "applicant.addressbook.last-name",
    "organization_name": "applicant.addressbook.orgname",
    "city": "applicant.addressbook.address.city",
    "state": "applicant.addressbook.address.state",
    "country": "applicant.addressbook.address.country",
}
applicants_df = (
    renamed_df
    .select(col("publication_number"), explode_outer(col("applicants")).alias("applicant"))
    .select(
        col("publication_number"),
        *[col(path).alias(name) for name, path in _applicant_fields.items()],
    )
)


In [0]:
# Preview the first 10 applicant rows.
applicants_df.limit(10).display()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, explode_outer, collect_list, array_join, lit, when, flatten, posexplode

# One row per (publication, claim); publications without claims are kept
# with a null claim.
claims_df = renamed_df.select(
    "publication_number",
    explode_outer("claims").alias("claim"),
)

# Collect the _VALUE entries of claim-text elements, top level plus one
# nested level (not a general recursion).
def extract_claim_text(claims_df):
    """
    Extract claim text values from exploded claims.

    Args:
        claims_df: DataFrame with ``publication_number`` and a ``claim``
            struct that contains a ``claim-text`` array.

    Returns:
        DataFrame with ``publication_number`` and ``value_array``: one row
        per top-level claim-text element, followed (via union) by one row
        per claim-text element nested directly inside those, when any exist.
    """
    # First level: one row per claim-text element of each claim
    level1_df = claims_df.select(
        "publication_number",
        explode_outer(col("claim.claim-text")).alias("claim_text_obj")
    )

    # _VALUE of the top-level claim-text elements
    level2_df = level1_df.select(
        "publication_number",
        col("claim_text_obj._VALUE").alias("value_array")
    )

    # claim-text elements that themselves contain claim-text children
    nested_claims_df = level1_df.filter(col("claim_text_obj.claim-text").isNotNull())

    # Only existence matters here, so fetch at most one row instead of
    # counting every match.
    if not nested_claims_df.head(1):
        return level2_df

    nested_df = nested_claims_df.select(
        "publication_number",
        explode_outer(col("claim_text_obj.claim-text")).alias("nested_claim_text")
    ).select(
        "publication_number",
        col("nested_claim_text._VALUE").alias("value_array")
    )

    # Union the direct values with the nested values
    return level2_df.union(nested_df)

# Pull every claim-text value, then collapse each value array into one
# "# "-separated string (null arrays stay null).
claim_values_df = extract_claim_text(claims_df).withColumn(
    "claim_text",
    when(
        col("value_array").isNotNull(),
        array_join(col("value_array"), "# "),
    ).otherwise(lit(None)),
)

# One row per publication holding all of its claim strings, joined by "# "
final_claims_df = (
    claim_values_df
    .groupBy("publication_number")
    .agg(array_join(collect_list("claim_text"), "# ").alias("all_claims_text"))
)

In [0]:
# Preview the consolidated claim text of the first 10 publications.
final_claims_df.limit(10).display()

In [0]:
# Preview the first 10 rows of renamed_df, the input of the cells that follow.
renamed_df.limit(10).display()

In [0]:
from pyspark.sql.functions import col, lit, concat_ws, array_join, from_unixtime, to_date
from pyspark.sql.functions import regexp_replace, lpad, when, length, expr, explode_outer
from pyspark.sql.functions import collect_list, struct, array

def clean_date(col_obj):
    """
    Parse a ``yyyyMMdd`` value into a date column.

    The input is cast to string and any ``[`` / ``]`` characters are removed
    before parsing.
    """
    as_text = col_obj.cast("string")
    without_brackets = regexp_replace(as_text, r"[\[\]]", "")
    return to_date(without_brackets, "yyyyMMdd")

# Helpers that format the individual classification components
def format_class(class_col):
    """Cast the class to string and left-pad it with zeros to width 2."""
    class_text = class_col.cast("string")
    return lpad(class_text, 2, "0")

def format_group(group_col):
    """Return the main group as a string, or an empty string when it is null."""
    has_group = group_col.isNotNull()
    group_text = group_col.cast("string")
    return when(has_group, group_text).otherwise(lit(""))

def format_subgroup(subgroup_col):
    """Return the subgroup as a string, zero-padded to 2 characters when shorter."""
    subgroup_text = subgroup_col.cast("string")
    is_short = length(subgroup_text) < 2
    return when(is_short, lpad(subgroup_text, 2, "0")).otherwise(subgroup_text)

# Shared formatter for CPC and IPC codes (same structure for both)
def format_patent_class(section_col, class_col, subclass_col, main_group_col, subgroup_col):
    """
    Build a classification code such as ``A01B33/00`` from its components.

    Args:
        section_col: Section column.
        class_col: Class column, zero-padded to width 2 via ``format_class``.
        subclass_col: Subclass column.
        main_group_col: Main group column (empty string when null).
        subgroup_col: Subgroup column, zero-padded via ``format_subgroup``.

    Returns:
        A string column with the concatenated code, or null when the section
        is null.
    """
    code = concat_ws("",
        section_col,
        format_class(class_col),
        subclass_col,
        format_group(main_group_col),
        lit("/"),
        format_subgroup(subgroup_col)
    )
    # The "/" literal is always concatenated, so a row without any
    # classification would otherwise get the bogus code "/". A when() without
    # otherwise() yields null for such rows.
    return when(section_col.isNotNull(), code)

# Process IPC classifications first: one row per (publication, IPC entry).
# explode_outer keeps publications whose ipc_classification is null or empty.
ipc_df = renamed_df.select(
    col("publication_number"),
    explode_outer(col("ipc_classification")).alias("ipc")
).select(
    col("publication_number"),
    col("ipc.section").alias("ipc_section"),
    col("ipc.class").alias("ipc_class"),
    col("ipc.subclass").alias("ipc_subclass"),
    col("ipc.main-group").alias("ipc_main_group"),
    col("ipc.subgroup").alias("ipc_subgroup"),
    col("ipc.classification-value").alias("ipc_value"),
    clean_date(col("ipc.action-date.date")).alias("ipc_action_date"),
    col("ipc.classification-status").alias("ipc_status"),
    col("ipc.classification-level").alias("ipc_level"),
    col("ipc.classification-data-source").alias("ipc_data_source"),
    col("ipc.generating-office.country").alias("ipc_office_country"),
    clean_date(col("ipc.ipc-version-indicator.date")).alias("ipc_version_date"),
    col("ipc.symbol-position").alias("ipc_symbol_position")
)

# Add formatted IPC code, built with the same formatter used for CPC
ipc_df = ipc_df.withColumn(
    "ipc_code",
    format_patent_class(
        col("ipc_section"),
        col("ipc_class"),
        col("ipc_subclass"),
        col("ipc_main_group"),
        col("ipc_subgroup")
    )
)

# Group IPC information by publication_number: a list of codes plus a list of
# detail structs. ipc_office_country, ipc_version_date and
# ipc_symbol_position are not carried into the struct.
ipc_grouped_df = ipc_df.groupBy("publication_number").agg(
    collect_list("ipc_code").alias("ipc_codes"),
    collect_list(
        struct(
            "ipc_code", "ipc_section", "ipc_class", "ipc_subclass",
            "ipc_main_group", "ipc_subgroup", "ipc_value", "ipc_action_date",
            "ipc_status", "ipc_level", "ipc_data_source"
        )
    ).alias("ipc_details")
)

# Inventors: explode to one row per (publication, inventor), then derive a
# display name from first and last name.
_exploded_inventors = renamed_df.select(
    col("publication_number"),
    explode_outer(col("inventors")).alias("inventor"),
)
_inventor_first = col("inventor.addressbook.first-name")
_inventor_last = col("inventor.addressbook.last-name")

inventors_df = _exploded_inventors.select(
    col("publication_number"),
    _inventor_first.alias("inventor_first_name"),
    _inventor_last.alias("inventor_last_name"),
    concat_ws(" ", _inventor_first, _inventor_last).alias("inventor_name"),
    col("inventor.addressbook.address.city").alias("inventor_city"),
    col("inventor.addressbook.address.state").alias("inventor_state"),
    col("inventor.addressbook.address.country").alias("inventor_country"),
)

# Collapse back to one row per publication: names plus detail structs
_inventor_detail = struct(
    "inventor_name", "inventor_city", "inventor_state", "inventor_country"
)
inventors_grouped_df = inventors_df.groupBy("publication_number").agg(
    collect_list("inventor_name").alias("inventor_names"),
    collect_list(_inventor_detail).alias("inventor_details"),
)

# Extract applicants: one row per (publication, applicant). applicant_name is
# the organisation name when present, otherwise "first last".
applicants_df = renamed_df.select(
    col("publication_number"),
    explode_outer(col("applicants")).alias("applicant")
).select(
    col("publication_number"),
    col("applicant.addressbook.first-name").alias("applicant_first_name"),
    col("applicant.addressbook.last-name").alias("applicant_last_name"),
    col("applicant.addressbook.orgname").alias("applicant_orgname"),
    when(col("applicant.addressbook.orgname").isNotNull(),
         col("applicant.addressbook.orgname"))
    .otherwise(
        concat_ws(" ",
            col("applicant.addressbook.first-name"),
            col("applicant.addressbook.last-name")
        )
    ).alias("applicant_name"),
    col("applicant.addressbook.address.city").alias("applicant_city"),
    col("applicant.addressbook.address.state").alias("applicant_state"),
    col("applicant.addressbook.address.country").alias("applicant_country")
)

# Extract US applicants: same columns, restricted to applicant_country == "US".
# Derived from applicants_df instead of repeating the whole projection.
us_applicants_df = applicants_df.filter(col("applicant_country") == "US")

# Group US applicants. The aggregation must read the filtered frame: grouping
# the unfiltered applicants_df would put non-US applicants into the
# us_applicant_* columns.
applicants_grouped_df = us_applicants_df.groupBy("publication_number").agg(
    collect_list("applicant_name").alias("us_applicant_names"),
    collect_list(
        struct(
            "applicant_name", "applicant_city", "applicant_state", "applicant_country"
        )
    ).alias("us_applicant_details")
)

# One row per (publication, claim); publications without claims are kept
# with a null claim.
claims_df = renamed_df.select(
    "publication_number",
    explode_outer("claims").alias("claim"),
)

# Collect the _VALUE entries of claim-text elements, top level plus one
# nested level (not a general recursion).
def extract_claim_text(claims_df):
    """
    Extract claim text values from exploded claims.

    Args:
        claims_df: DataFrame with ``publication_number`` and a ``claim``
            struct that contains a ``claim-text`` array.

    Returns:
        DataFrame with ``publication_number`` and ``value_array``: one row
        per top-level claim-text element, followed (via union) by one row
        per claim-text element nested directly inside those, when any exist.
    """
    # First level: one row per claim-text element of each claim
    level1_df = claims_df.select(
        "publication_number",
        explode_outer(col("claim.claim-text")).alias("claim_text_obj")
    )

    # _VALUE of the top-level claim-text elements
    level2_df = level1_df.select(
        "publication_number",
        col("claim_text_obj._VALUE").alias("value_array")
    )

    # claim-text elements that themselves contain claim-text children
    nested_claims_df = level1_df.filter(col("claim_text_obj.claim-text").isNotNull())

    # Only existence matters here, so fetch at most one row instead of
    # counting every match.
    if not nested_claims_df.head(1):
        return level2_df

    nested_df = nested_claims_df.select(
        "publication_number",
        explode_outer(col("claim_text_obj.claim-text")).alias("nested_claim_text")
    ).select(
        "publication_number",
        col("nested_claim_text._VALUE").alias("value_array")
    )

    # Union direct and nested values
    return level2_df.union(nested_df)

# Pull every claim-text value, then collapse each value array into one
# "# "-separated string (null arrays stay null).
claim_values_df = extract_claim_text(claims_df).withColumn(
    "claim_text",
    when(
        col("value_array").isNotNull(),
        array_join(col("value_array"), "# "),
    ).otherwise(lit(None)),
)

# One row per publication holding all of its claim strings, joined by "# "
claims_grouped_df = (
    claim_values_df
    .groupBy("publication_number")
    .agg(array_join(collect_list("claim_text"), "# ").alias("all_claims_text"))
)

# Improved patent dataframe with properly formatted CPC and additional data.
# One output row per row of renamed_df; publication_number comes first and is
# the key used by the joins that follow. The order of the expressions below is
# the column order of patent_df.
patent_df = renamed_df.select(
    # Basic patent information
    col("publication_number"),
    col("invention_title"),
    # abstract_text / description_text are joined with single spaces
    array_join(col("abstract_text"), " ").alias("abstract"),
    array_join(col("description_text"), " ").alias("description"),

    # Publication reference
    col("publication_country"),
    clean_date(col("publication_date")).alias("publication_date"),
    col("publication_kind"),
    
    # Application reference
    col("application_country"),
    clean_date(col("application_date")).alias("application_date"),
    col("application_number"),
    col("application_series_code"),
    
    # CPC components individually for later analysis
    # (note: cpc_main_group is exposed under the name cpc_group)
    col("cpc_section").alias("cpc_section"),
    format_class(col("cpc_class")).alias("cpc_class"),
    col("cpc_subclass").alias("cpc_subclass"),
    format_group(col("cpc_main_group")).alias("cpc_group"),
    format_subgroup(col("cpc_subgroup")).alias("cpc_subgroup"),
    
    # CPC metadata
    clean_date(col("cpc_action_date")).alias("cpc_action_date"),
    col("cpc_data_source"),
    col("cpc_status"),
    col("cpc_value"),
    clean_date(col("cpc_version_date")).alias("cpc_version_date"),
    col("cpc_office_country"),
    col("cpc_scheme_origin"),
    col("cpc_symbol_position"),
    
    # Full CPC code with proper structure (e.g., A01B33/00), built by the
    # shared format_patent_class helper from the raw component columns
    format_patent_class(
        col("cpc_section"),
        col("cpc_class"),
        col("cpc_subclass"),
        col("cpc_main_group"),
        col("cpc_subgroup")
    ).alias("cpc_main"),
    
    # Hierarchical CPC codes for tiered analysis: section + class ...
    concat_ws("", 
        col("cpc_section"),                                  
        format_class(col("cpc_class"))                      
    ).alias("cpc_class_level"),
    
    # ... and section + class + subclass
    concat_ws("", 
        col("cpc_section"),                                  
        format_class(col("cpc_class")),                     
        col("cpc_subclass")                                 
    ).alias("cpc_subclass_level")
)

# Attach the per-publication aggregates to the patent table. Left joins keep
# every patent_df row even when an aggregate has no matching publication.
complete_patent_df = patent_df
for _grouped_df in (
    claims_grouped_df,
    ipc_grouped_df,
    inventors_grouped_df,
    applicants_grouped_df,
):
    complete_patent_df = complete_patent_df.join(
        _grouped_df, on="publication_number", how="left"
    )

In [0]:
# Preview the first 10 rows of the fully joined patent table.
complete_patent_df.limit(10).display()