# Install Necessary Libraries

In [None]:
# Install the packages this notebook depends on via pip
import os
import subprocess
import sys

packages = [
    'numpy',
    'ipywidgets',
    'torch',
    'matplotlib',
    'scikit-learn',
    'seaborn',
    'transformers',
    'datasets',
    'evaluate',
]

# Install each package.
# pip is invoked as "<current interpreter> -m pip" so the packages end up in
# the environment this kernel runs in (a bare `pip` found on PATH may belong
# to a different Python), and the command is passed as an argument list so no
# shell is involved.
for package in packages:
    completed = subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    if completed.returncode != 0:
        # Keep going (best effort, as before) but make the failure visible
        print(f"Failed to install {package} (pip exit code {completed.returncode})")


# Import Necessary Libraries

In [60]:
import pandas as pd
import os
import ipywidgets as widgets
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Generate Dataset

## Import Database

In [61]:
from epo.tipdata.epab import EPABClient
# `epab` is the client every query cell below uses; `env` selects the database.
# For the test database (10K):
#epab = EPABClient(env='TEST')

# For the complete database:
epab = EPABClient(env='PROD')

## Check EPAB fields

In [None]:
epab.fields()

In [None]:
# Query English-language publications dated 2024-01-01 to 2024-01-31.
# number="%" and kind_code="" are passed as given - presumably "any number"
# and "no kind-code filter"; confirm against the EPAB client documentation.
q = epab.query_publication(number="%", kind_code="", date="20240101-20240131", language='EN')
print ('Total query:', q)

## Create a CSV file to store the data

In [None]:
# Fetch the selected fields for every hit of the query and persist them as CSV
result = q.get_results("publication.number, ipc, claims", limit=len(q))
df = pd.DataFrame(result)
csv_file = "q_results_output.csv"
# index=False keeps the positional row index out of the file; otherwise it
# comes back as a stray 'Unnamed: 0' column whenever the CSV is re-read
# (the search-report export further down already writes without the index).
df.to_csv(csv_file, index=False)
print(f"All data fetched and saved to {csv_file} successfully.")

## Modify the column header

In [None]:
# Reload the raw query export
df = pd.read_csv('q_results_output.csv')

# Dotted headers such as 'publication.number' become 'publication_number'
df.columns = [name.replace('.', '_') for name in df.columns]

# Overwrite the same file so later cells see the renamed headers
df.to_csv('q_results_output.csv', index=False)

# Report the new headers and the number of exported records
print(df.columns)
total_rows = len(df)
print(f"Total number of data rows: {total_rows}")


## Generate Data for the Model

In [None]:
# Load the query export (headers already renamed above) and list its columns
df = pd.read_csv('q_results_output.csv')
print(df.columns)

In [None]:
# Columns needed for the classification data set
selected_columns = ['publication_number', 'ipc', 'claims']

# Destination of the reduced CSV; the directory is created if it is missing
output_file_name = 'selected_columns_output.csv'
save_directory = './'
os.makedirs(save_directory, exist_ok=True)
full_path = os.path.join(save_directory, output_file_name)

# Requested columns that are absent from the DataFrame are skipped, not raised
existing_columns = [name for name in selected_columns if name in df.columns]
filtered_df = df[existing_columns]

# Write the reduced table and confirm where it went
filtered_df.to_csv(full_path, index=False)
print(f"CSV file '{output_file_name}' generated and saved to: {full_path}")

# Quick look at the first rows of what was written
print("\nPreview of the generated CSV file:")
print(filtered_df.head())


## Separate claim data in English

In [None]:
# Read the CSV file produced by the column-selection step.
# NOTE(review): the prior-art section further down writes a file with this
# same name ('selected_columns_output.csv'); re-run the column selection above
# first if that section has been executed in between.
df = pd.read_csv('selected_columns_output.csv')

# Function to filter claims for English language
def filter_english_claims(claims):
    """Return the entries of a claims list whose 'language' is 'EN'.

    Args:
        claims: The claims cell, normally the string representation of a
            list of dictionaries as read back from the CSV. An already
            parsed list is accepted as well.

    Returns:
        A list with the dictionaries whose 'language' value equals 'EN'.
        An empty list is returned when the cell is missing (NaN), cannot be
        parsed as a Python literal, or is not a list; entries that are not
        dictionaries or have no 'language' key are skipped. This keeps one
        bad row from aborting the whole `apply`.
    """
    if isinstance(claims, str):
        try:
            claims_list = ast.literal_eval(claims)
        except (ValueError, SyntaxError):
            # Not a Python literal (free text, truncated cell, ...)
            return []
    else:
        # NaN for an empty CSV cell, or a list that was parsed earlier
        claims_list = claims

    if not isinstance(claims_list, (list, tuple)):
        return []

    return [
        claim
        for claim in claims_list
        if isinstance(claim, dict) and claim.get('language') == 'EN'
    ]

# Overwrite 'claims' in place with its English-only entries
# (no temporary helper column is left behind)
df['claims'] = df['claims'].apply(filter_english_claims)

# Persist the result for the cleaning steps
df.to_csv('filtered_english_claims.csv', index=False)

# Show the resulting DataFrame
print(df)


## Clean Data

In [None]:
# CSV written by the English-claims filtering step
file_path = 'filtered_english_claims.csv'

if not os.path.exists(file_path):
    print(f"The file {file_path} does not exist.")
else:
    # Load the file and render its first three rows inside an ipywidgets Output area
    df = pd.read_csv(file_path)
    table_widget = widgets.Output()
    with table_widget:
        display(df.head(3))

    display(table_widget)

In [71]:
# Function to clean the 'IPC' column
def clean_ipc_column(ipc_data):
    """Turn a serialized list of IPC entries into a comma-separated string.

    Args:
        ipc_data: String representation of a list of dictionaries, each
            carrying the classification under the 'symbol' key.

    Returns:
        The symbols joined with ', ', each stripped of every character that
        is not a letter, a digit or '/'. If the value cannot be parsed or does
        not have the expected shape (NaN, free text, entries that are not
        dictionaries, a missing or non-string 'symbol'), the input is
        returned unchanged.
    """
    try:
        # Convert string representation of list to actual list of dictionaries
        ipc_list = ast.literal_eval(ipc_data)

        # Keep only alphanumerics and '/' in every symbol
        cleaned_ipc = [re.sub(r'[^A-Za-z0-9/]', '', entry['symbol']) for entry in ipc_list]

        return ', '.join(cleaned_ipc)
    except (ValueError, SyntaxError, KeyError, TypeError):
        # TypeError covers literals of the wrong shape (non-dict entries,
        # non-iterable values, non-string symbols); like the other errors it
        # falls back to the original data instead of aborting the apply()
        return ipc_data

# Example usage: adds a 'cleaned_ipc' column to the DataFrame currently bound
# to `df` (the one loaded by the preview cell above)
df['cleaned_ipc'] = df['ipc'].apply(clean_ipc_column)


In [72]:
# Function to clean the 'Claims' column and convert to lowercase
def clean_claims_column(claims_data):
    """Extract and normalise the English claim text from a serialized claims list.

    The first entry whose 'language' is 'EN' is used. Its 'text' has HTML-like
    tags and claim numbers ("1. ", "2. ", ...) removed, every character other
    than letters, digits, ',', '.', '?' and whitespace dropped, whitespace
    runs (including line breaks) collapsed to a single space, and the result
    lower-cased and stripped.

    Args:
        claims_data: String representation of a list of dictionaries with
            'language' and 'text' keys.

    Returns:
        The cleaned claim text, or the input unchanged when it cannot be
        parsed, has an unexpected shape, or contains no 'EN' entry.
    """
    try:
        # Convert string representation to a list of dictionaries
        claims_list = ast.literal_eval(claims_data)

        for claim in claims_list:
            if claim.get('language') == 'EN':
                cleaned_claims = re.sub(r'<.*?>', '', claim['text'])  # Remove HTML tags
                cleaned_claims = re.sub(r'\b\d+\.\s', '', cleaned_claims)  # Remove claim numbers like 1., 2., etc.

                # Remove unwanted special characters but keep commas, periods, and question marks
                cleaned_claims = re.sub(r'[^a-zA-Z0-9,.?\s]', '', cleaned_claims)

                # Line breaks separate words, so they are replaced by a space
                # (deleting them glued the last word of one line to the first
                # word of the next); other whitespace runs are collapsed too
                cleaned_claims = re.sub(r'\s+', ' ', cleaned_claims)

                return cleaned_claims.lower().strip()

        # Return original data if no 'EN' claims found
        return claims_data

    except (ValueError, SyntaxError, KeyError, TypeError, AttributeError):
        # TypeError / AttributeError cover wrongly shaped literals (entries
        # that are not dictionaries, a non-string 'text'); return the original
        # data like for the other errors
        return claims_data


In [73]:
# Function to clean the 'description_text' column
def clean_description_column(description_text):
    """Strip HTML-like tags from a description and normalise its whitespace.

    Anything that is not a string (NaN, None, numbers) yields ''.
    """
    if not isinstance(description_text, str):
        return ''

    without_tags = re.sub(r'<.*?>', '', description_text)
    # Collapse spaces, tabs and line breaks into single spaces
    return re.sub(r'\s+', ' ', without_tags).strip()

In [74]:
# Function to clean the abstract column
def clean_abstract_column(abstract_text):
    """Remove markup and list numbering from an abstract.

    Non-string input (NaN, None, ...) yields ''. Otherwise the substitutions
    below are applied in order and the result is stripped.
    """
    if not isinstance(abstract_text, str):
        return ''

    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'<img.*?>', ''),     # image tags together with their attributes
        (r'<.*?>', ' '),       # every other tag becomes a space
        (r'\(\d+\)', ''),      # list numbering such as (1), (2)
        (r'\b\d+\.\s', ''),    # numbering such as "1. ", "2. "
        (r'\s+', ' '),         # collapse whitespace and line breaks
    )

    cleaned_abstract = abstract_text
    for pattern, replacement in substitutions:
        cleaned_abstract = re.sub(pattern, replacement, cleaned_abstract)

    return cleaned_abstract.strip()


In [None]:
# Source and destination of the cleaning pass
input_file_path = 'filtered_english_claims.csv'
output_file_path = 'cleaned_columns_output.csv'

# The input is read and processed in pieces of this many rows
chunk_size = 5000
chunks = pd.read_csv(input_file_path, chunksize=chunk_size)

# False until the first chunk has created the output file and written its header
header_written = False

for chunk in chunks:
    # Normalise the IPC symbols and the claim text
    chunk['ipc'] = chunk['ipc'].apply(clean_ipc_column)
    chunk['claims'] = chunk['claims'].apply(clean_claims_column)

    # Cleaning of 'description_text' / 'abstract_text' is available but disabled:
    # chunk['description_text'] = chunk['description_text'].apply(clean_description_column)
    # chunk['abstract_text'] = chunk['abstract_text'].apply(clean_abstract_column)

    # Only the identifier, 'ipc' and 'claims' are written out
    chunk = chunk[['publication_number', 'ipc', 'claims']]

    # First chunk: (re)create the file with a header; later chunks: append rows only
    chunk.to_csv(
        output_file_path,
        index=False,
        mode='a' if header_written else 'w',
        header=not header_written,
    )
    header_written = True

# Print a success message
print(f"Cleaned data saved successfully to {output_file_path}!")

In [None]:
# CSV written by the chunked cleaning pass
file_path = 'cleaned_columns_output.csv'

if not os.path.exists(file_path):
    print(f"The file {file_path} does not exist.")
else:
    # Load the file and render its first 20 rows inside an ipywidgets Output area
    df = pd.read_csv(file_path)
    table_widget = widgets.Output()
    with table_widget:
        display(df.head(20))

    display(table_widget)

## Load Data Randomly

In [None]:
# Read the cleaned data set
file_path = 'cleaned_columns_output.csv'
df = pd.read_csv(file_path)

# Report how many rows were read
print(f"Loaded {len(df)} rows from the dataset.")

# Write the rows, unchanged and in file order, to 'random_data_distribution.csv'
output_file_path = 'random_data_distribution.csv'
df.to_csv(output_file_path, index=False)


In [None]:
# IPC section = first character of the IPC string; '' when the value is
# missing or empty
def _ipc_section(ipc_value):
    if pd.isna(ipc_value) or len(ipc_value) == 0:
        return ''
    return ipc_value[0]


df['ipc_section'] = df['ipc'].apply(_ipc_section)

# How many rows fall into each section
section_counts = df['ipc_section'].value_counts()

print("Data count per section:")
print(section_counts)

In [None]:
# Bar chart of the per-section row counts computed above
plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=section_counts.index,
    y=section_counts.values,
    hue=section_counts.index,
    palette='viridis',
    dodge=False,
    legend=False,
)
ax.set_title('Distribution of Rows by IPC Section (First 20,000 Rows)')
ax.set_xlabel('IPC Section')
ax.set_ylabel('Number of Rows')
ax.grid(True)
plt.show()


In [None]:
# CSV written by the "random" (unsampled) data step
file_path = 'random_data_distribution.csv'

if not os.path.exists(file_path):
    print(f"The file {file_path} does not exist.")
else:
    # Load the file and render its first 20 rows inside an ipywidgets Output area
    df = pd.read_csv(file_path)
    table_widget = widgets.Output()
    with table_widget:
        display(df.head(20))

    display(table_widget)

## Load Data with even distribution

In [None]:
# Cleaned data set that the balanced sample is drawn from
file_path = 'cleaned_columns_output.csv'

# Rows selected so far (starts empty)
accumulated_data = pd.DataFrame()

# The eight IPC sections, A through H
sections = list('ABCDEFGH')

# Rows collected per section so far
section_counts = dict.fromkeys(sections, 0)

# Upper bound on the rows taken for any single section
max_rows_per_section = 5000

# Function to check that 'claims' holds usable content (not [], NaN, or an empty string)
def is_valid_claim(claims):
    """Return True when a 'claims' cell contains usable content.

    Args:
        claims: The cell value - normally a string (cleaned claim text or the
            string representation of a list), NaN for an empty CSV cell, or
            an already parsed list.

    Returns:
        False for NaN and other non-string scalars, for empty or
        whitespace-only strings, for an empty list, and for a string that
        parses to an empty list ('[]'). True otherwise.
    """
    if isinstance(claims, (list, tuple)):
        return len(claims) > 0
    if not isinstance(claims, str):
        # NaN, None or a number: there is no claim text to work with
        # (calling .strip() on such a value used to raise)
        return False
    if claims.strip() == '':
        return False

    try:
        parsed = ast.literal_eval(claims)
    except (ValueError, SyntaxError):
        # Ordinary claim text is not a Python literal - that is the normal case
        return True

    # A serialized empty list means "no claims"
    return not (isinstance(parsed, list) and len(parsed) == 0)

# Load data in chunks and accumulate data until we have enough rows for each section
chunksize = 500  # Adjust the chunk size if necessary

# Publication numbers seen so far. drop_duplicates() below only compares rows
# inside one chunk, so repeats that land in different chunks are filtered
# against this set.
seen_publications = set()

# Per-section slices are collected here and concatenated once after the loop;
# growing a DataFrame with pd.concat on every iteration re-copies all rows
# gathered so far each time.
collected_frames = []

for chunk in pd.read_csv(file_path, chunksize=chunksize):
    # Remove duplicates within the chunk, then against all earlier chunks
    chunk = chunk.drop_duplicates(subset=['publication_number'])
    chunk = chunk[~chunk['publication_number'].isin(seen_publications)].copy()
    seen_publications.update(chunk['publication_number'])
    if chunk.empty:
        # Every row of this chunk was a repeat
        continue

    # Remove rows where the 'claims' column is empty (i.e., NaN, empty string, or [])
    chunk = chunk[chunk['claims'].apply(is_valid_claim).astype(bool)].copy()

    # Extract the first letter of the IPC code to create the section
    chunk['ipc_section'] = chunk['ipc'].apply(lambda x: x[0] if pd.notna(x) and len(x) > 0 else '')

    # Take rows for every section that is still below its quota
    for section in sections:
        required_rows = max_rows_per_section - section_counts[section]
        if required_rows <= 0:
            continue

        # First `required_rows` rows of this section in the current chunk
        sampled_section_data = chunk[chunk['ipc_section'] == section].head(required_rows)
        collected_frames.append(sampled_section_data)
        section_counts[section] += len(sampled_section_data)

    # Stop reading as soon as every section has reached its quota
    if all(count >= max_rows_per_section for count in section_counts.values()):
        break

# Build the result once; stays an empty DataFrame when nothing was collected
accumulated_data = pd.concat(collected_frames) if collected_frames else pd.DataFrame()

# Print the counts for each section
print("Data count per section:")
print(section_counts)

# Display the shape of the accumulated dataset
print(f"Accumulated dataset contains {accumulated_data.shape[0]} rows.")

# Save the accumulated data to 'even_data_distribution.csv'
output_file_path = 'even_data_distribution.csv'
accumulated_data.to_csv(output_file_path, index=False)

print(f"Data has been saved to {output_file_path}.")


In [None]:
# Bar chart of the rows collected per IPC section
section_labels = list(section_counts.keys())
section_totals = list(section_counts.values())

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x=section_labels,
    y=section_totals,
    hue=section_labels,
    palette='viridis',
    dodge=False,
    legend=False,
)
ax.set_title('Distribution of Rows by IPC Section')
ax.set_xlabel('IPC Section')
ax.set_ylabel('Number of Rows')
ax.grid(True)
plt.show()


In [None]:
# Read back the balanced sample written by the accumulation step
file_path = 'even_data_distribution.csv'
df = pd.read_csv(file_path)

# Report the number of rows as a sanity check
print(f"Loaded {len(df)} rows from the dataset.")

In [None]:
# `file_path` is whatever the preceding cell assigned
if not os.path.exists(file_path):
    print(f"The file {file_path} does not exist.")
else:
    # Load the file and render its first 20 rows inside an ipywidgets Output area
    df = pd.read_csv(file_path)
    table_widget = widgets.Output()
    with table_widget:
        display(df.head(20))

    display(table_widget)

## Generate Data for training and evaluation

In [None]:
# Source data set: swap the two assignments below to use the balanced sample instead
#file_path = 'even_data_distribution.csv' # For even dataset
file_path = 'random_data_distribution.csv' # For random dataset
df = pd.read_csv(file_path)

# Force 'ipc' to text to avoid a TypeError in later string handling
# (missing values become the literal text 'nan' here)
df['ipc'] = df['ipc'].astype(str)

# Destinations of the two splits
train_file_path = 'train_dataset.csv'
eval_file_path = 'eval_dataset.csv'

# Hold out 10% of the rows for evaluation (90% for training); random_state
# fixes the shuffle so the split is reproducible
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

for split_df, split_path in ((train_df, train_file_path), (eval_df, eval_file_path)):
    split_df.to_csv(split_path, index=False)

# Confirm where the splits went
print(f"Training dataset saved to {train_file_path}")
print(f"Evaluation dataset saved to {eval_file_path}")

# Sizes of the two splits, to verify the ratio
print(f"Training dataset size: {train_df.shape[0]} rows")
print(f"Evaluation dataset size: {eval_df.shape[0]} rows")

In [None]:
# Load the training split and show its first rows
new_file_path = 'train_dataset.csv'
df = pd.read_csv(new_file_path)
print(df.head())

# Summarise columns, dtypes and non-null counts.
# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
print("\nData Information:\n")
df.info()

In [None]:
# Number of missing (NaN) entries per column of the DataFrame loaded above
missing_values = df.isna().sum()
print("\nMissing Values in Each Column:\n")
print(missing_values)

In [None]:
# Load the evaluation dataset
eval_file_path = 'eval_dataset.csv'
eval_df = pd.read_csv(eval_file_path)

# Check if the 'ipc' column exists
if 'ipc' in eval_df.columns:
    # Turn 'ipc' into text, mapping missing values to ''. A plain astype(str)
    # converts NaN into the text 'nan', which passes the not-null check and
    # would produce the bogus section 'n'.
    eval_df['ipc'] = eval_df['ipc'].apply(lambda x: '' if pd.isna(x) else str(x))

    # Extract the IPC section (first character; '' when 'ipc' is empty)
    eval_df['ipc_section'] = eval_df['ipc'].apply(lambda x: x[:1])

    # Verify the 'ipc_section' column was created correctly
    print("First few rows of 'ipc_section' column:")
    print(eval_df[['ipc', 'ipc_section']].head())

    # Save the updated dataset back to the original file (overwriting it)
    eval_df.to_csv(eval_file_path, index=False)

    print(f"IPC sections added successfully and saved back to {eval_file_path}")
else:
    print("Error: The 'ipc' column is missing from the dataset.")


In [None]:
# Reload the evaluation split from disk and list its columns
eval_file_path = 'eval_dataset.csv'
eval_df = pd.read_csv(eval_file_path)
print(eval_df.columns)

# Create dataset for prior art search

## Query Data and save in CSV

In [98]:
from epo.tipdata.epab import EPABClient
# (Re)create the `epab` client used by the prior-art cells; `env` selects the database.
# For the test database (10K):
#epab = EPABClient(env='TEST')

# For the complete database:
epab = EPABClient(env='PROD')

In [None]:
## Query the A1 publications in English dated 2024-01-01 to 2024-01-31
## (the date argument below covers January 2024 only)
c = epab.query_publication(number="%", kind_code="A1", date="20240101-20240131", language='EN')
print ('Total number of query:', c)

In [102]:
import pandas as pd

# Fields requested for every hit of the query `c`: publication and filing
# data, the search-report flags, and the citation details
search_fields = "publication.number, application.filing_date, ipc, search_report.date, search_report.ipc_field, search_report.is_no_unity, search_report.is_incomplete_search, search_report.is_no_search, srep_citation.is_patent, srep_citation.document, srep_citation.document_xml, srep_citation.category, srep_citation.relevant_claims, srep_citation.relevant_passage, srep_citation.corresponding_docs"

# At most 50000 results are fetched
search_info = c.get_results(search_fields, limit=50000)

# Persist the raw results without the row index
df_search = pd.DataFrame(search_info)
df_search.to_csv('search_report_info.csv', index=False)

## Modify Column Header

In [None]:
# Reload the raw search-report export
df = pd.read_csv('search_report_info.csv')

# Dotted headers such as 'search_report.ipc_field' become 'search_report_ipc_field'
df.columns = [name.replace('.', '_') for name in df.columns]

# Report the number of records
total_rows = len(df)
print(f"Total number of data rows: {total_rows}")

# Overwrite the same file so later cells see the renamed headers
df.to_csv('search_report_info.csv', index=False)

# Show the renamed headers
print(df.columns)


## Select Data Fields to keep

In [None]:
# Columns kept for the prior-art data set
selected_columns = ['publication_number', 'ipc', 'search_report_ipc_field', 'search_report_is_no_unity', 'search_report_is_incomplete_search',
       'search_report_is_no_search', 'srep_citation']

# Destination of the reduced CSV; the directory is created if it is missing
output_file_name = 'selected_columns_output.csv'
save_directory = './'
os.makedirs(save_directory, exist_ok=True)
full_path = os.path.join(save_directory, output_file_name)

# Requested columns that are absent from the DataFrame are skipped, not raised
existing_columns = [name for name in selected_columns if name in df.columns]
filtered_df = df[existing_columns]

# Write the reduced table and confirm where it went
filtered_df.to_csv(full_path, index=False)
print(f"CSV file '{output_file_name}' generated and saved to: {full_path}")

# Quick look at the first rows of what was written
print("\nPreview of the generated CSV file:")
print(filtered_df.head())


## Remove Data with Empty fields

In [None]:
import pandas as pd

# Input (reduced search-report table) and output of this filtering step
file_path = 'selected_columns_output.csv'
output_file_path = 'filtered_search_report_info.csv'

data = pd.read_csv(file_path)

# Keep every row whose 'srep_citation' is not the empty-list text '[]'
has_citations = data['srep_citation'].ne('[]')
srep_citation_filtered = data[has_citations]

srep_citation_filtered.to_csv(output_file_path, index=False)

# Confirmation plus the column headers of the result
print(f"Filtered data saved to {output_file_path}")
print("Filtered DataFrame Header:", srep_citation_filtered.columns)

# Number of rows that survived the filter
total_rows = len(srep_citation_filtered)
print(f"Total number of data rows: {total_rows}")

# First 20 rows of the result
print("\nFirst 20 rows of filtered data:")
print(srep_citation_filtered.head(20))


## Clean Data

In [None]:
# CSV written by the empty-citation filter
file_path = 'filtered_search_report_info.csv'

if not os.path.exists(file_path):
    print(f"The file {file_path} does not exist.")
else:
    # Load the file and render its first three rows inside an ipywidgets Output area
    df = pd.read_csv(file_path)
    table_widget = widgets.Output()
    with table_widget:
        display(df.head(3))

    display(table_widget)

In [107]:
# Function to clean the 'IPC' column
def clean_ipc_column(ipc_data):
    """Turn a serialized list of IPC entries into a comma-separated string.

    Args:
        ipc_data: String representation of a list of dictionaries, each
            carrying the classification under the 'symbol' key.

    Returns:
        The symbols joined with ', ', each stripped of every character that
        is not a letter, a digit or '/'. If the value cannot be parsed or does
        not have the expected shape (NaN, free text, entries that are not
        dictionaries, a missing or non-string 'symbol'), the input is
        returned unchanged.
    """
    try:
        # Convert string representation of list to actual list of dictionaries
        ipc_list = ast.literal_eval(ipc_data)

        # Keep only alphanumerics and '/' in every symbol
        cleaned_ipc = [re.sub(r'[^A-Za-z0-9/]', '', entry['symbol']) for entry in ipc_list]

        return ', '.join(cleaned_ipc)
    except (ValueError, SyntaxError, KeyError, TypeError):
        # TypeError covers literals of the wrong shape (non-dict entries,
        # non-iterable values, non-string symbols); like the other errors it
        # falls back to the original data instead of aborting the apply()
        return ipc_data



In [108]:
# Function to clean the 'search_report_ipc_field' column
def clean_search_report_ipc_field(ipc_data):
    """Turn a serialized list of IPC field strings into a comma-separated string.

    Args:
        ipc_data: String representation of a list of strings.

    Returns:
        The entries joined with ', ', each stripped of every character that is
        not a letter, a digit or '/'. If the value cannot be parsed or is not
        a list of strings (NaN, free text, non-string entries), the input is
        returned unchanged.
    """
    try:
        # Convert string representation of list to actual list
        ipc_list = ast.literal_eval(ipc_data)

        # Keep only alphanumerics and '/' in every entry
        cleaned_ipc = [re.sub(r'[^A-Za-z0-9/]', '', entry) for entry in ipc_list]

        return ', '.join(cleaned_ipc)
    except (ValueError, SyntaxError, KeyError, TypeError):
        # TypeError covers non-iterable literals and non-string entries; like
        # the other errors it falls back to the original data
        return ipc_data


In [109]:
# Function to clean up the srep_citation field
def clean_srep_citation(citation_data):
    """Extract the cited patent identifiers from a serialized citation list.

    Args:
        citation_data: String representation of a list of dictionaries; the
            identifier is read from the dnum="..." attribute inside each
            entry's 'document_xml' value (e.g. US10976829B1).

    Returns:
        The identifiers joined with ', ' in list order. Entries that are not
        dictionaries, have no string 'document_xml', or carry no dnum
        attribute are skipped. An empty string is returned when the value
        cannot be parsed, is not a list, or yields no identifier.
    """
    try:
        # Convert string representation of list to an actual list of dictionaries
        citation_list = ast.literal_eval(citation_data)
    except (ValueError, SyntaxError):
        # NaN or text that is not a Python literal: no usable citations
        return ''

    if not isinstance(citation_list, (list, tuple)):
        return ''

    # Captures the upper-case letters and digits inside dnum="..."
    patent_id_pattern = re.compile(r'dnum="([A-Z0-9]+)"')

    cleaned_citations = []
    for entry in citation_list:
        if not isinstance(entry, dict):
            continue
        document_xml = entry.get('document_xml')
        if not isinstance(document_xml, str):
            # Missing or None: skip this entry only, keep the rest of the row
            continue
        match = patent_id_pattern.search(document_xml)
        if match:
            cleaned_citations.append(match.group(1))

    # Return a comma-separated string of patent identifiers ('' when none)
    return ', '.join(cleaned_citations)


In [110]:
# Function to extract the document numbers from the cleaned srep_citation
def extract_document_numbers(cleaned_citation):
    """Extract the numeric part of every patent identifier in a string.

    An identifier is two upper-case letters, digits, one upper-case letter and
    an optional digit (e.g. US10976829B1); only the digits between the
    two-letter prefix and the trailing kind letter are kept.

    Args:
        cleaned_citation: Comma-separated identifiers as produced by
            clean_srep_citation.

    Returns:
        The document numbers joined with ', ', or '' when none are found or
        the input is not a string (e.g. NaN for an empty CSV cell).
    """
    if not isinstance(cleaned_citation, str):
        # The regex functions raise TypeError on non-strings, which the old
        # except clause did not cover
        return ''

    # findall() with a single group returns the captured digit runs
    document_numbers = re.findall(r'[A-Z]{2}(\d+)[A-Z]\d?', cleaned_citation)

    return ', '.join(document_numbers)


In [None]:
import pandas as pd

# Input, and the two files produced by this pass
input_file_path = 'filtered_search_report_info.csv'
output_file_path = 'cleaned_columns_output_prior_art.csv'
document_numbers_file_path = 'documents_to_fetch.csv'

# The input is read and processed in pieces of this many rows
chunk_size = 5000
chunks = pd.read_csv(input_file_path, chunksize=chunk_size)

# Each flag stays False until the first chunk has created the corresponding
# file and written its header
header_written = False
document_header_written = False

for chunk in chunks:
    # Normalise the classification fields and reduce the citations to identifiers
    chunk['ipc'] = chunk['ipc'].apply(clean_ipc_column)
    chunk['search_report_ipc_field'] = chunk['search_report_ipc_field'].apply(clean_search_report_ipc_field)
    chunk['srep_citation'] = chunk['srep_citation'].apply(clean_srep_citation)
    chunk['document_numbers'] = chunk['srep_citation'].apply(extract_document_numbers)

    # Cleaned table: identifier, both IPC fields, citations and their numbers
    cleaned_chunk = chunk[['publication_number', 'ipc', 'search_report_ipc_field', 'srep_citation', 'document_numbers']]
    cleaned_chunk.to_csv(
        output_file_path,
        index=False,
        mode='a' if header_written else 'w',
        header=not header_written,
    )
    header_written = True

    # Separate file holding only the document numbers
    document_numbers_chunk = chunk[['document_numbers']]
    document_numbers_chunk.to_csv(
        document_numbers_file_path,
        index=False,
        mode='a' if document_header_written else 'w',
        header=not document_header_written,
    )
    document_header_written = True

# Print a success message
print(f"Cleaned data saved to {output_file_path}!")
print(f"Document numbers saved to {document_numbers_file_path}!")


In [None]:
# CSV written by the chunked prior-art cleaning pass
file_path = 'cleaned_columns_output_prior_art.csv'

if not os.path.exists(file_path):
    print(f"The file {file_path} does not exist.")
else:
    # Load the file and render its first 300 rows inside an ipywidgets Output area
    df = pd.read_csv(file_path)
    table_widget = widgets.Output()
    with table_widget:
        display(df.head(300))

    display(table_widget)

In [None]:
import pandas as pd

# Define the file path for the document numbers CSV
document_numbers_file_path = 'documents_to_fetch.csv'

# Read the document numbers as text. Without dtype=str pandas infers the column
# type, and a column that happens to look numeric (no row with more than one
# number) would make the `.str` accessor below raise.
data = pd.read_csv(document_numbers_file_path, dtype=str)

# Display initial number of rows
initial_row_count = data.shape[0]
print(f"Initial number of rows: {initial_row_count}")

# Split the comma-separated 'document_numbers' of every row into one column per number
data_expanded = data['document_numbers'].str.split(',', expand=True)

# Melt the dataframe so that all the split values are in a single 'document_number' column
data_melted = data_expanded.melt(value_name='document_number').dropna()

# Remove leading and trailing spaces from document numbers
data_melted['document_number'] = data_melted['document_number'].str.strip()

# Drop empty or missing document numbers, then duplicates
data_cleaned = data_melted[['document_number']].dropna().replace('', pd.NA).dropna().drop_duplicates().reset_index(drop=True)

# Display final number of rows
final_row_count = data_cleaned.shape[0]
print(f"Final number of rows: {final_row_count}")

# Display the first 10 rows of the final output
print("\nFirst 10 rows of the final output:")
print(data_cleaned.head(10))

# Save the unique document numbers.
# NOTE(review): this overwrites the input file with a single 'document_number'
# column, so re-running this cell without regenerating the file fails on the
# 'document_numbers' lookup above.
data_cleaned.to_csv('documents_to_fetch.csv', index=False)


## Create Train Dataset


In [None]:
# Read the CSV file with the unique cited document numbers
file_path = 'documents_to_fetch.csv'
df = pd.read_csv(file_path)

# Extract the document_number column
document_numbers_df = df[['document_number']]

# Result frames of the individual queries. They are concatenated once after
# the loop; growing a DataFrame with pd.concat on every iteration re-copies
# everything fetched so far each time.
fetched_frames = []

# Counters for successful and unsuccessful fetches
successful_fetches = 0
unsuccessful_fetches = 0

# Query every document number, one request per number
for document_number in document_numbers_df['document_number']:
    document_number = str(document_number)  # The query takes the number as text

    q = epab.query_publication(number=document_number, kind_code="", date="")
    result = q.get_results("publication.number, ipc, claims")
    #result = q.get_results("publication.number, ipc, claims, abstract.text, description.text")
    df_data = pd.DataFrame(result)

    # Only non-empty results count as a successful fetch
    if not df_data.empty:
        fetched_frames.append(df_data)
        successful_fetches += 1
    else:
        unsuccessful_fetches += 1

# Build the final table; stays an empty DataFrame when nothing was found
final_df = pd.concat(fetched_frames, ignore_index=True) if fetched_frames else pd.DataFrame()

final_df.to_csv('fetched_documents.csv')
# Print the total number of successful and unsuccessful fetches
print(f"Total successful fetches: {successful_fetches}")
print(f"Total unsuccessful fetches: {unsuccessful_fetches}")

# Create Validation Dataset

In [None]:
# Number of rows to take from the cleaned prior-art table
number_of_data = 10000

# Read the first `number_of_data` rows of the cleaned prior-art table
file_path = 'cleaned_columns_output_prior_art.csv'
df = pd.read_csv(file_path, nrows=number_of_data)

# Extract the publication_number column
document_numbers_df = df[['publication_number']]

# Result frames of the individual queries. They are concatenated once after
# the loop; growing a DataFrame with pd.concat on every iteration re-copies
# everything fetched so far each time.
fetched_frames = []

# Counters for successful and unsuccessful fetches
successful_fetches = 0
unsuccessful_fetches = 0

# Query every publication number, one request per number
for document_number in document_numbers_df['publication_number']:
    document_number = str(document_number)  # The query takes the number as text

    q = epab.query_publication(number=document_number, kind_code="", date="", language='EN' )
    result = q.get_results("publication.number, publication.language, ipc, claims")
    #result = q.get_results("publication.number, ipc, claims, abstract.text, description.text")
    df_data = pd.DataFrame(result)

    # Only non-empty results count as a successful fetch
    if not df_data.empty:
        fetched_frames.append(df_data)
        successful_fetches += 1
    else:
        unsuccessful_fetches += 1

# Build the final table; stays an empty DataFrame when nothing was found
final_df = pd.concat(fetched_frames, ignore_index=True) if fetched_frames else pd.DataFrame()

final_df.to_csv('prior_art_validation_dataset.csv')
# Print the total number of successful and unsuccessful fetches
print(f"Total successful fetches: {successful_fetches}")
print(f"Total unsuccessful fetches: {unsuccessful_fetches}")