In [2]:
import joblib
from sentence_transformers import SentenceTransformer
import pandas as pd
import os
import time

# Define directory paths (you can modify these for your local setup)
main_script_dir = r'E:/Code Fest 2024/Codes/Search report'
application_dir = r'E:/Code Fest 2024/Codes/Search report/Application Forms/LIGAMENT ASSEMBLY'

# Load the saved classifier and label encoder from the main script's directory
classifier_file_path = os.path.join(main_script_dir, 'ipc_section_classifier.pkl')
label_encoder_file_path = os.path.join(main_script_dir, 'ipc_section_label_encoder.pkl')

# Load the classifier and label encoder
loaded_classifier = joblib.load(classifier_file_path)
loaded_label_encoder = joblib.load(label_encoder_file_path)

# Load the SentenceTransformer model for embeddings
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

# Define the input CSV file path
csv_file_path = os.path.join(application_dir, f"{os.path.basename(application_dir)}.csv")

# Check if the correct CSV file is being read
if not os.path.exists(csv_file_path):
    print(f"Error: CSV file not found at {csv_file_path}")
else:
    print(f"Reading CSV file from {csv_file_path}")

# Load the CSV into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Debugging: Print the DataFrame to check if it's loaded correctly
print("CSV Data Loaded:")
print(df.head())  # Print the first few rows of the DataFrame

# Ensure the necessary columns are present in the CSV
if 'claims' not in df.columns or 'title' not in df.columns:
    print(f"Error: 'claims' or 'title' column not found in {csv_file_path}.")
else:
    # Proceed with prediction if columns exist
    predictions = []

    # Generate a publication number as an integer based on the current time
    publication_number = int(time.time())

    # Iterate through each row in the dataframe
    for index, row in df.iterrows():
        claims = row['claims']
        title = row['title']
        
        if claims:
            print(f"Processing claims: {claims}")
            
            # Encode the input claims into embeddings
            claims_embedding = model.encode([claims])

            # Predict using the loaded classifier
            predicted_encoded_section = loaded_classifier.predict(claims_embedding)

            # Decode the predicted section back to the IPC format
            predicted_section = loaded_label_encoder.inverse_transform(predicted_encoded_section)

            # Append the results to the predictions list, including the generated publication number
            predictions.append([publication_number, title, predicted_section[0]])
            print(f"Prediction written: {predicted_section[0]}")

    # Convert predictions to a DataFrame
    predictions_df = pd.DataFrame(predictions, columns=['publication_number', 'title', 'ipc'])

    # Debugging: Display the predictions DataFrame
    print("Predictions DataFrame:")
    display(predictions_df)

    # Save to a CSV file (for local output in Jupyter environment)
    output_csv_file_path = os.path.join(application_dir, 'test_ipc_codes.csv')
    predictions_df.to_csv(output_csv_file_path, mode='w', header=True, index=False)

    print(f"Predictions successfully saved in {output_csv_file_path}")


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
