# Install Necessary Libraries

In [None]:
# Install the third-party packages this notebook imports.
import os
import subprocess
import sys

# Every distribution imported further down, including the ones the original
# list missed (pandas, sentence-transformers, joblib, tqdm).
packages = [
    'numpy',
    'pandas',
    'ipywidgets',
    'torch',
    'matplotlib',
    'scikit-learn',
    'seaborn',
    'transformers',
    'sentence-transformers',
    'datasets',
    'evaluate',
    'joblib',
    'tqdm',
]

# Install each package.
# `sys.executable -m pip` installs into the environment of the interpreter that
# runs this kernel; a bare `pip` found on PATH may belong to another Python.
# An argument list (no shell) also avoids any shell-quoting surprises.
failed_packages = []
for package in packages:
    result = subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    if result.returncode != 0:
        # Keep going (best effort), but remember the failure so it is reported
        failed_packages.append(package)

if failed_packages:
    print(f"WARNING: pip could not install: {', '.join(failed_packages)}")


# Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display, clear_output, HTML
import ipywidgets as widgets
import torch
from torch import nn
import math
import matplotlib.pyplot as plt
from collections import Counter
import re
import csv
import xml.etree.ElementTree as ET
import ast
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModel
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import torch
import seaborn as sns
import evaluate
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.utils.data import Dataset, DataLoader
import warnings
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm

# Check if GPU is available

In [2]:
# Choose the torch device: fall back to the CPU unless CUDA reports a usable GPU
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("Using CPU")
else:
    device = torch.device('cuda')
    # Index 0 is the first visible CUDA device
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

Using GPU: Tesla V100-SXM2-16GB


In [None]:
# List the fields exposed by the `epab` object (shown as the cell output).
# NOTE(review): `epab` is never imported or assigned anywhere in the visible
# notebook — presumably a patent-data client created in a setup cell that is
# missing here. Confirm and restore that cell; as written this line raises
# NameError on a fresh kernel.
epab.fields()

In [None]:
## Query for the data that are published in English

In [None]:
# Alternative query kept for reference (presumably matches every publication
# number — TODO confirm):
#q = epab.query_publication(number="%")
# Query the publications whose publication language is "EN".
# NOTE(review): `epab` is not defined anywhere in the visible notebook; it must
# be created before this cell can run.
q = epab.query_publication_language("EN")
# Fetch the results restricted to the "application.number" field; being the
# last expression, the return value is displayed as the cell output.
q.get_results("application.number")

In [None]:
# Load the raw query export from the working directory
df = pd.read_csv('q_results_output_2.csv')

# Swap every literal '.' in the header names for '_'
# (regex=False, so '.' is a plain character rather than a wildcard)
renamed_headers = df.columns.str.replace('.', '_', regex=False)
df.columns = renamed_headers

# Report how many data rows were loaded
total_rows = len(df)
print(f"Total number of data rows: {total_rows}")

# Overwrite the source file so it carries the renamed headers
df.to_csv('q_results_output_2.csv', index=False)

# Show the resulting column names
print(df.columns)


In [None]:
# Read the exported query results from their absolute location and list the
# columns that are available
query_results_path = 'E:/Code Fest 2024/q_results_output_2.csv'
df = pd.read_csv(query_results_path)
print(df.columns)

In [None]:
# Columns to carry over into the reduced file
selected_columns = ['publication_number', 'ipc', 'claims']

# Keep the requested columns that are actually present in the frame;
# requested columns that are absent are skipped without an error
existing_columns = list(filter(lambda name: name in df.columns, selected_columns))
filtered_df = df[existing_columns]

# Destination: the directory (created on demand) plus the file name
save_directory = 'E:/Code Fest 2024/'
output_file_name = 'selected_columns_output.csv'
os.makedirs(save_directory, exist_ok=True)
full_path = os.path.join(save_directory, output_file_name)

# Write the reduced table without the index column and confirm the location
filtered_df.to_csv(full_path, index=False)
print(f"CSV file '{output_file_name}' generated and saved to: {full_path}")

# Quick look at the first rows that were written
print("\nPreview of the generated CSV file:")
print(filtered_df.head())


In [None]:
# Reload the column-reduced export from disk
selected_columns_csv = 'E:/Code Fest 2024/selected_columns_output.csv'
df = pd.read_csv(selected_columns_csv)

# Function to filter claims for English language
def filter_english_claims(claims):
    """Return only the English-language entries of one record's claims.

    Parameters
    ----------
    claims : str, list, tuple, or missing value
        Either the string representation of a list of claim dictionaries (the
        form in which the column is stored in the CSV) or an already-parsed
        sequence of such dictionaries. Missing values (``None``, ``NaN``) and
        blank strings are treated as "no claims".

    Returns
    -------
    list
        The claim dictionaries whose ``'language'`` value is ``'EN'``, in their
        original order. Empty when there is nothing to keep.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a non-blank string is not a
        valid Python literal.
    """
    if isinstance(claims, str):
        if not claims.strip():
            return []
        # literal_eval only evaluates Python literals, never arbitrary code
        claims_list = ast.literal_eval(claims)
    elif isinstance(claims, (list, tuple)):
        claims_list = claims
    else:
        # pandas hands over NaN (a float) or None for an empty cell
        return []

    # A lone dictionary is treated as a one-element list of claims
    if isinstance(claims_list, dict):
        claims_list = [claims_list]
    if not isinstance(claims_list, (list, tuple)):
        return []

    # .get() tolerates entries that carry no 'language' key
    return [
        claim for claim in claims_list
        if isinstance(claim, dict) and claim.get('language') == 'EN'
    ]

# Swap each row's raw claims for its English-only subset, writing the result
# straight back into the 'claims' column (no temporary column needed)
df['claims'] = df['claims'].apply(filter_english_claims)

# Persist the filtered table to 'filtered_english_claims.csv' without the index
df.to_csv('E:/Code Fest 2024/filtered_english_claims.csv', index=False)

# Show the resulting frame
print(df)


# Train Model

In [3]:
# Load the cleaned dataset
file_path = 'train_dataset.csv'
df = pd.read_csv(file_path)

# Initialize the SentenceTransformer model used to embed the claims
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

# Keep only rows whose 'claims' value is neither NaN nor blank.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
has_claims = df['claims'].notna() & (df['claims'].str.strip() != '')
df = df[has_claims].copy()

# The IPC section is the first character of the IPC code. str() keeps this
# working for non-string values; [:1] yields '' for an empty code.
# NOTE(review): rows with a missing IPC code get the label '' and therefore
# form a class of their own — confirm whether such rows should be dropped.
df['ipc_section'] = df['ipc'].apply(lambda x: str(x)[:1] if pd.notna(x) else '')

# Encode the section labels as integers (0 .. n_classes - 1)
label_encoder_section = LabelEncoder()
df['encoded_section'] = label_encoder_section.fit_transform(df['ipc_section'])

# Hold out 20% of the rows as the test split; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    df['claims'], df['encoded_section'], test_size=0.2, random_state=42
)

# Encode the claims of both splits into embeddings using PatentSBERTa
X_train_embeddings = model.encode(X_train.tolist())
X_test_embeddings = model.encode(X_test.tolist())

# Logistic regression on top of the sentence embeddings
classifier = LogisticRegression(max_iter=1000)

# 5-fold stratified cross-validation over the training split
skf = StratifiedKFold(n_splits=5)

# Per-fold metric histories (one entry is appended per fold)
accuracy_list, precision_list, recall_list, f1_list = [], [], [], []

# Wrap the fold generator in a progress bar
fold_iterator = tqdm(
    skf.split(X_train_embeddings, y_train),
    total=skf.get_n_splits(),
    desc="Cross-validation Folds",
)
for fold, (train_index, test_index) in enumerate(fold_iterator):
    # The embeddings are indexed directly with the fold's index arrays;
    # the label Series needs positional .iloc indexing
    X_train_fold = X_train_embeddings[train_index]
    X_test_fold = X_train_embeddings[test_index]
    y_train_fold = y_train.iloc[train_index]
    y_test_fold = y_train.iloc[test_index]

    # Fit on this fold's training part, predict its held-out part
    classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold = classifier.predict(X_test_fold)

    # Score the fold
    accuracy = accuracy_score(y_test_fold, y_pred_fold)
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')

    # Record the scores in their histories
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # One summary line per fold
    print(f"Fold {fold+1}/{skf.get_n_splits()} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Persist a snapshot of the classifier and the label encoder.
# At this point `classifier` holds the fit from the last cross-validation fold;
# it is refit on the full training split further down in this cell.
classifier_file_path = 'Models/ipc_section_classifier.pkl'
label_encoder_file_path = 'Models/ipc_section_label_encoder.pkl'

# joblib.dump does not create missing directories, so make sure 'Models' exists
os.makedirs(os.path.dirname(classifier_file_path), exist_ok=True)

joblib.dump(classifier, classifier_file_path)
print(f"Classifier model saved to {classifier_file_path}")

joblib.dump(label_encoder_section, label_encoder_file_path)
print(f"Label encoder saved to {label_encoder_file_path}")

# Plot every metric's per-fold history on a single figure
epochs = range(1, len(accuracy_list) + 1)

plt.figure(figsize=(12, 8))
metric_series = (
    ('Accuracy', accuracy_list),
    ('Precision', precision_list),
    ('Recall', recall_list),
    ('F1-score', f1_list),
)
for metric_label, metric_values in metric_series:
    plt.plot(epochs, metric_values, label=metric_label, marker='o')

# Axis labels, title, legend and grid
plt.xlabel('Fold (Simulating Epochs)')
plt.ylabel('Metric Value')
plt.title('Training Metrics Over Cross-Validation Folds')
plt.legend()
plt.grid(True)
plt.show()

# Refit on the complete training split, then score the held-out test split
print("\nTraining final classifier on full training set...")
n_train_samples = len(X_train_embeddings)
with tqdm(total=n_train_samples, desc="Final Training Progress") as pbar:
    # fit() is one blocking call, so the bar is advanced in a single step
    # once it has returned
    classifier.fit(X_train_embeddings, y_train)
    pbar.update(n_train_samples)

# Predictions for the test embeddings
y_pred = classifier.predict(X_test_embeddings)

# Test-split metrics (precision, recall and F1 use average='weighted')
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Report the four scores
print(f"Test Accuracy: {accuracy}")
print(f"Test Precision: {precision}")
print(f"Test Recall: {recall}")
print(f"Test F1-score: {f1}")

# Integer labels for every class the encoder knows, in the order of `classes_`
# (the encoded labels are the positions 0 .. n_classes - 1).
section_labels = np.arange(len(label_encoder_section.classes_))
section_names = label_encoder_section.classes_

# Classification report.
# Passing `labels` keeps the report aligned with `target_names` even when a
# class is absent from both y_test and y_pred; without it the number of classes
# found in the data can differ from len(target_names) and the call raises a
# ValueError. zero_division=0 reports 0 for such classes without a warning.
print(classification_report(
    y_test,
    y_pred,
    labels=section_labels,
    target_names=section_names,
    zero_division=0,
))

# Confusion matrix for the final evaluation. The same `labels` guarantee one
# row/column per known class, so the tick labels line up with the matrix.
cm = confusion_matrix(y_test, y_pred, labels=section_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=section_names, yticklabels=section_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix for IPC Section Prediction')
plt.show()

# Persist the classifier that was refit on the full training split
final_classifier_file_path = 'Models/final_ipc_section_classifier.pkl'

# joblib.dump does not create missing directories, so make sure 'Models' exists
os.makedirs(os.path.dirname(final_classifier_file_path), exist_ok=True)

joblib.dump(classifier, final_classifier_file_path)
print(f"Final classifier model saved to {final_classifier_file_path}")

# The evaluation and interactive cells load 'Models/ipc_section_classifier.pkl'.
# That file was written earlier in this cell, before the refit on the full
# training split, so refresh it with the final model to keep both in sync.
joblib.dump(classifier, 'Models/ipc_section_classifier.pkl')
print("Refreshed Models/ipc_section_classifier.pkl with the final classifier")


In [None]:
# Target files for the classifier and for the IPC-section label encoder
classifier_file_path = 'Models/ipc_section_classifier.pkl'
label_encoder_file_path = 'Models/ipc_section_label_encoder.pkl'

# Write the current `classifier` and `label_encoder_section` objects to disk
joblib.dump(classifier, classifier_file_path)
joblib.dump(label_encoder_section, label_encoder_file_path)

# Confirm where each artefact went
print(f"Model saved to {classifier_file_path}")
print(f"Label encoder saved to {label_encoder_file_path}")


In [None]:
# Load the evaluation dataset
eval_file_path = 'eval_dataset.csv'
eval_df = pd.read_csv(eval_file_path)

# Check if the 'ipc' column exists
if 'ipc' in eval_df.columns:
    # Normalise 'ipc' to text. Missing values become '' *before* the
    # conversion: a plain astype(str) would turn NaN into the literal string
    # 'nan', and its first character would then be stored as the bogus
    # section 'n'.
    eval_df['ipc'] = eval_df['ipc'].map(lambda value: '' if pd.isna(value) else str(value))

    # The IPC section is the first character; [:1] yields '' for an empty code
    eval_df['ipc_section'] = eval_df['ipc'].map(lambda code: code[:1])

    # Verify the 'ipc_section' column was created correctly
    print("First few rows of 'ipc_section' column:")
    print(eval_df[['ipc', 'ipc_section']].head())

    # Save the updated dataset back to the original file (overwriting it)
    eval_df.to_csv(eval_file_path, index=False)

    print(f"IPC sections added successfully and saved back to {eval_file_path}")
else:
    print("Error: The 'ipc' column is missing from the dataset.")


In [None]:
# Re-read the evaluation dataset and list its columns
eval_file_path = 'eval_dataset.csv'
eval_df = pd.read_csv(eval_file_path)
eval_columns = eval_df.columns
print(eval_columns)

In [None]:
# Load the trained classifier and the label encoder.
# NOTE: joblib.load unpickles the files, which can execute arbitrary code —
# only load artefacts that were produced by this notebook.
classifier = joblib.load('Models/ipc_section_classifier.pkl')
label_encoder_section = joblib.load('Models/ipc_section_label_encoder.pkl')

# Load the evaluation dataset
eval_file_path = 'eval_dataset.csv'
eval_df = pd.read_csv(eval_file_path)

# Initialize the SentenceTransformer model (PatentSBERTa)
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

# Keep only rows whose 'claims' value is neither NaN nor blank.
# .copy() turns the filtered slice into an independent frame, so columns can
# be added to it later without pandas' SettingWithCopyWarning.
has_claims = eval_df['claims'].notna() & (eval_df['claims'].str.strip() != '')
eval_df = eval_df[has_claims].copy()

# Encode the 'claims' from eval_dataset.csv into embeddings
eval_claims_embeddings = model.encode(eval_df['claims'].tolist())

# Predict the encoded labels, then map them back to the original IPC section
# labels through the label encoder
predicted_encoded_sections = classifier.predict(eval_claims_embeddings)
predicted_sections = label_encoder_section.inverse_transform(predicted_encoded_sections)

# Store the predictions next to the recorded sections and flag the matches
# (the 'ipc_section' column must already be present in the loaded dataset)
eval_df['predicted_section'] = predicted_sections
eval_df['correct_prediction'] = eval_df['predicted_section'].eq(eval_df['ipc_section'])

# Results table: identifiers, inputs, recorded section, prediction, match flag
report_columns = [
    'publication_number',
    'ipc',
    'claims',
    'ipc_section',
    'predicted_section',
    'correct_prediction',
]
result_df = eval_df[report_columns]

# Print the first 10 rows of the results table
print(result_df.head(10))

# Accuracy = share of rows whose predicted section equals the recorded one
total_predictions = len(result_df)
correct_predictions = result_df['correct_prediction'].sum()
accuracy = correct_predictions / total_predictions

# Report the counts and the accuracy as a percentage
print(f"\nTotal predictions: {total_predictions}")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy * 100:.2f}%")

# Write the results table to a new CSV file (index column omitted)
result_file_path = 'predicted_eval_dataset.csv'
result_df.to_csv(result_file_path, index=False)

print(f"\nResults saved to {result_file_path}")

# Cell output: up to the first 100 rows of the results table
result_df.head(100)


In [None]:
# Locations of the persisted classifier and label encoder
classifier_file_path = 'Models/ipc_section_classifier.pkl'
label_encoder_file_path = 'Models/ipc_section_label_encoder.pkl'

# Restore both objects from disk, classifier first
loaded_classifier, loaded_label_encoder = (
    joblib.load(artifact_path)
    for artifact_path in (classifier_file_path, label_encoder_file_path)
)

# Sentence-embedding model used to turn the typed claims into vectors
model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

# Function to take user input and predict the IPC section
def predict_ipc_section():
    """Interactively predict the IPC section for claims typed by the user.

    Prompts on standard input in a loop. Each non-blank entry is embedded with
    the module-level ``model``, classified with ``loaded_classifier`` and
    decoded with ``loaded_label_encoder``; the decoded section is printed.

    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored) or when input is closed or interrupted (EOF / Ctrl-C).
    Blank entries are skipped instead of being sent to the model.

    Returns
    -------
    None
    """
    while True:
        try:
            # Trim surrounding whitespace so ' exit ' and blank lines are recognised
            user_input = input("Enter the claims (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or interrupted input: leave the prompt without a traceback
            print("\nExiting the prediction prompt.")
            break

        if user_input.lower() == 'exit':
            print("Exiting the prediction prompt.")
            break

        if not user_input:
            # Nothing to classify; ask again
            print("No claims entered. Please type the claims or 'exit'.")
            continue

        # Encode the input claims into embeddings (a batch of one)
        claims_embedding = model.encode([user_input])

        # Predict using the loaded classifier
        predicted_encoded_section = loaded_classifier.predict(claims_embedding)

        # Decode the predicted label back to the IPC section
        predicted_section = loaded_label_encoder.inverse_transform(predicted_encoded_section)

        # Show the prediction
        print(f"Predicted IPC Section: {predicted_section[0]}")

# Start the interactive prompt. The call reads from standard input in a loop,
# so this cell keeps running until the user types 'exit'.
predict_ipc_section()
