# **pre processing and cleaning**

In [33]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [34]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [35]:
# Load your patent dataset
data = pd.read_excel('/content/path_to_output_file.xlsx')  # Replace 'your_patent_dataset.csv' with your dataset file

In [36]:
# Function for text preprocessing
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase

    # Remove punctuation and numbers
    table = str.maketrans('', '', string.punctuation + string.digits)
    tokens = [word.translate(table) for word in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

In [37]:
# Preprocess 'Title' and 'Abstract' columns
data['Title_Processed'] = data['Title'].fillna('').apply(preprocess_text)
data['Abstract_Processed'] = data['Abstract'].fillna('').apply(preprocess_text)

In [38]:
# Clean the data
data['Title_Lower'] = data['Title'].str.lower()  # Create a lowercase version of 'Title'
data['Abstract_Lower'] = data['Abstract'].str.lower()  # Create a lowercase version of 'Abstract'
data.drop_duplicates(subset=['Title_Lower', 'Abstract_Lower'], inplace=True)  # Remove case-insensitive duplicates
data.drop(['Title_Lower', 'Abstract_Lower'], axis=1, inplace=True)  # Drop temporary lowercase columns

In [39]:
# Display the cleaned and preprocessed data
print(data[['Title_Processed', 'Abstract_Processed']].head())

                                     Title_Processed  \
0  [aerial, drone, companion, device, method, ope...   
2  [onboard, drone, humanmachine, interface, auto...   
3  [autonomous, navigation, unmanned, aerial, veh...   
4  [user, equipment, , system, , control, method,...   
6                                [autonomous, drone]   

                                  Abstract_Processed  
0  [method, operating, aerial, drone, companion, ...  
2  [autonomous, drone, system, us, onboard, comma...  
3  [system, autonomous, navigation, unmanned, aer...  
4  [provided, user, equipment, controlling, drone...  
6  [autonomous, drone, provided, , remote, contro...  


In [43]:
# Save the cleaned and preprocessed data to an Excel file
data.to_excel('cleaned_data.xlsx', index=False)

## **#Building a Patent Search Engine: Use an inverted index data structure to create a patent search engine. Implement advanced search capabilities, such as Boolean search, ranking algorithms, and phrase matching.**

In [44]:

from collections import defaultdict
import re

In [45]:
class InvertedIndex:
    def __init__(self):
        self.index = defaultdict(set)

    def add_document(self, doc_id, doc_content):
        for word in re.findall(r'\w+', doc_content.lower()):
            self.index[word].add(doc_id)

    def search(self, query):
        result_set = set()
        for word in query.split():
            if word in self.index:
                result_set.update(self.index[word])
            else:
                return set()  # Return empty set if any word is not found in the index
        return result_set

In [47]:
class PatentSearchEngine:
    def __init__(self):
        self.inverted_index = InvertedIndex()
        self.patent_data = {}  # Dictionary to store patent data

    def add_patent(self, patent_id, patent_title, patent_abstract):
        self.inverted_index.add_document(patent_id, patent_title + ' ' + patent_abstract)
        self.patent_data[patent_id] = {'Title': patent_title, 'Abstract': patent_abstract}

    def search_patents_by_title(self, query):
        patent_ids = self.inverted_index.search(query.lower())
        return [self.patent_data[id]['Abstract'] for id in patent_ids]

    def search_most_relevant_patents(self, query):
        patent_ids = self.inverted_index.search(query.lower())
        ranked_patents = []
        for id in patent_ids:
            title = self.patent_data[id]['Title']
            abstract = self.patent_data[id]['Abstract']
            # Score based on keyword matching in title and abstract
            score = self._calculate_score(query, title, abstract)
            ranked_patents.append((id, title, abstract, score))
        ranked_patents.sort(key=lambda x: x[3], reverse=True)  # Sort by score
        return ranked_patents

In [48]:
def _calculate_score(self, query, title, abstract):
        query_words = set(query.lower().split())
        title_words = set(title.lower().split())
        abstract_words = set(re.findall(r'\w+', abstract.lower()))
        matching_words = title_words.union(abstract_words).intersection(query_words)
        return len(matching_words)

In [57]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=d6d8c012dfaab84dc41bc8b5df91025d969e7fe358181544d2b47c7428d809ca
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

In [58]:
from sentence_transformers import SentenceTransformer

In [59]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
# Function to encode text into semantic vectors
def encode_text(text):
    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    outputs = model(**inputs)
     # Use the pooled output for representing the entire sentence
    sentence_embedding = outputs.pooler_output
    return sentence_embedding.detach().numpy()

In [61]:
# Function for semantic search
def semantic_search(query, documents):
    # Encode the query and the documents
    query_vector = encode_text(query)
    doc_vectors = [encode_text(doc) for doc in documents]

    # Compute cosine similarities between query and documents
    similarities = cosine_similarity(query_vector, doc_vectors)

    # Find the index of the most similar document
    most_similar_idx = similarities[0].argsort()[-1]

    # Return the most similar document and its similarity score
    return documents[most_similar_idx], similarities[0][most_similar_idx]

In [62]:
# Function for semantic search
def semantic_search(query, documents):
    # Encode the query and the documents
    query_vector = encode_text(query)
    doc_vectors = [encode_text(doc) for doc in documents]

    # Compute cosine similarities between query and documents
    similarities = cosine_similarity(query_vector, doc_vectors)

    # Find the index of the most similar document
    most_similar_idx = similarities[0].argsort()[-1]

    # Return the most similar document and its similarity score
    return documents[most_similar_idx], similarities[0][most_similar_idx]

In [50]:
import pandas as pd

# Your code for the PatentSearchEngine and InvertedIndex classes

# Instantiate the search engine
search_engine = PatentSearchEngine()

# Load your dataset (Replace 'your_dataset.csv' with your actual dataset)
data = pd.read_excel('/content/cleaned_data.xlsx')





In [64]:
pip install rasa

Collecting rasa
  Downloading rasa-3.6.15-py3-none-any.whl (837 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.9/837.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting CacheControl<0.13.0,>=0.12.9 (from rasa)
  Downloading CacheControl-0.12.14-py2.py3-none-any.whl (21 kB)
Collecting SQLAlchemy<1.5.0,>=1.4.0 (from rasa)
  Downloading SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting aio-pika<8.2.4,>=6.7.1 (from rasa)
  Downloading aio_pika-8.2.3-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.0/50.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiogram<2.26 (from rasa)
  Downloading aiogram-2.25.2-py3-none-any.whl (203 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20

In [65]:
from sentence_transformers import SentenceTransformer

# Load the BERT model. Various models trained on Natural Language Inference (NLI) https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/nli-models.md and
# Semantic Textual Similarity are available https://github.com/UKPLab/sentence-transformers/blob/master/docs/pretrained-models/sts-models.md

model = SentenceTransformer('bert-base-nli-mean-tokens')

.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [66]:
# A corpus is a list with documents split by sentences.

sentences = ['A system for autonomous navigation of an unmanned aerial vehicle.','An autonomous drone system uses an onboard command and control system for controlling operations of a drone without the need for a radio frequency controller or an external electronic device programming unit. The system uses a control unit that interacts with the drones unmanned aerial system flight controller. The control unit is programmed via an HMI button that is resident onboard the drone. Various sequences of HMI button depressions program the drone for its missions as well as command the drone to perform the missions. A microphone can be substituted for or can augment the HMI button. Various devices, such as a speaker, lights, a visual display screen, etc., can be resident on the drone for giving a user feedback during command and programming of the drone.','A drone comprising a camera and a controller. The camera is configured to output data representing an object within a field of view of the camera. The controller is configured to attempt to maintain a visual line of sight with the object. The controller is also configured to cause control equipment of an operator of the drone to notify the operator of the drone, visually, audibly and/or haptically, as to whether or not the object is being tracked by the drone.']
# Each sentence is encoded as a 1-D vector with 78 columns
sentence_embeddings = model.encode(sentences)

print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Sample BERT embedding vector - length 768
Sample BERT embedding vector - note includes negative values [-0.8632692   0.02647944  0.9582975   0.41382873 -0.24252455  0.03367572
  0.2080628   0.57209814 -0.6822097  -0.39670852 -0.11647936 -0.27050644
  0.35967052  0.41277134 -0.2492695  -0.28567168 -0.58445925 -0.20339395
  0.7745359   0.18665996 -0.8651751   0.28154778 -0.18893613 -0.773696
 -0.39783996 -0.29736754 -0.45500687  1.0028659  -0.13070048  0.44209343
 -0.36826918  0.0190572   0.46967202 -0.5518372  -0.62298876 -0.0039642
  0.01378072 -0.02520599  0.06200161 -0.22256923 -0.80077165 -0.99290097
  0.05263304 -0.12088043 -1.0184543  -0.3001689   0.20906639  0.96134824
  0.01926609 -0.17018005 -0.03401539 -0.19353597 -0.9392074  -0.6133583
 -0.20360161  0.7916613   0.47927266 -1.9414853  -0.2723232   1.5981148
 -0.9143059  -0.17601334  0.6899698   0.46401787 -0.01063456 -0.6511864
 -0.08903118 -0.55631363 -0.50279486 -0.89792603 -0.03657483 -0.29234248
 -0.6486917   1.0181196  -0

In [67]:
import pandas as pd
from collections import defaultdict

# Read data from an Excel file
# Replace 'your_dataset.xlsx' with the path to your Excel file
df = pd.read_excel('/content/cleaned_data.xlsx')

# Now 'df' contains your dataset from the Excel file

# Tokenize the query into keywords
query = input("Enter your query: ")
keywords = set(re.findall(r'\w+', query.lower()))  # Tokenize query into keywords as a set

# Calculate relevance score for each abstract based on keyword matches
abstract_scores = defaultdict(int)
for index, row in df.iterrows():
    abstract_keywords = set(re.findall(r'\w+', str(row['Abstract']).lower()))  # Tokenize Abstract into keywords as a set

    # Calculate relevance score by counting matching keywords
    relevance_score = len(keywords.intersection(abstract_keywords))

    if relevance_score > 0:
        abstract_scores[(row['Title'], row['Abstract'])] = relevance_score

# Select top 5 most relevant responses based on relevance score
selected_responses = sorted(abstract_scores, key=abstract_scores.get, reverse=True)[:1]

# Display the selected relevant responses
if selected_responses:
    print("Selected Relevant Responses:")
    for title, abstract in selected_responses:
        print(f"Title: {title}")
        print(f"Abstract: {abstract}")

        print()
else:
    print("No relevant responses found.")





Enter your query: drone
Selected Relevant Responses:
Title: AERIAL DRONE COMPANION DEVICE AND A METHOD OF OPERATING AN AERIAL DRONE COMPANION DEVICE
Abstract: A method of operating an aerial drone companion device includes detecting a first voice command spoken by a first user. The aerial drone companion device is autonomously oriented such that an image capture device faces the first user in response to detecting the first voice command. A second voice command spoken by the first user is detected while the image capture device faces the first user. The second voice command is transmitted from the aerial drone companion device to a computer located remotely from the aerial drone companion device. A task signal is received indicating a task to be performed. The task signal is generated by the computer based on the second voice command, and the task signal is transmitted by the computer and received by the aerial drone companion device. The method includes autonomously executing the task