In [1]:
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# load pre-trained model
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# define corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# encode corpus into vectors
corpus_vectors = [nlp(doc).vector for doc in corpus]

# get user input
user_query = input("Enter your search query: ")

# encode user query into a vector
query_vector = nlp(user_query).vector

# compute similarity scores between query and corpus vectors
similarity_scores = cosine_similarity([query_vector], corpus_vectors)[0]

# get indices of top 5 most similar documents
top_indices = similarity_scores.argsort()[-5:][::-1]

# print top 5 most similar documents and their scores
for i, index in enumerate(top_indices):
    print(f"Result {i+1}: '{corpus[index]}'\nScore: {similarity_scores[index]:.4f}\n")
    
most_similar_index = similarity_scores.argmax()

# Print the most similar document
print(corpus[most_similar_index])


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Enter your search query: lauda
Result 1: 'This document is the second document.'
Score: 0.2796

Result 2: 'Is this the first document?'
Score: 0.2341

Result 3: 'This is the first document.'
Score: 0.2269

Result 4: 'And this is the third one.'
Score: 0.1302

This document is the second document.


In [None]:
from transformers import AutoTokenizer, AutoModel

# load pre-trained BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# define a function to encode a preprocessed document as a vector
def encode_document(document):
    # tokenize the preprocessed document
    tokens = tokenizer(document, padding=True, truncation=True, return_tensors="pt")
    # encode the tokens using the pre-trained BERT model
    with torch.no_grad():
        output = model(**tokens)
        vector = output.pooler_output
    return vector

# encode each preprocessed document as a vector and add it to the Pinecone index
for i, preprocessed_document in enumerate(preprocessed_documents):
    vector = encode_document(preprocessed_document)
    pinecone.upsert_index(index_name, [i], [vector])


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import pinecone

# initialize Pinecone client
pinecone.init(api_key="YOUR_API_KEY")

# create a Pinecone index for text documents
index_name = "text_index"
pinecone.create_index(index_name, {"metric": "cosine"})

# define a list of documents to index
documents = [
    "The quick brown fox jumps over the lazy dog",
    "A quick brown dog jumps over the lazy cat",
    "The lazy dog is quick to bark but slow to bite",
    "The cat is lazy and likes to sleep all day",
]

# define a function to preprocess a document (i.e., tokenize and remove stop words)
def preprocess_document(document):
    stop_words = {"the", "a", "an", "is", "and", "but", "to"}
    tokens = document.lower().split()
    return [token for token in tokens if token not in stop_words]

# preprocess each document and store the preprocessed documents in a list
preprocessed_documents = [preprocess_document(document) for document in documents]

# define a function to encode a preprocessed document as a vector
def encode_document(document):
    # TODO: use a pre-trained language model to encode the preprocessed document as a vector
    return vector

# encode each preprocessed document as a vector and add it to the Pinecone index
for i, preprocessed_document in enumerate(preprocessed_documents):
    vector = encode_document(preprocessed_document)
    pinecone.upsert_index(index_name, [i], [vector])

# define a function to search for documents containing a query
def search_documents(query, n_results=10):
    # preprocess the query
    preprocessed_query = preprocess_document(query)
    # encode the preprocessed query as a vector
    query_vector = encode_document(preprocessed_query)
    # search the Pinecone index for similar vectors
    results = pinecone.query(index_name, [query_vector], top_k=n_results)
    # return the indices of the top-k most similar documents
    return results.ids[0]

# example usage: search for documents containing the word "lazy"
similar_documents = search_documents("lazy")
print(similar_documents)


In [None]:
def similarity_search(query):
    # preprocess the query and encode it as a vector
    preprocessed_query = preprocess_query(query)
    query_vector = encode_query(preprocessed_query)

    # perform the similarity search using Pinecone
    search_results = pinecone_index.query(query_vector, top_k=10)

    # return the most similar documents
    similar_documents = []
    for result in search_results:
        document_index = result.id
        document_score = result.score
        similar_document = { "The quick brown fox jumps over the lazy dog",
    "A quick brown dog jumps over the lazy cat",
    "The lazy dog is quick to bark but slow to bite",
    "The cat is lazy and likes to sleep all day"} # get the actual document from the index using the index id
        similar_documents.append(similar_document)
    return similar_documents

In [None]:
import pinecone

# initialize Pinecone client
pinecone.init(api_key="YOUR_API_KEY")

# create a Pinecone index for text documents
index_name = "text_index"
pinecone.create_index(index_name, {"metric": "cosine"})

# define a list of documents to index
documents = [
    "The quick brown fox jumps over the lazy dog",
    "A quick brown dog jumps over the lazy cat",
    "The lazy dog is quick to bark but slow to bite",
    "The cat is lazy and likes to sleep all day",
]

# define a function to preprocess a document (i.e., tokenize and remove stop words)
def preprocess_document(document):
    stop_words = {"the", "a", "an", "is", "and", "but", "to"}
    tokens = document.lower().split()
    return [token for token in tokens if token not in stop_words]

# preprocess each document and store the preprocessed documents in a list
preprocessed_documents = [preprocess_document(document) for document in documents]

# define a function to encode a preprocessed document as a vector
def encode_document(document):
    return vector

# encode each preprocessed document as a vector and add it to the Pinecone index
for i, preprocessed_document in enumerate(preprocessed_documents):
    vector = encode_document(preprocessed_document)
    pinecone.upsert_index(index_name, [i], [vector])

# define a function to search for documents containing a query
def search_documents(query, n_results=10):
    # preprocess the query
    preprocessed_query = preprocess_document(query)
    # encode the preprocessed query as a vector
    query_vector = encode_document(preprocessed_query)
    # search the Pinecone index for similar vectors
    results = pinecone.query(index_name, [query_vector], top_k=n_results)
    # return the indices of the top-k most similar documents
    return results.ids[0]

# get user input
user_query = input("Enter your search query: ")

# encode user query into a vector
query_vector = nlp(user_query).vector
similar_documents = search_documents(user_query)
print(similar_documents)


In [6]:
import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from enchant.checker import SpellChecker
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
import re
#import polyglot
#from polyglot.detect import Detector
#from polyglot.text import Text
#from pyxdameraulevenshtein import damerau_levenshtein_distance

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
documents = [
    "The quick brown fox jumps over the lazy dog",
    "A quick brown dog jumps over the lazy cat",
    "The lazy dog is quick to bark but slow to bite",
    "The cat is lazy and likes to sleep all day",
]
def preprocess_document(document):
    stop_words = {"the", "a", "an", "is", "and", "but", "to"}
    tokens = document.lower().split()
    return [token for token in tokens if token not in stop_words]

# preprocess each document and store the preprocessed documents in a list
preprocessed_documents = [preprocess_document(document) for document in documents]

# Define paths to documents and stop words file
DOCUMENTS_PATH = "documents.txt"
STOPWORDS_PATH = "stopwords.txt"

# Read documents from file
with open('C:/Users/abhis/OneDrive/Desktop/AI semanitc search/documents.txt', "r") as f:
    documents = f.readlines()

# Read stop words from file
with open('C:/Users/abhis/OneDrive/Desktop/AI semanitc search/stopwords.txt', "r") as f:
    stop_words = set([line.strip() for line in f.readlines()])

# Tokenize and preprocess documents
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
preprocessed_docs = []
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
for doc in documents:
    # Tokenize words
    words = word_tokenize(doc)
    # Remove stop words and lemmatize or stem remaining words
    words = [word.lower() for word in words if not word in stop_words]
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    words = [stemmer.stem(word) for word in words]
    # Join words back into a single string
    preprocessed_docs.append(' '.join(words))

# Build TF-IDF vectorizer and calculate document similarity matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_docs)
similarity_matrix = cosine_similarity(tfidf_matrix)

# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")

# Build dictionary and corpus for LDA topic modeling
dictionary = Dictionary([doc.split() for doc in preprocessed_docs])
corpus = [dictionary.doc2bow(doc.split()) for doc in preprocessed_docs]

# Train LDA model and get topic probabilities for each document
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)
topic_probabilities = [lda_model.get_document_topics(corpus[i]) for i in range(len(corpus))]



# Function to perform semantic search
def semantic_search(query, documents, similarity_matrix, vectorizer, include_score=True, language='en', max_distance=1):
    # Detect query language and translate if necessary
    if language != 'en':
        text = Text(query, hint_language_code=language)
        query = text.translate('en').string.lower()
    # Preprocess query
    words = word_tokenize(query)
    # Check spelling and correct if necessary
    chkr = SpellChecker("en_US")
    for word in words:
        chkr.set_text(word)
        if not chkr.check():
            for suggestion in chkr.suggest():
                if suggestion in wordnet.words():
                    query = query.replace(word, suggestion)
                    break
    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words if not word in stop_words]
    # Expand query using synonyms
    expanded_query = []


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\abhis/nltk_data'
    - 'C:\\python39\\nltk_data'
    - 'C:\\python39\\share\\nltk_data'
    - 'C:\\python39\\lib\\nltk_data'
    - 'C:\\Users\\abhis\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
