<a href="https://colab.research.google.com/github/azizdhaoui/-Document-Similarity-Retrieval-System-/blob/main/KNNDoc_Document_Similarity_Retrieval_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import nltk
# Fetch the NLTK data packages requested by this notebook (stop-word lists,
# WordNet, and the Punkt tokenizer data) before any text processing runs.
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)



def preprocess_text(text):
    """Normalize raw text into a space-separated string of processed tokens.

    The pipeline lowercases the text, tokenizes it with ``word_tokenize``,
    drops tokens that consist only of punctuation characters, drops English
    stop words, then applies Porter stemming followed by WordNet
    lemmatization to every remaining token.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())

    # Check punctuation per character. `token in string.punctuation` is a
    # substring test against one long string, so multi-character punctuation
    # tokens (e.g. "..." or "--") were never filtered out.
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Filter first, then stem, then lemmatize -- same order as before, done
    # in a single pass over the tokens.
    processed = [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if not all(char in punctuation for char in token)
        and token not in stop_words
    ]
    return ' '.join(processed)

def KNNDoc(X, corpus, k, mesureSim='cosine', verbose=True):
    """Return the ``k`` documents of ``corpus`` most similar to the query ``X``.

    The query and every corpus document are run through ``preprocess_text``
    and vectorized together with a single ``TfidfVectorizer`` (so the query
    contributes to the fitted vocabulary and IDF weights). Documents are then
    ranked by similarity to the query, highest first.

    Args:
        X: The query text.
        corpus: An iterable of document strings to search.
        k: Maximum number of documents to return; must be non-negative.
        mesureSim: Similarity measure name. Only ``'cosine'`` is supported.
        verbose: When true (the default, kept for backward compatibility),
            print the TF-IDF matrix, its shape, and the similarity scores.

    Returns:
        A list of at most ``k`` original (unprocessed) documents from
        ``corpus``, ordered from most to least similar to ``X``.

    Raises:
        ValueError: If ``mesureSim`` is not ``'cosine'`` or ``k`` is negative.
    """
    # Validate arguments up front so bad input fails before any
    # preprocessing or vectorization work is done.
    if mesureSim != 'cosine':
        raise ValueError("Invalid similarity measure. Supported measures: 'cosine'.")
    if k < 0:
        # A negative k would silently slice off the *tail* of the ranking.
        raise ValueError("k must be a non-negative integer.")

    # Materialize once so any iterable works and indexing below is safe.
    documents = list(corpus)
    if not documents:
        return []

    X_processed = preprocess_text(X)
    corpus_processed = [preprocess_text(doc) for doc in documents]

    # Row 0 of the matrix is the query; rows 1..n are the corpus documents.
    vectorizer = TfidfVectorizer()
    X_rep = vectorizer.fit_transform([X_processed] + corpus_processed)
    if verbose:
        print(X_rep)
        print(X_rep.shape)

    # Compare the query row against the corpus rows only, instead of
    # building the full (n+1) x (n+1) pairwise matrix and discarding it.
    similarities = cosine_similarity(X_rep[:1], X_rep[1:])[0]
    if verbose:
        print()
        print()
        print(similarities)

    # Indices of the k highest similarities, best match first.
    top_k_indices = np.argsort(similarities)[::-1][:k]
    return [documents[i] for i in top_k_indices]

# Demo: look up the documents closest to a sample question.
k = 2
X = "How to learn Python?"
corpus = [
    "I want to learn Python programming.",
    "Python is a great programming language.",
    "Where can I find Python tutorials?",
    "Learning Python is easy and fun.",
]

# Rank the corpus against the query and report the best k matches.
result = KNNDoc(X, corpus, k=k)
print("Top", k, "documents similar to : ", X , " are", result)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


  (0, 5)	0.8148024746671689
  (0, 7)	0.5797386715376657
  (1, 5)	0.43907380594868767
  (1, 7)	0.31240462919759415
  (1, 9)	0.6556161938708753
  (1, 6)	0.5289470171197818
  (2, 7)	0.2808823162882302
  (2, 6)	0.47557510189256375
  (2, 3)	0.5894630806320427
  (2, 4)	0.5894630806320427
  (3, 7)	0.3193023297639811
  (3, 1)	0.6700917930430479
  (3, 8)	0.6700917930430479
  (4, 5)	0.4094299535661046
  (4, 7)	0.2913127840770238
  (4, 0)	0.6113525885101619
  (4, 2)	0.6113525885101619
(5, 10)


[0.53887147 0.16283834 0.18511191 0.50248983]
Top 2 documents similar to :  How to learn Python?  are ['I want to learn Python programming.', 'Learning Python is easy and fun.']
