# In [None]:
from typing import List, Tuple

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datasketch import MinHash, MinHashLSH


# Fetch the NLTK data packages requested by this file, in the original order:
# the 'punkt' and 'punkt_tab' tokenizer resources and the 'stopwords' corpus.
for _nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_resource)


def get_tokens(text):
    """Return the lowercase, alphanumeric, non-stop-word tokens of ``text``.

    The text is lowercased and split with NLTK's ``word_tokenize``; tokens
    that are English stop words or that are not purely alphanumeric (for
    example punctuation) are discarded. Token order and repeats are kept.
    """
    lowered_words = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))

    kept = []
    for word in lowered_words:
        if word in english_stops:
            continue
        if word.isalnum():
            kept.append(word)
    return kept

def compute_precision_from_jaccard(jaccard_similarity: float, len_a: int, len_b: int) -> float:
    """Return the overlap of two sets relative to the smaller one.

    The intersection size ``I`` is recovered from the Jaccard similarity
    ``J = I / (len_a + len_b - I)``, which rearranges to
    ``I = J * (len_a + len_b) / (1 + J)``. The result is ``I`` divided by
    ``min(len_a, len_b)``.

    Args:
        jaccard_similarity: Jaccard similarity of the two sets.
        len_a: Size of the first set.
        len_b: Size of the second set.

    Returns:
        The intersection size divided by the smaller set size, or ``0.0``
        when the smaller set is empty (nothing can overlap with it).
    """
    shorter_string_len = min(len_a, len_b)
    if shorter_string_len == 0:
        # An empty set shares no elements; avoid dividing by zero.
        return 0.0
    intersection = jaccard_similarity * (len_a + len_b) / (1 + jaccard_similarity)
    return intersection / shorter_string_len


def compute_recall_from_jaccard(jaccard_similarity: float, len_a: int, len_b: int) -> float:
    """Return the overlap of two sets relative to the larger one.

    The intersection size ``I`` is recovered from the Jaccard similarity
    ``J = I / (len_a + len_b - I)``, which rearranges to
    ``I = J * (len_a + len_b) / (1 + J)``. The result is ``I`` divided by
    ``max(len_a, len_b)``.

    Args:
        jaccard_similarity: Jaccard similarity of the two sets.
        len_a: Size of the first set.
        len_b: Size of the second set.

    Returns:
        The intersection size divided by the larger set size, or ``0.0``
        when both sets are empty.
    """
    longer_string_len = max(len_a, len_b)
    if longer_string_len == 0:
        # Both sets are empty; avoid dividing by zero.
        return 0.0
    intersection = jaccard_similarity * (len_a + len_b) / (1 + jaccard_similarity)
    return intersection / longer_string_len


def cosine_similarity(a: str, b: str) -> float:
    """Return the TF-IDF cosine similarity of two strings.

    Both strings are vectorized together with a fresh ``TfidfVectorizer``
    and the cosine similarity of the two resulting rows is computed. The
    score is also printed with four decimal places.

    Args:
        a: First text.
        b: Second text.

    Returns:
        The cosine similarity of the two TF-IDF vectors.
    """
    # This function shadows the module-level sklearn import of the same
    # name, so a plain `cosine_similarity(...)` call here would recurse into
    # this function with sparse rows instead of calling the library routine.
    # Import the library function under a distinct alias.
    from sklearn.metrics.pairwise import cosine_similarity as pairwise_cosine_similarity

    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([a, b])
    # The pairwise routine returns a 1x1 matrix for two single-row inputs.
    score = float(pairwise_cosine_similarity(tfidf[0], tfidf[1])[0][0])
    print(f"Cosine Similarity: {score:.4f}")
    return score

# Small sample corpus used to exercise the duplicate-detection code below.
texts = [
    "This is the first text about natural language processing.",
    "This is the second text. It also discusses natural language processing.",
    "Natural language processing is a field of computer science.",
    # Near-duplicate of the first entry (same sentence with extra trailing words).
    "This is the first text about natural language processing and stufffff.",
]

def minhash_duplicate_candidates(
    corpus: List[str],
    *,
    threshold: float = 0.8,
    num_perm: int = 128,
) -> List[Tuple[int, list]]:
    """Find likely near-duplicate documents in ``corpus`` using MinHash LSH.

    Each document is tokenized with ``get_tokens``, summarized as a MinHash
    signature, and inserted into a ``MinHashLSH`` index keyed by its position
    in ``corpus``. Every document is then queried against the index.

    Args:
        corpus: Documents to compare.
        threshold: Jaccard similarity threshold passed to ``MinHashLSH``.
        num_perm: Number of permutations used for both the index and each
            MinHash signature (the two must agree).

    Returns:
        A list of ``(index, matches)`` pairs, one for each document whose
        query returned more than one key. ``matches`` is the list of corpus
        indices returned by the LSH query for that document.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    for i, text in enumerate(corpus):
        m = MinHash(num_perm=num_perm)
        for token in get_tokens(text):
            m.update(token.encode('utf8'))
        lsh.insert(i, m)
        minhashes[i] = m

    # Find potential duplicates. A document's own key comes back from its
    # query because it is in the index, so only results with more than one
    # key indicate another candidate.
    potential_duplicates = []
    for i, m in minhashes.items():
        result = lsh.query(m)
        if len(result) > 1:
            potential_duplicates.append((i, result))
    return potential_duplicates


print(minhash_duplicate_candidates(texts))
