In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

In [2]:
# Sample list of articles
articles = ["Mastercard unveils international payment tools", 
            "JPMorgan Chase boss Dimon hails 'groundbreaking' AI", 
            "Visa partners PayPal for interoperable P2P payments",
            "JPMorgan Chase looks to quantum tech for deep hedging",
            "Wells Fargo to Participate in Nacha’s Smarter Faster Payments Conference",
            "Visa partners PayPal for interoperable P2P payments",
            "Visa and Partners Bring Interoperability to Digital Person-to-Person Payments",
            "Visa partners with PayPal, Venmo, and others to power interoperable digital payments",
            "Mastercard's Cross-Border Services Express Helps Modernize International Payments",
            "Mastercard makes sustainable card pledge.",
            "BMO Launches Industry Leading Digital Pre-Arrival Account Opening Capability for Newcomers to Canada"
           ]

In [3]:
# Define function to preprocess and tokenize text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if not token in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into string
    text = " ".join(tokens)
    return text

In [4]:
# Define function to compare articles
def compare_articles(articles):
    # Create dictionary to store article similarities
    similarities = defaultdict(set)
    # Loop through each article
    for i, article1 in enumerate(articles):
        # Preprocess and tokenize article
        article1_tokens = set(preprocess_text(article1).split())
        # Compare to other articles
        for j in range(i+1, len(articles)):
            article2_tokens = set(preprocess_text(articles[j]).split())
            # Calculate similarity using Jaccard similarity coefficient
            similarity = len(article1_tokens.intersection(article2_tokens)) / len(article1_tokens.union(article2_tokens))
            # Add articles to dictionary if similarity threshold is met
            print(similarity)
            if similarity > 0.2:
                similarities[article1].add(articles[j])
    return similarities

In [5]:
# Remove duplicates from list of articles
def remove_duplicates(articles):
    # Get similarities between articles
    similarities = compare_articles(articles)
    # Create set of duplicates
    duplicates = set()
    for key, values in similarities.items():
        for value in values:
            duplicates.add(value)
    # Remove duplicates from list of articles
    unique_articles = [article for article in articles if article not in duplicates]
    return unique_articles

In [6]:
# Call remove_duplicates function
unique_articles = remove_duplicates(articles)


0.0
0.1
0.0
0.07692307692307693
0.1
0.09090909090909091
0.07142857142857142
0.2727272727272727
0.1
0.0
0.0
0.15384615384615385
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.07142857142857142
1.0
0.3
0.45454545454545453
0.07142857142857142
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.07142857142857142
0.06666666666666667
0.05555555555555555
0.058823529411764705
0.0
0.0
0.3
0.45454545454545453
0.07142857142857142
0.0
0.0
0.3076923076923077
0.06666666666666667
0.0
0.058823529411764705
0.05555555555555555
0.0
0.05
0.07142857142857142
0.0
0.0


In [8]:
unique_articles

['Mastercard unveils international payment tools',
 "JPMorgan Chase boss Dimon hails 'groundbreaking' AI",
 'JPMorgan Chase looks to quantum tech for deep hedging',
 'Wells Fargo to Participate in Nacha’s Smarter Faster Payments Conference',
 'Mastercard makes sustainable card pledge.',
 'BMO Launches Industry Leading Digital Pre-Arrival Account Opening Capability for Newcomers to Canada']