In [5]:
pip install nltk wikipedia pandas scikit-learn tk rapidfuzz


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install ttkbootstrap


Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install PyQt5 rapidfuzz wikipedia-api scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [68]:
import ttkbootstrap as ttk
from ttkbootstrap.constants import *
from tkinter import simpledialog, messagebox
import pandas as pd
import wikipedia
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rapidfuzz import fuzz, process
import re


# Upper-case abbreviation -> full phrase. expand_abbreviations() looks up each
# query word here after upper-casing it, so keys must be written in upper case.
ABBREVIATIONS = {
    "AI": "Artificial Intelligence",
    "ML": "Machine Learning"
}


# Curated topics that seed the article corpus and receive special treatment
# during search. Each entry maps a display name to:
#   "aliases": spellings compared (case-insensitively) against the user's
#              query for spelling suggestions and direct-match results.
#   "url":     the Wikipedia URL shown for a direct match on that topic.
PRIORITY_TOPICS = {
    "Artificial Intelligence": {
        "aliases": ["AI", "Artificial Intelligence"],
        "url": "https://en.wikipedia.org/wiki/Artificial_intelligence"
    },
    "Machine Learning": {
        "aliases": ["ML", "Machine Learning"],
        "url": "https://en.wikipedia.org/wiki/Machine_learning"
    },
    "Data Science": {
        "aliases": ["Data Science"],
        "url": "https://en.wikipedia.org/wiki/Data_science"
    },
    "Data Mining": {
        "aliases": ["Data Mining"],
        "url": "https://en.wikipedia.org/wiki/Data_mining"
    },
    "Cloud Computing": {
        "aliases": ["Cloud Computing"],
        "url": "https://en.wikipedia.org/wiki/Cloud_computing"
    },
    "Cybersecurity": {
        "aliases": ["Cybersecurity", "Computer Security"],
        "url": "https://en.wikipedia.org/wiki/Computer_security"
    },
}


def get_articles_for_topic(topic, num_articles=10):
    """Fetch up to ``num_articles`` Wikipedia articles related to ``topic``.

    Args:
        topic: Search phrase passed to ``wikipedia.search``.
        num_articles: Maximum number of articles to return. Values <= 0
            yield an empty list without touching the network.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"`` and ``"content"``,
        in search-result order.
    """
    if num_articles <= 0:
        return []

    # Ask for a few spare results because disambiguation/missing pages are
    # skipped below.
    search_results = wikipedia.search(topic, results=num_articles + 5)
    articles = []
    for title in search_results:
        try:
            # Titles come straight from the search API, so they are already
            # exact; auto_suggest would only risk rewriting them to a
            # different page.
            page = wikipedia.page(title, auto_suggest=False)
            article = {"title": page.title, "url": page.url, "content": page.content}
        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
            continue
        articles.append(article)
        if len(articles) >= num_articles:
            break
    return articles


# Build the searchable corpus once at start-up: fetch articles for every
# priority topic and collect them into a single DataFrame.
topics = PRIORITY_TOPICS.keys()
articles = []
for topic in topics:
    articles.extend(get_articles_for_topic(topic))
# Naming the columns keeps the frame well-formed even when nothing was fetched.
articles_df = pd.DataFrame(articles, columns=['title', 'url', 'content'])

if articles_df.empty:
    # Fail with a clear message instead of an opaque error further down.
    raise RuntimeError("No Wikipedia articles could be fetched; cannot build the search index.")

# Flag articles whose title is a priority topic or one of its aliases.
# Wikipedia titles use sentence case ("Artificial intelligence"), so the
# comparison has to be case-insensitive to ever match the curated names.
_priority_titles = (
    {name.lower() for name in PRIORITY_TOPICS}
    | {alias.lower() for entry in PRIORITY_TOPICS.values() for alias in entry['aliases']}
)
articles_df['is_priority'] = articles_df['title'].str.strip().str.lower().isin(_priority_titles)


# TF-IDF index over the article bodies; search_articles() projects queries
# into this same vector space.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(articles_df['content'])


def expand_abbreviations(query):
    """Replace every known abbreviation in ``query`` with its full phrase.

    The query is split on whitespace; each token is looked up in
    ``ABBREVIATIONS`` by its upper-cased form and left untouched when no
    entry exists. Tokens are re-joined with single spaces.
    """
    expanded_tokens = []
    for token in query.split():
        replacement = ABBREVIATIONS.get(token.upper(), token)
        expanded_tokens.append(replacement)
    return " ".join(expanded_tokens)


def correct_spelling(query):
    """Suggest priority-topic aliases that closely resemble ``query``.

    Every alias in ``PRIORITY_TOPICS`` is scored against the query with
    ``fuzz.ratio`` (case-insensitively); aliases scoring above 80 are kept.

    Returns:
        A ``(best, suggestions)`` tuple. ``best`` is the highest-scoring
        alias and ``suggestions`` lists all kept aliases, best first. When
        nothing scores above the threshold the result is ``(query, [])``.
    """
    needle = query.lower()
    scored_aliases = []
    for entry in PRIORITY_TOPICS.values():
        for alias in entry['aliases']:
            score = fuzz.ratio(needle, alias.lower())
            if score > 80:
                scored_aliases.append((alias, score))

    if not scored_aliases:
        return query, []

    # Stable descending sort: ties keep their PRIORITY_TOPICS order.
    scored_aliases.sort(key=lambda pair: pair[1], reverse=True)
    ranked = [alias for alias, _ in scored_aliases]
    return ranked[0], ranked


def add_priority_articles(query, results):
    """Put direct-match priority topics ahead of the ranked ``results``.

    A topic is a direct match when ``query`` equals one of its aliases,
    ignoring case. Each match becomes a synthetic row (similarity 1.0)
    placed before ``results``; rows sharing both title and URL are then
    collapsed and the index is renumbered.
    """
    needle = query.lower()
    direct_matches = [
        {
            'title': topic,
            'url': data['url'],
            'content': f"Direct match for {topic}",
            'similarity': 1.0
        }
        for topic, data in PRIORITY_TOPICS.items()
        if needle in [alias.lower() for alias in data['aliases']]
    ]
    stacked = pd.concat([pd.DataFrame(direct_matches), results])
    deduplicated = stacked.drop_duplicates(subset=['title', 'url'])
    return deduplicated.reset_index(drop=True)


def remove_near_duplicates(results):
    """Drop rows whose normalised title or content was already seen.

    Titles and contents are compared after stripping surrounding whitespace
    and lower-casing. A row is kept only when BOTH its title and its content
    are new; values of dropped rows are not remembered.

    Args:
        results: DataFrame with string columns ``'title'`` and ``'content'``.

    Returns:
        A new DataFrame holding the surviving rows in their original order,
        with the original index labels, columns and dtypes (also when the
        input is empty).
    """
    seen_contents = set()
    seen_titles = set()
    kept_positions = []
    for position, (title, content) in enumerate(zip(results['title'], results['content'])):
        # Compare the normalised strings themselves rather than their hashes:
        # same cost, and no chance of a hash collision dropping a real row.
        content_key = content.strip().lower()
        title_key = title.strip().lower()
        if content_key in seen_contents or title_key in seen_titles:
            continue
        kept_positions.append(position)
        seen_contents.add(content_key)
        seen_titles.add(title_key)
    # Positional selection keeps columns/dtypes intact and is unaffected by
    # duplicate index labels.
    return results.iloc[kept_positions]


def search_articles(query, top_n=5, min_similarity=0.1):
    """Rank the fetched Wikipedia articles against ``query``.

    Pipeline: normalise the query, expand abbreviations, apply spelling
    correction against the priority aliases, score every article by TF-IDF
    cosine similarity plus fixed bonuses for literal title/content hits,
    then prepend direct priority matches and remove near-duplicates.

    Args:
        query: Raw user input. Wrapping it in double quotes uses the text
            between the quotes verbatim as the search phrase.
        top_n: Maximum number of rows to return.
        min_similarity: Articles scoring below this value are discarded.

    Returns:
        A ``(results, suggestions)`` tuple: ``results`` is a DataFrame with at
        least the columns ``title``, ``url``, ``content`` and ``similarity``
        (possibly empty); ``suggestions`` is the list of alias suggestions
        from ``correct_spelling``.

    Side effects:
        Overwrites the ``similarity`` column of the module-level
        ``articles_df``.
    """
    stripped_query = query.strip()
    original_query = stripped_query.lower()
    expanded_query = expand_abbreviations(original_query)
    corrected_query, suggestions = correct_spelling(original_query)

    # correct_spelling returns aliases in their original casing, so compare
    # case-insensitively; otherwise "ai" -> "AI" looks like a correction and
    # the abbreviation expansion is skipped.
    if corrected_query.lower() != original_query:
        query_to_use = expand_abbreviations(corrected_query)
    else:
        query_to_use = expanded_query

    # Exact-phrase mode: detect the quotes on the stripped input so stray
    # surrounding whitespace does not disable it.
    if len(stripped_query) >= 2 and stripped_query.startswith('"') and stripped_query.endswith('"'):
        query_to_use = stripped_query.strip('"')

    no_results = pd.DataFrame(columns=['title', 'url', 'content', 'similarity'])

    # An empty phrase is a substring of every title and content, which would
    # give every article the full bonus; treat it as "no results" instead.
    if not query_to_use.strip():
        return no_results, suggestions

    query_vector = tfidf_vectorizer.transform([query_to_use])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Literal-match bonuses on top of the TF-IDF score; the effective query
    # outranks the plain abbreviation-expanded one.
    primary_phrase = query_to_use.lower()
    expanded_phrase = expanded_query.lower()
    for i, (title, content) in enumerate(zip(articles_df['title'], articles_df['content'])):
        title_lower = title.lower()
        content_lower = content.lower()
        if primary_phrase in title_lower:
            cosine_similarities[i] += 3.0
        elif expanded_phrase in title_lower:
            cosine_similarities[i] += 2.0
        if primary_phrase in content_lower:
            cosine_similarities[i] += 1.5
        elif expanded_phrase in content_lower:
            cosine_similarities[i] += 1.0

    articles_df['similarity'] = cosine_similarities

    filtered_df = articles_df[articles_df['similarity'] >= min_similarity]

    if filtered_df.empty:
        return no_results, suggestions

    # Priority articles first, then by descending score.
    sorted_df = (
        filtered_df.sort_values(by=['is_priority', 'similarity'], ascending=[False, False])
        .drop_duplicates(subset=['title', 'url'])
        .reset_index(drop=True)
    )

    results_with_priority = add_priority_articles(query_to_use, sorted_df)
    results_with_priority = remove_near_duplicates(results_with_priority)

    # If de-duplication left fewer than top_n rows, top up with ranked
    # articles whose titles are not in the result yet.
    if len(results_with_priority) < top_n:
        additional_results = sorted_df.loc[~sorted_df['title'].isin(results_with_priority['title'])]
        results_with_priority = (
            pd.concat([results_with_priority, additional_results])
            .drop_duplicates(subset=['title', 'url'])
            .reset_index(drop=True)
        )

    return results_with_priority.head(top_n), suggestions


def display_placeholder():
    """Reset the results pane to the welcome / feature-overview message."""
    welcome_message = """
\n\u2728 Welcome to the Wikipedia Search System! \u2728\n
Features:
- Abbreviation Expansion
- Case Insensitivity
- Exact Phrase Matching (use "quotes")
- Intelligent Query Priority
- Spelling Correction with Suggestions
- Exact and Near-Duplicate Removal

Start by entering a topic to search for Wikipedia articles.
"""
    # Clear whatever is currently shown, then write the message from the top.
    start_index = '1.0'
    results_text.delete(start_index, "end")
    results_text.insert(start_index, welcome_message)


def on_search_button_click():
    """Prompt for a query, run the search and render the results pane.

    Shows an info dialog when no query is entered. Otherwise the pane is
    cleared and filled with an optional "Did you mean" hint followed by one
    title/URL entry per result, or a "no results" message.
    """
    query = simpledialog.askstring("Input", "Enter your search query:")
    if not query:
        messagebox.showinfo("Info", "No query entered. Please type a search query.")
        return

    results, suggestions = search_articles(query)

    results_text.delete('1.0', "end")

    # Compare case- and whitespace-insensitively so that typing a suggested
    # alias exactly (e.g. "AI" or "Machine Learning") does not trigger a
    # pointless "Did you mean" hint.
    normalized_query = query.strip().lower()
    if suggestions and normalized_query not in [s.lower() for s in suggestions]:
        suggestion_text = "Did you mean: " + ", ".join(suggestions) + "?\n\n"
        results_text.insert("1.0", suggestion_text)

    if results.empty:
        results_text.insert("end", "No results found. Please try another search term.\n")
        return

    for _, row in results.iterrows():
        title = row['title']
        url = row['url']
        results_text.insert("end", f"\U0001F4D1 {title}\n\U0001F517 {url}\n\n")


# --- GUI construction -------------------------------------------------------
# Main application window using ttkbootstrap's "darkly" theme.
root = ttk.Window(themename="darkly")
root.title("Wikipedia Search System")

# Heading shown at the top of the window.
label = ttk.Label(root, text="Wikipedia Article Search", font=("Helvetica", 20), bootstyle=INFO)
label.pack(pady=10)

# The button opens the query dialog and runs the search via its callback.
search_button = ttk.Button(root, text="Search Articles \U0001F50D", bootstyle=SUCCESS, command=on_search_button_click)
search_button.pack(pady=5)

# Output pane; display_placeholder() and on_search_button_click() write to
# this widget through the module-level name `results_text`.
results_text = ttk.Text(root, height=20, width=80, font=("Consolas", 12))
results_text.pack(padx=10, pady=10)

# Show the welcome text before the first search.
display_placeholder()

# Start the Tk event loop.
root.mainloop()




  lis = BeautifulSoup(html).find_all('li')
