# KeyBERT Keyword Extractor

In [None]:
# Shared KeyBERT instance backed by the Arabic BERT checkpoint. It is created
# once at module level and read by extract_keywords_from_query_keybert.
kw_model = KeyBERT(model='asafaya/bert-base-arabic')

def extract_keywords_from_query_keybert(query, top_n=3):
    """
    Extract up to ``top_n`` keywords/keyphrases from ``query`` with KeyBERT.

    Args:
        query: The text to extract keyphrases from.
        top_n (int): Maximum number of keyphrases to request from KeyBERT.

    Returns:
        list: The extracted keyphrases in the order KeyBERT returned them,
        with the scores dropped.
    """
    # Candidates are 1- to 3-word n-grams; narrow the range if only
    # single-word keywords are wanted.
    scored_keywords = kw_model.extract_keywords(
        query,
        keyphrase_ngram_range=(1, 3),
        top_n=top_n,
    )
    # Each entry is a (keyword, score) pair; keep only the keyword.
    return [entry[0] for entry in scored_keywords]

# Demo: run the KeyBERT extractor on a sample Arabic question.
query = "ما تأثير الثقافة الرقمية على أساليب التواصل الاجتماعي؟"
extracted_term = extract_keywords_from_query_keybert(query=query, top_n=3)
print("Extracted Keywords:", extracted_term)

# RAKE Keyword Extractor

In [None]:
import string

# Arabic stopword list shipped with NLTK; printed once for inspection.
arabic_stopwords = stopwords.words('arabic')
print(arabic_stopwords)

# RAKE treats stopwords AND punctuation as phrase delimiters, but its default
# punctuation set is ASCII-only (string.punctuation). Without the Arabic marks
# below, a token such as "؟" stays inside the candidate phrase and prevents
# exact matches against phrases produced by other extractors.
arabic_punctuations = set(string.punctuation) | set("؟،؛")

# Initialize RAKE with the custom Arabic stopwords and punctuation.
r = Rake(stopwords=arabic_stopwords, punctuations=arabic_punctuations)

def extract_keywords_from_query_rake(query, top_n=3):
    """
    Extract up to ``top_n`` keyphrases from ``query`` using RAKE.

    Side effect: re-runs extraction on the shared module-level ``r`` object,
    replacing whatever phrases it held from a previous call.

    Args:
        query: The text to extract keyphrases from.
        top_n (int): Maximum number of phrases to return.

    Returns:
        list: The first ``top_n`` ranked phrases, with the scores dropped.
    """
    r.extract_keywords_from_text(query)
    ranked = r.get_ranked_phrases_with_scores()

    top_phrases = []
    # RAKE yields (score, phrase) pairs -- note the order is the reverse of
    # KeyBERT's (keyword, score).
    for _score, phrase in ranked[:top_n]:
        top_phrases.append(phrase)
    return top_phrases

# Demo: run the RAKE extractor on the same sample Arabic question.
query = "ما تأثير الثقافة الرقمية على أساليب التواصل الاجتماعي؟"
extracted_term = extract_keywords_from_query_rake(query=query, top_n=3)
print("Extracted Keywords (RAKE):", extracted_term)



## Combining KeyBERT and RAKE with a Voting Method


In [None]:
def normalize_arabic(text):
    """
    Lightly normalize a single Arabic token for comparison purposes.

    - Strips the punctuation characters "؟", "،", "." and "," from both ends.
    - Drops one leading "لل" or, failing that, one leading "ل".

    The prefix removal is purely textual: a word whose own first letter is
    "ل" loses that letter as well.

    Args:
        text (str): The token to normalize.

    Returns:
        str: The normalized token (possibly an empty string).
    """
    stripped = text.strip("؟،.,")
    # Longest prefix first so that "لل" is not reduced to a single "ل" cut.
    for prefix in ("لل", "ل"):
        if stripped.startswith(prefix):
            return stripped[len(prefix):]
    return stripped

def tokenize_arabic(phrase):
    """
    Split ``phrase`` on whitespace and normalize each token.

    Tokens that normalize to an empty string (for example a standalone "؟"
    or a lone "ل") are dropped, so callers that count token frequencies are
    not skewed by empty entries.

    Args:
        phrase (str): The phrase to tokenize.

    Returns:
        list of str: The non-empty normalized tokens, in their original order.
    """
    normalized_tokens = (normalize_arabic(token) for token in phrase.split())
    return [token for token in normalized_tokens if token]


# --- The Voting Function ---

def extract_keywords(
    query,
    top_n=3,
    similarity_threshold=70,
    excluded_terms=("المدن",),
):
    """
    Extract keywords using a voting system between KeyBERT and RAKE.

    Steps:
      1. Obtain up to 10 candidate phrases from each of KeyBERT and RAKE.
      2. Keep the phrases returned verbatim by both extractors, in KeyBERT's
         ranking order (so the result is deterministic).
      3. Tokenize and normalize the KeyBERT phrases and find the token that
         occurs in the largest number of phrases. It is used as the "core"
         keyword only if it occurs in at least two phrases.
      4. Take the remaining RAKE phrases, skipping any phrase that contains
         one of ``excluded_terms``.
      5. Concatenate core token, shared phrases and remaining RAKE phrases,
         drop duplicates, and truncate to ``top_n``.

    Args:
        query (str): The input query.
        top_n (int): Maximum number of keywords to return.
        similarity_threshold (int): Currently unused; accepted only so that
            existing callers that pass it keep working.
        excluded_terms (iterable of str or None): Substrings that disqualify
            a RAKE-only phrase in step 4. The default preserves the filter
            that was previously hard-coded.

    Returns:
        list of str: At most ``top_n`` keywords/keyphrases.
    """
    from collections import Counter

    # Normalize the exclusion list: None -> nothing excluded, a bare string
    # -> one term. Empty terms are dropped because "" is a substring of
    # every phrase and would exclude everything.
    if excluded_terms is None:
        excluded_terms = ()
    elif isinstance(excluded_terms, str):
        excluded_terms = (excluded_terms,)
    excluded_terms = tuple(term for term in excluded_terms if term)

    # Step 1: Get candidate phrases.
    keybert_keywords = extract_keywords_from_query_keybert(query, top_n=10)
    rake_keywords = extract_keywords_from_query_rake(query, top_n=10)

    # Step 2: Exact-match intersection, ordered by KeyBERT rank rather than
    # by set iteration order (which varies between interpreter runs).
    rake_phrases = set(rake_keywords)
    intersection = [
        phrase for phrase in dict.fromkeys(keybert_keywords)
        if phrase in rake_phrases
    ]
    shared_phrases = set(intersection)

    # Step 3: Count, for every token, the number of KeyBERT phrases that
    # contain it. Each token is counted once per phrase, and empty tokens
    # are ignored so they can never win the vote.
    phrase_counts = Counter()
    for phrase in keybert_keywords:
        tokens = [token for token in tokenize_arabic(phrase) if token]
        # Pass a list (not a dict) so Counter.update counts elements.
        phrase_counts.update(list(dict.fromkeys(tokens)))

    common_token = None
    if phrase_counts:
        candidate, count = phrase_counts.most_common(1)[0]
        if count >= 2:
            common_token = candidate

    # Step 4: RAKE-only phrases, minus those containing an excluded term.
    additional = [
        phrase for phrase in rake_keywords
        if phrase not in shared_phrases
        and not any(term in phrase for term in excluded_terms)
    ]

    # Step 5: Combine in priority order and de-duplicate, preserving order.
    ordered = ([common_token] if common_token else []) + intersection + additional
    final_candidates = list(dict.fromkeys(ordered))

    return final_candidates[:top_n]


# Demo: run the combined KeyBERT + RAKE voting extractor on the sample question.
query = "ما تأثير الثقافة الرقمية على أساليب التواصل الاجتماعي؟"
final_keywords = extract_keywords(query=query, top_n=5, similarity_threshold=70)
print("Final Combined Keywords (Voting):", final_keywords)

# Fetch Passages From Wikipedia

In [None]:
def fetch_passages_from_wikipedia(search_terms, limit=5, timeout=10):
    """
    Fetch plain-text page extracts from Arabic Wikipedia for a list of terms.

    For every search term, the MediaWiki search API is queried for up to
    ``limit`` pages, then each page's plain-text extract is downloaded. A page
    matched by several search terms is fetched and returned only once.

    Args:
        search_terms (list of str): Terms to search for in Arabic Wikipedia.
            A single string is treated as a one-element list.
        limit (int): The number of pages to retrieve per search term.
        timeout (float): Per-request timeout in seconds, so a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        list of str: The non-empty page extracts, in discovery order.

    Raises:
        requests.RequestException: On connection errors, timeouts, or HTTP
            error statuses.
    """
    # Wikipedia API endpoint for Arabic Wikipedia
    url = "https://ar.wikipedia.org/w/api.php"

    # A bare string would otherwise be iterated character by character.
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    all_passages = []
    seen_pageids = set()  # pages already fetched for an earlier term

    # NOTE(review): Wikimedia's API etiquette asks clients to send a
    # descriptive User-Agent; consider setting session.headers["User-Agent"]
    # with the project's contact details.
    with requests.Session() as session:  # reuse one connection for all calls
        for term in search_terms:
            if not term or not term.strip():
                continue  # an empty search string cannot match anything

            # Search for pages matching the current search term.
            search_params = {
                "action": "query",
                "format": "json",
                "list": "search",
                "srsearch": term,
                "srlimit": limit,
                "utf8": 1,
            }
            response = session.get(url, params=search_params, timeout=timeout)
            response.raise_for_status()
            search_results = response.json().get("query", {}).get("search", [])

            # For each new result, get the page extract (plain text).
            for result in search_results:
                pageid = result.get("pageid")
                if pageid is None or pageid in seen_pageids:
                    continue
                seen_pageids.add(pageid)

                extract_params = {
                    "action": "query",
                    "format": "json",
                    "prop": "extracts",
                    "explaintext": True,
                    "pageids": pageid,
                    "utf8": 1,
                }
                response_extract = session.get(
                    url, params=extract_params, timeout=timeout
                )
                response_extract.raise_for_status()
                pages = response_extract.json().get("query", {}).get("pages", {})
                # The API keys the "pages" mapping by the page id as a string.
                extract_text = pages.get(str(pageid), {}).get("extract", "")
                if extract_text:
                    all_passages.append(extract_text)

    return all_passages

# Loaded embedding models keyed by model name, so the (expensive) model load
# happens once per process instead of once per query.
_EMBEDDER_CACHE = {}


def _get_embedder(model_name='asafaya/bert-base-arabic'):
    """Return a cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDER_CACHE:
        _EMBEDDER_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDER_CACHE[model_name]


def retrieve_passages(query, k=3):
    """
    Retrieve the Wikipedia passages most similar to ``query``.

    Pipeline: extract keywords from the query, fetch matching page extracts
    from Arabic Wikipedia, embed them, build a temporary FAISS L2 index, and
    return the passages nearest to the query embedding.

    Args:
        query (str): The input query in Arabic.
        k (int): The maximum number of passages to retrieve.

    Returns:
        list of str: Up to ``k`` passages, nearest first. Fewer are returned
        when fewer passages were fetched; the list is empty when nothing was
        fetched or ``k`` is not positive.
    """
    search_terms = extract_keywords(query)
    passages = fetch_passages_from_wikipedia(search_terms, limit=7)

    print("Fetched passages from Wikipedia:")

    # With no passages the embedding array has no second dimension, so the
    # index cannot be built; there is nothing to retrieve anyway.
    if not passages or k <= 0:
        return []

    # ------------------------------------------------------------------
    # Build a FAISS index from the retrieved passages
    # ------------------------------------------------------------------
    embedder = _get_embedder()

    # FAISS expects float32 input; make the dtype explicit.
    passage_embeddings = np.asarray(
        embedder.encode(passages, convert_to_tensor=False), dtype="float32"
    )

    # Build a FAISS index using L2 distance
    dimension = passage_embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(passage_embeddings)

    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")

    # Asking for more neighbours than indexed vectors makes FAISS pad the
    # result with -1, which would silently select passages[-1]. Clamp k and
    # skip any negative index defensively.
    k = min(k, len(passages))
    _distances, indices = index.search(query_embedding, k)
    return [passages[idx] for idx in indices[0] if idx >= 0]

# ------------------------------------------------------------------
# Step 3: Retrieve Passages for a Given Query
# ------------------------------------------------------------------
# query = "ما تأثير الثقافة الرقمية على أساليب التواصل الاجتماعي؟"
# retrieved_passages = retrieve_passages(query, k=6)

# print("\nRetrieved passages for the query:")
# for i, passage in enumerate(retrieved_passages):
#     print(f"{i+1}: {passage[:200]}...")  # Display the first 200 characters for brevity
