In [1]:
import pandas as pd
import requests

In [2]:
def similarity_score(name, title):
    """Return the word-overlap ratio between a search query and a page title.

    Both strings are lower-cased, stripped of periods, and split on
    whitespace into sets of words. The score is the number of words the two
    sets share divided by the size of the larger set, so it runs from 0.0
    (no shared words) to 1.0 (identical word sets).

    Args:
        name: The search query text.
        title: The title text to compare against.

    Returns:
        A float between 0.0 and 1.0. When neither input contains any words
        (for example two empty strings), 0.0 is returned instead of raising
        ZeroDivisionError.
    """
    name_words = set(name.lower().replace('.', '').split())
    title_words = set(title.lower().replace('.', '').split())

    # Size of the larger word set is the denominator; it is zero only when
    # both inputs are empty after normalisation.
    larger = max(len(name_words), len(title_words))
    if larger == 0:
        return 0.0

    return len(name_words & title_words) / larger

In [3]:
def get_wikipedia_url(name, threshold=0.5, timeout=10):
    """Look up the English Wikipedia article that best matches ``name``.

    Sends ``name`` to the MediaWiki search endpoint and inspects only the
    first search hit. The hit is accepted when
    ``similarity_score(name, hit_title)`` is strictly greater than
    ``threshold``.

    Args:
        name: Text to search for. Non-string values (such as ``None`` or a
            pandas NaN) and blank strings are treated as "not found" without
            contacting the API.
        threshold: Exclusive lower bound on the similarity score required to
            accept the first hit. Defaults to 0.5.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 10.

    Returns:
        ``"https://en.wikipedia.org/wiki/<title>"`` with spaces in the title
        replaced by underscores when the first hit is accepted, otherwise the
        string ``"No Wikipedia page found"``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    not_found = "No Wikipedia page found"

    # Nothing sensible can be searched for a missing or blank name, and a
    # non-string value would break the .lower() call in similarity_score.
    if not isinstance(name, str) or not name.strip():
        return not_found

    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": name
    }
    # Bound the wait so one stalled connection cannot hang a whole batch.
    response = requests.get(endpoint, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Tolerate a response that has no "query"/"search" section instead of
    # raising KeyError; such a response is treated as "no results".
    results = data.get("query", {}).get("search", [])
    if not results:
        return not_found

    search_result_title = results[0]["title"]

    # Accept the hit only if it shares enough words with the query.
    if similarity_score(name, search_result_title) > threshold:
        title = search_result_title.replace(' ', '_')
        return f"https://en.wikipedia.org/wiki/{title}"

    return not_found

In [4]:
# Read the input table from disk into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='wiki_with_code.csv')

In [5]:
# Retry the Wikipedia lookup for rows whose Status_Code is 404.
# This is a single pass over at most BATCH_SIZE such rows (the first ones in
# file order); it does not loop until every 404 row has been handled.
BATCH_SIZE = 50
NOT_FOUND = "No Wikipedia page found"  # sentinel string returned by get_wikipedia_url

df_to_process = df[df['Status_Code'] == 404.0].head(BATCH_SIZE)

for index, row in df_to_process.iterrows():
    new_url = get_wikipedia_url(row['NAME'])
    if new_url == NOT_FOUND:
        continue

    print(new_url)
    # Keep everything after "/wiki/" rather than the last "/"-separated
    # piece, so a page title that itself contains "/" is stored whole.
    df.at[index, 'WIKI_PAGE'] = new_url.split('/wiki/', 1)[-1]
    # NOTE(review): the new URL is not fetched here; the status is set to 200
    # on the strength of the search hit alone.
    df.at[index, 'Status_Code'] = 200.0


https://en.wikipedia.org/wiki/Lorenzo_Zambrano
https://en.wikipedia.org/wiki/Alfred_Zeien
https://en.wikipedia.org/wiki/Annette_Ziegler
https://en.wikipedia.org/wiki/John_Bosley_Ziegler
https://en.wikipedia.org/wiki/Michael_Zimmerman_(jurist)
https://en.wikipedia.org/wiki/Mark_Zupan
https://en.wikipedia.org/wiki/Thomas_Francis_Wade
https://en.wikipedia.org/wiki/Anthony_Wagner
https://en.wikipedia.org/wiki/G._Harold_Wagner
https://en.wikipedia.org/wiki/Marcelle_Wahba


In [None]:
# Write the current contents of `df` to a new CSV file, omitting the row index.
df.to_csv(path_or_buf='wiki_with_validated_links.csv', index=False)