## Try to Figure Out a Better URL

Original `Status_Code` counts:

```
Status_Code
200.0    35645
404.0     4686
Name: count, dtype: int64
```

In [1]:
import pandas as pd
import requests

In [2]:
def similarity_score(name, title):
    """Return the word-overlap ratio between ``name`` and ``title``.

    Both strings are lower-cased, stripped of ``.`` characters and split on
    whitespace into sets of words. The score is the number of shared words
    divided by the size of the larger word set, so it lies in ``[0, 1]``.

    Args:
        name: The search query string.
        title: The candidate page title.

    Returns:
        float: Overlap ratio; ``0.0`` when neither string contains any word
        (instead of raising ``ZeroDivisionError``).
    """
    name_words = set(name.lower().replace('.', '').split())
    title_words = set(title.lower().replace('.', '').split())

    # Both sets are empty for inputs such as "" or "..."; nothing can match.
    larger_size = max(len(name_words), len(title_words))
    if larger_size == 0:
        return 0.0

    return len(name_words & title_words) / larger_size

In [3]:
def get_wikipedia_url(name, threshold=0.5, timeout=10):
    """Search English Wikipedia for ``name`` and return the top hit's URL.

    Only the first search result is considered. It is accepted when its
    word-overlap score against ``name`` (see ``similarity_score``) is
    strictly greater than ``threshold``.

    Args:
        name: Text to search for. Non-string values (e.g. NaN from a blank
            CSV cell) and blank strings are treated as "not found" without
            calling the API.
        threshold: Minimum similarity (exclusive) for accepting the top hit.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        str: ``https://en.wikipedia.org/wiki/<Title_with_underscores>`` for an
        accepted hit, otherwise the sentinel ``"No Wikipedia page found"``.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout or
            a non-success HTTP status.
    """
    not_found = "No Wikipedia page found"

    # A non-string would be searched as its repr (e.g. "nan") and then crash
    # in similarity_score; an empty query makes the API return an error.
    if not isinstance(name, str) or not name.strip():
        return not_found

    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": name
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(endpoint, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Error payloads carry no "query" key; treat them like an empty result.
    search_results = data.get("query", {}).get("search", [])
    if not search_results:
        return not_found

    search_result_title = search_results[0]["title"]

    # Calculate similarity score between search query and result title
    score = similarity_score(name, search_result_title)
    if score > threshold:
        title = search_result_title.replace(' ', '_')
        return f"https://en.wikipedia.org/wiki/{title}"
    return not_found

In [4]:
# Read the results file into a DataFrame.
csv_path = 'wiki_with_second_pass.csv'
df = pd.read_csv(csv_path)

In [5]:
# Rows per status code as loaded, before retrying the 404s.
status_counts_before = df['Status_Code'].value_counts()
status_counts_before

Status_Code
200.0    35803
404.0     4528
Name: count, dtype: int64

In [6]:
# Retry every row still marked 404 in a single pass: search Wikipedia for the
# row's NAME and, when a sufficiently similar page is found, store its title
# and mark the row as 200.
# (Append .head(1000) to the filter below to try the loop on a subset first.)
df_to_process = df[df['Status_Code'] == 404.0]

for index, row in df_to_process.iterrows():
    new_url = get_wikipedia_url(row['NAME'])
    if new_url != "No Wikipedia page found":
        # Keep everything after "/wiki/": splitting on every "/" would
        # truncate titles that contain a slash (e.g. "AC/DC" -> "DC").
        df.at[index, 'WIKI_PAGE'] = new_url.split('/wiki/', 1)[-1]
        df.at[index, 'Status_Code'] = 200.0  # Update status code if page is found


In [7]:
# Write the updated DataFrame to CSV without the index column.
# Note: this overwrites the same file the notebook loaded from.
output_csv = 'wiki_with_second_pass.csv'
df.to_csv(output_csv, index=False)

In [8]:
# Rows per status code after the retry pass.
status_counts_after = df['Status_Code'].value_counts()
status_counts_after

Status_Code
200.0    36423
404.0     3908
Name: count, dtype: int64

In [9]:
# Tally the status codes once and read both counts from the same Series.
status_counts = df['Status_Code'].value_counts()
count_200 = status_counts.get(200, 0)
count_404 = status_counts.get(404, 0)

# Percentage of rows still failing (404) out of all 200/404 rows.
total_checked = count_404 + count_200
round(count_404 / total_checked * 100, 1)

9.7