# Creating a Table of Name/Nickname Relationships from a URL
---

### Imports


In [208]:
import sys
from bs4 import BeautifulSoup
import requests
import pandas as pd

### The best URL I was able to find with a decent number of nicknames in a parsable format
[Website Used](https://www.familysearch.org/en/wiki/Traditional_Nicknames_in_Old_Documents_-_A_Wiki_List)

In [4]:
# Address of the FamilySearch wiki page whose nickname list this notebook downloads and parses.
static_url = "https://www.familysearch.org/en/wiki/Traditional_Nicknames_in_Old_Documents_-_A_Wiki_List"

In [8]:
# Download the page HTML. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP error
# instead of silently handing an error page to the parser.
response = requests.get(static_url, timeout=30)
response.raise_for_status()
data = response.text

In [9]:
# Parse the downloaded HTML into a BeautifulSoup tree using the 'html5lib' parser.
soup = BeautifulSoup(data, 'html5lib')

In [81]:
# Sanity check: display the page's <title> element to confirm the expected page was fetched and parsed.
soup.title

<title>Traditional Nicknames in Old Documents - A Wiki List • FamilySearch</title>

In [42]:
# Collect every <p> element on the page.
# NOTE(review): html_data is not referenced by any later cell visible in this notebook — confirm whether it is still needed.
html_data = soup.find_all(name='p')

### Find out where "name = nickname" begins and ends
We'll loop through all of the strings in the soup object that contain `=` and print each one with its index, to find where the name/nickname entries start and end.

In [82]:
# Keep only the text nodes that contain '=', then print each one under its list
# index so the first and last "nickname = formal name" entries can be located by eye.
lines = [s for s in soup.strings if '=' in s]
for k, line in enumerate(lines):
    # Every element of `lines` already contains '=', so no second check is needed.
    print(k)
    print(line)

0
document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"525d2d8845237f130e988178","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Traditional_Nicknames_in_Old_Documents_-_A_Wiki_List","wgTitle":"Traditional Nicknames in Old Documents - A Wiki List","wgCurRevisionId":4288292,"wgRevisionId":4288292,"wgArticleId":2987,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Encyclopedias and Dictionaries","Beginners","Personal Names"],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Traditional_Nicknames_in_Old_Documents_-_A_Wiki_List","wgRelevantArticleId":2987,
"wgIsProbablyEditable":!

### Names for Nicknames
Our desired information starts at index 5 and ends at index 801

#### Note: While the above filter got us close, we would still be extracting the "Nickname = Formal Name" header that appears at the start of each new letter
We will filter those out with some simple Boolean logic.

In [161]:
# Build a {formal name: nickname(s)} lookup from the '='-separated entries in `lines`.
# An entry reads "nickname = formal name" (sometimes with several '='-separated
# nicknames); the last field becomes the key and the fields before it are stored
# as one ', '-joined string.
# NOTE(review): plain assignment means a formal name that appears on several
# lines keeps only the value from its last line -- confirm that is intended.

# Inclusive index window of the name entries, read off the numbered printout of `lines`.
FIRST_ENTRY_INDEX = 5
LAST_ENTRY_INDEX = 801

# Entries containing any of these markers are skipped: the repeated column header
# and the 'Butch' entry, which is too irregular to split cleanly.
SKIP_MARKERS = ("Nickname = Formal Name", "Butch")

# Substrings deleted from each entry before splitting. Order matters: spaces are
# removed first, so the parenthetical description is matched in its space-free form.
filters = [' ', '\xa0', '\n', '(orusedasanameforthebabyofthefamily)', '(f)']

# Fail with a clear message (rather than a bare IndexError) if the page layout changed.
assert len(lines) > LAST_ENTRY_INDEX, "fewer '=' lines than expected; re-check the entry indices"

name_dict = {}
for entry in lines[FIRST_ENTRY_INDEX:LAST_ENTRY_INDEX + 1]:
    if any(marker in entry for marker in SKIP_MARKERS):
        continue
    for f in filters:
        entry = entry.replace(f, '')
    # Everything before the final '=' is a nickname; the final field is the formal name.
    *nicknames, formal_name = entry.split('=')
    name_dict[formal_name] = ', '.join(nicknames)


In [162]:
name_dict

{'Acera': 'Acer',
 'Adeline': 'Ada',
 'Adelaide': 'Addie',
 'Agatha': 'Agnes',
 'Agnes': 'Senga',
 'Nancy': 'Nanny',
 'Inez': 'Agnes',
 'Alan': 'Al',
 'Albert': 'Bertie',
 'Alexander': 'Xander',
 'Alfred': 'Alf',
 'Allen': 'Al',
 'Alfons': 'Fonzo',
 'Emilia*Emile': 'Amelia',
 'Amanda': 'Mandy',
 'Amelia': 'Millie',
 'Andreas': 'Andy',
 'Andrew': 'Drew',
 'Angela': 'Angie',
 'Angeline': 'Angie',
 'Deanne': 'Dee',
 'Hannah': 'Nana',
 'Susanna': 'Susan',
 'Ann': 'Nina',
 'Anna': 'Nancy',
 'Annika': 'Anni',
 'Antonin': 'Anton',
 'Anastasia': 'Stasi',
 'Apollonia': 'Appy',
 'Archibald': 'Archy',
 'Arnold': 'Arny',
 'Artemis': 'Art',
 'Arthur': 'Arty',
 'Barbara': 'Barb',
 'Mary': 'Polly',
 'Barnabas': 'Barney',
 'Bartholomew': 'Barty',
 'Sebastian': 'Seby',
 'Beatta': 'Bea',
 'Beatrice': 'Trixie',
 'Rebecca': 'Reba',
 'Isabell': 'Bell',
 'Isabella': 'Tibbie',
 'Arabella': 'Bella',
 'Belinda': 'Bella',
 'Elizabeth': 'Tess',
 'Isabel': 'Bella',
 'Mirabel': 'Mira',
 'Mabel': 'Belle',
 'Sybil':

In [163]:
# Two keys escaped the filters and are not real formal names, so drop them.
# pop(key, None) rather than pop(key) keeps this cell safe to re-run: a second
# run no longer raises KeyError for keys that are already gone.
for bad_key in ('Emilia*Emile', 'namegivento7thchild'):
    name_dict.pop(bad_key, None)

name_dict

{'Acera': 'Acer',
 'Adeline': 'Ada',
 'Adelaide': 'Addie',
 'Agatha': 'Agnes',
 'Agnes': 'Senga',
 'Nancy': 'Nanny',
 'Inez': 'Agnes',
 'Alan': 'Al',
 'Albert': 'Bertie',
 'Alexander': 'Xander',
 'Alfred': 'Alf',
 'Allen': 'Al',
 'Alfons': 'Fonzo',
 'Amanda': 'Mandy',
 'Amelia': 'Millie',
 'Andreas': 'Andy',
 'Andrew': 'Drew',
 'Angela': 'Angie',
 'Angeline': 'Angie',
 'Deanne': 'Dee',
 'Hannah': 'Nana',
 'Susanna': 'Susan',
 'Ann': 'Nina',
 'Anna': 'Nancy',
 'Annika': 'Anni',
 'Antonin': 'Anton',
 'Anastasia': 'Stasi',
 'Apollonia': 'Appy',
 'Archibald': 'Archy',
 'Arnold': 'Arny',
 'Artemis': 'Art',
 'Arthur': 'Arty',
 'Barbara': 'Barb',
 'Mary': 'Polly',
 'Barnabas': 'Barney',
 'Bartholomew': 'Barty',
 'Sebastian': 'Seby',
 'Beatta': 'Bea',
 'Beatrice': 'Trixie',
 'Rebecca': 'Reba',
 'Isabell': 'Bell',
 'Isabella': 'Tibbie',
 'Arabella': 'Bella',
 'Belinda': 'Bella',
 'Elizabeth': 'Tess',
 'Isabel': 'Bella',
 'Mirabel': 'Mira',
 'Mabel': 'Belle',
 'Sybil': 'Belle',
 'Benedict': 'Ben

### I know none of this is optimal...
I have a habit of wasting far too much time trying to optimize trivial tasks.

This is not one of those times.



In [173]:
# One-row frame: each formal name becomes a column holding its nickname string.
# The plain constructor is used because DataFrame.from_dict is documented for
# dict input, not for a list of dicts; the resulting frame is the same.
df = pd.DataFrame([name_dict])

In [220]:
# Flip the one-row frame so every formal name becomes a row label, which reads better.
# Note: this reassigns `df`, so running the cell a second time flips it back.

df = df.T
df.head()

Unnamed: 0,0
Acera,Acer
Adeline,Ada
Adelaide,Addie
Agatha,Agnes
Agnes,Senga


In [221]:
# Save the table as 'Name_Nicknames.csv'; the path is relative, so the file lands in the current working directory.
df.to_csv('Name_Nicknames.csv')