# Creating a Table of Name/Nickname Relationships from a URL
---

### Imports


In [208]:
import sys
from bs4 import BeautifulSoup
import requests
import pandas as pd

### The best URL I was able to find with a decent number of nicknames in a parsable format
[Website Used](https://www.familysearch.org/en/wiki/Traditional_Nicknames_in_Old_Documents_-_A_Wiki_List)

In [4]:
# Address of the FamilySearch wiki page that lists the nicknames to scrape.
static_url = "https://www.familysearch.org/en/wiki/Traditional_Nicknames_in_Old_Documents_-_A_Wiki_List"

In [8]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page further down.
response = requests.get(static_url, timeout=30)
response.raise_for_status()
data = response.text

In [9]:
# Parse the downloaded HTML with the html5lib parser.
soup = BeautifulSoup(markup=data, features='html5lib')

In [81]:
# Display the parsed <title> element to check that the expected page was fetched.
soup.title

<title>Traditional Nicknames in Old Documents - A Wiki List • FamilySearch</title>

In [42]:
# Collect every <p> element of the page.
# NOTE(review): html_data is not read by any of the cells visible here.
html_data = soup.find_all('p')

### Find out where "nickname = name" begins and ends
We'll loop through all of the strings in the soup object that contain "=" and print each one with its index, to find the indices at which the nickname/name pairs start and end.

In [82]:
# Keep only the strings of the page that contain '=', the separator used by
# the "Nickname = Formal Name" entries.
lines = [s for s in soup.strings if '=' in s]

# Print every candidate preceded by its position in `lines`, so the first and
# last real entries can be located by eye. Each element already contains '='
# (see the comprehension above), so no second check is needed in the loop.
for k, line in enumerate(lines):
    print(k)
    print(line)

5
Acer = Acera
6
Xander = Alexander
789

Xina = Christina
790
Nickname = Formal Name
791
Yost = Josef
792
Nickname = Formal Name
793
Zac = Isaac
794
Zach = Zachariah
795
Zach = Zachary
796
Zak = Isaac
797
Zeb = Zebulon
798
Zed = Zedekiah
799
Zeke = Ezekiel
800
Zena = Albertina
	})();

805
(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":212});});


### Names for Nicknames
Our desired information starts at index 5 and ends at index 801

#### Note: While the above filter got us close, we would still be extracting the "Nickname = Formal Name" header at the start of each new letter
We will filter those out with some simple Boolean logic.

In [161]:
# With `lines` already defined and the desired indices determined, walk the
# relevant entries and store them as {formal name: nickname(s)}.
# A few escaped characters and descriptions are left over in the raw text;
# they are stripped out with `filters` before the entry is split on '='.

# Inclusive index range of the nickname entries inside `lines`.
FIRST_ENTRY = 5
LAST_ENTRY = 801

# Order matters: spaces are removed first, so the later filters are written
# without spaces.
filters = [' ', '\xa0', '\n', '(orusedasanameforthebabyofthefamily)', '(f)']

# Formal name -> list of nicknames, in the order they are encountered.
# Collecting into a list keeps every nickname when the same formal name shows
# up on several lines (e.g. "Zac = Isaac" and "Zak = Isaac"); assigning
# straight into a dict would keep only the last one.
nicknames_by_name = {}

# Slicing (rather than indexing) means a listing shorter than expected is
# simply processed up to its end.
for entry in lines[FIRST_ENTRY:LAST_ENTRY + 1]:
    # Skip the per-letter header rows, and Butch, which is a train wreck.
    if "Nickname = Formal Name" in entry or "Butch" in entry:
        continue

    cleaned = entry
    for f in filters:
        cleaned = cleaned.replace(f, '')

    # The formal name is the last field; everything before it is a nickname
    # ("Nick1 = Nick2 = Formal" is possible).
    *nicknames, formal_name = cleaned.split('=')

    known = nicknames_by_name.setdefault(formal_name, [])
    for nickname in nicknames:
        if nickname not in known:
            known.append(nickname)

# Several nicknames for one formal name are joined with ', '.
name_dict = {
    formal_name: ', '.join(nicknames)
    for formal_name, nicknames in nicknames_by_name.items()
}


In [162]:
# Display the resulting mapping (keys are formal names, values are nicknames).
name_dict

{'Acera': 'Acer',
 'Adeline': 'Ada',
 'Adelaide': 'Addie',
 'Agatha': 'Agnes',
 'Zephaniah': 'Zeph'}

In [163]:
# Two malformed keys escaped the filters:
#   'Emilia*Emile'
#   'namegivento7thchild'
# Drop them. Passing None as the default makes pop() a no-op when a key is
# already gone, so re-running this cell does not raise KeyError.
for bad_key in ('Emilia*Emile', 'namegivento7thchild'):
    name_dict.pop(bad_key, None)

name_dict

{'Acera': 'Acer',
 'Adeline': 'Ada',
 'Adelaide': 'Addie',
 'Agatha': 'Agnes',
 'Agnes': 'Senga',
 'Zephaniah': 'Zeph'}

### I know none of this is optimal...
I have a habit of wasting far too much time trying to optimize trivial tasks.

This is not one of those times.



In [173]:
# One-row frame with one column per formal name. A list of records goes to the
# DataFrame constructor; DataFrame.from_dict is documented for dict input only.
df = pd.DataFrame([name_dict])

In [220]:
# We will transpose for better viewing: one row per formal name.
# The frame is built from name_dict here rather than transposing `df` in
# place, so running this cell more than once cannot flip the table back.
df = pd.DataFrame([name_dict]).transpose()
df.head(5)

Unnamed: 0,0
Acera,Acer
Adeline,Ada
Adelaide,Addie
Agatha,Agnes
Agnes,Senga


In [221]:
# Save the table as a CSV file in the current working directory.
df.to_csv(path_or_buf='Name_Nicknames.csv')