In [1]:
import pandas as pd
import lxml             # Needed for read_html function.
                        # This imports fine on my machine; may need to be pip- or conda-installed on IBM server
import numpy as np
import requests

In [3]:
names_url = r"https://www.irelandbeforeyoudie.com/ranked-top-100-irish-surnames-and-meanings"

In [4]:
# Pretend to be a browser, to avoid a 403: Forbidden error when scraping

header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(names_url, headers = header)
df = pd.read_html(r.text)

In [5]:
len(df)

11

In [6]:
df[-1]

Unnamed: 0,0,1,2,3
0,Rank,Name,Gaelic Equivalent,Meaning
1,1,Murphy,ó Murchadha,sea-battler
2,2,Kelly,ó Ceallaigh,bright-headed
3,3,O’Sullivan,ó Súilleabháin,dark-eyed
4,4,Walsh,Breathnach,Welshman
...,...,...,...,...
96,96,MacDermott,Mac Diarmada,free from jealousy
97,97,Molony,ó Maolomhnaigh,servant of the Church
98,98,O’Rourke,ó Ruairc,
99,99,Buckley,ó Buachalla,cow herd


In [7]:
# I'll extract all the names and Gaelic equivalents to a list:
names_list = df[-1][1].tolist()         # the English names
names_list += df[-1][2].tolist()         # the Gaelic names

In [8]:
names_list

['Name',
 'Murphy',
 'Kelly',
 'O’Sullivan',
 'Walsh',
 'Smith',
 'O’Brien',
 'Byrne',
 'Ryan',
 'O’Connor',
 'O’Neill',
 'O’Reilly',
 'Doyle',
 'McCarthy',
 'Gallagher',
 'O’Doherty',
 'Kennedy',
 'Lynch',
 'Murray',
 'Quinn',
 'Moore',
 'McLoughlin',
 'O’Carroll',
 'Connolly',
 'Daly',
 'O’Connell',
 'Wilson',
 'Dunne',
 'Brennan',
 'Burke',
 'Collins',
 'Campbell',
 'Clarke',
 'Johnston',
 'Hughes',
 'O’Farrell',
 'Fitzgerald',
 'Brown',
 'Martin',
 'Maguire',
 'Nolan',
 'Flynn',
 'Thompson',
 'O’Callaghan',
 'O’Donnell',
 'Duffy',
 'O’Mahony',
 'Boyle',
 'Healy',
 'O’Shea',
 'White',
 'Sweeney',
 'Hayes',
 'Kavanagh',
 'Power',
 'McGrath',
 'Moran',
 'Brady',
 'Stewart',
 'Casey',
 'Foley',
 'Fitzpatrick',
 'O’Leary',
 'McDonnell',
 'MacMahon',
 'Donnelly',
 'Regan',
 'Donovan',
 'Burns',
 'Flanagan',
 'Mullan',
 'Barry',
 'Kane',
 'Robinson',
 'Cunningham',
 'Griffin',
 'Kenny',
 'Sheehan',
 'Ward',
 'Whelan',
 'Lyons',
 'Reid',
 'Graham',
 'Higgins',
 'Cullen',
 'Keane',
 'King',

In [29]:
# remove the 'nan' entries, which are of type float, and the initial 'Name' entry:
names_list = [el for el in names_list if type(el) == str]
names_list = names_list[1:]

In [30]:
len(names_list)

194

In [2]:
# I'll also grab the "List of streets and squares in Dublin" from 
# Wikipedia -- the "English" column, the "Irish" column, the "Continuations" and "Termini" columns:
dublin_url = r"https://en.wikipedia.org/wiki/List_of_streets_and_squares_in_Dublin"
dub_df = pd.read_html(dublin_url)

In [3]:
dub_df = dub_df[0]

In [4]:
dub_df

Unnamed: 0,English Street or square name,Irish Street or square name[1][2][3],Date[4],Historical names[5][6],Route number[7],Dublin postal district,Sections,Continuations,Termini and major cross-streets
0,Abbey Street,Sráid na Mainistreach,1728.0,,,1,"Lwr, mid, upr",Mary's Abbey,"Beresford St, Capel St"
1,Ailesbury Road,"Bóthar Ailesbury, Bóthar Aelsbaire",,,R824,4,,,"Sydney Parade Ave, Merrion Rd, Stillorgan Rd"
2,Amiens Street,Sráid Amiens,1800.0,The Strand (1728),R105,1,,N Strand Rd,"Memorial Rd/Beresford Pl, Talbot St, Portland ..."
3,Anglesea Road,Bóthar Mon,,,R815,4,,Beaver Row,"Merrion Road, Stillorgan Road"
4,Anne Street South (South Anne Street),Sráid Anna Theas,1723.0,South St Anne St,,2,,Harry St,"Grafton St, Dawson St"
...,...,...,...,...,...,...,...,...,...
66,Thormanby Road,,,,R105 (part),13,,,"Main St, Carrickbrack Rd"
67,Tritonville Road,Bóthar Tritonville,,,,4,,Irishtown Rd,"Londonbridge Rd/Church Ave, Claremont Rd/Serpe..."
68,Werburgh Street,"Sráid San Werburgh, Sráid San Werburga, Sráid ...",1257.0,,,2,,Bride St,"Christchurch Pl/Lord Edward St, Bull Alley St/..."
69,Westland Row,Rae an Iarthair,1773.0,Westland's,,2,,Lombard Street,Pearse Street/Lincoln Place


In [5]:
# Replace the slashes in the last column with commas:
df_mask = dub_df['Termini and major cross-streets'].str.contains("/", na=False)
dub_df['Termini and major cross-streets'].mask(df_mask, dub_df['Termini and major cross-streets'].str.replace("/", ", ").replace(" ,", ","), inplace=True)

In [6]:
dub_df

Unnamed: 0,English Street or square name,Irish Street or square name[1][2][3],Date[4],Historical names[5][6],Route number[7],Dublin postal district,Sections,Continuations,Termini and major cross-streets
0,Abbey Street,Sráid na Mainistreach,1728.0,,,1,"Lwr, mid, upr",Mary's Abbey,"Beresford St, Capel St"
1,Ailesbury Road,"Bóthar Ailesbury, Bóthar Aelsbaire",,,R824,4,,,"Sydney Parade Ave, Merrion Rd, Stillorgan Rd"
2,Amiens Street,Sráid Amiens,1800.0,The Strand (1728),R105,1,,N Strand Rd,"Memorial Rd, Beresford Pl, Talbot St, Portland..."
3,Anglesea Road,Bóthar Mon,,,R815,4,,Beaver Row,"Merrion Road, Stillorgan Road"
4,Anne Street South (South Anne Street),Sráid Anna Theas,1723.0,South St Anne St,,2,,Harry St,"Grafton St, Dawson St"
...,...,...,...,...,...,...,...,...,...
66,Thormanby Road,,,,R105 (part),13,,,"Main St, Carrickbrack Rd"
67,Tritonville Road,Bóthar Tritonville,,,,4,,Irishtown Rd,"Londonbridge Rd, Church Ave, Claremont Rd, Ser..."
68,Werburgh Street,"Sráid San Werburgh, Sráid San Werburga, Sráid ...",1257.0,,,2,,Bride St,"Christchurch Pl, Lord Edward St, Bull Alley St..."
69,Westland Row,Rae an Iarthair,1773.0,Westland's,,2,,Lombard Street,"Pearse Street, Lincoln Place"


In [8]:
en_streets_list = dub_df["English Street or square name"].tolist()
ei_streets_list = dub_df["Irish Street or square name[1][2][3]"].tolist()
en_streets_list += dub_df["Continuations"].tolist()
en_streets_list += dub_df["Termini and major cross-streets"].tolist()
# (I'm a bit uncertain about adding the last two columns to the English streets
# list, but am OK either way)

In [9]:
# I could have removed 'nan's during the list generation, but I'll do it now:
en_streets_list = [el for el in en_streets_list if type(el) == str]
ei_streets_list = [el for el in ei_streets_list if type(el) == str]

In [10]:
en_streets_list

['Abbey Street',
 'Ailesbury Road',
 'Amiens Street',
 'Anglesea Road',
 'Anne Street South (South Anne Street)',
 'Baggot Street',
 'Bayside Boulevard',
 'Bridge Street',
 'Capel Street',
 'Carrickbrack Road',
 'Clyde Road',
 'College Green',
 'Dame Street',
 'Dawson Street',
 'Dorset Street',
 'Drumcondra Road',
 'East Wall Road',
 'Eccles Street',
 'Ely Place',
 'Fishamble Street',
 'Fitzwilliam Square',
 'Gardiner Street',
 'Gilford Road',
 'Grafton Street',
 'Greenfield Road',
 'Henrietta Street',
 'Henry Street',
 'Herbert Park',
 'Herbert Road',
 'Heytesbury Street',
 'Howth Road',
 'Kildare Street',
 'Leeson Street',
 'Marlborough Street',
 'Marrowbone Lane',
 'Merrion Road',
 'Merrion Square',
 'Merrion Street',
 'Molesworth Street',
 'Mountjoy Square',
 'Naas Road',
 'Nassau Street',
 'Newbridge Avenue',
 'North Circular Road',
 "North Great George's Street",
 'North Strand Road',
 'Nutley Lane',
 "O'Connell Street",
 'Park Avenue',
 'Parliament Street',
 'Parnell Square',
 '

In [11]:
ei_streets_list

['Sráid na Mainistreach',
 'Bóthar Ailesbury, Bóthar Aelsbaire',
 'Sráid Amiens',
 'Bóthar Mon',
 'Sráid Anna Theas',
 'Sráid Bhagóid',
 'Búlbhard Chois Bá',
 'Sráid an Droichid',
 'Sráid Chéipil',
 'Bóthar Charraig Bhreac',
 'Bóthar Cluaidh',
 'Faiche an Choláiste',
 'Sráid an Dáma',
 'Sráid Dhásain',
 'Sráid Dorset',
 'Bóthar Dhroim Conrach',
 'Bóthar an Phoiirt Thoir',
 'Sráid Eccles',
 'Plás Íle',
 'Sráid Sheamlas an Éisc',
 'Cearnóg Mhic Liam, Cearnóg Fitzwilliam',
 'Sráid Ghairdinéir',
 'Bóthar Gilford',
 'Sráid Grafton',
 'Sráid Henrietta',
 'Sráid Anraí',
 'Páirc Hoirbeaird',
 'Bóthar Hoirbeaird',
 'Sráid Heytesbury',
 'Bóthar Bhinn Éadair',
 'Sráid Chill Dara',
 'Sráid Líosain',
 'Sráid Mhaoilbhríde, Sráid Maoilbhríde, Sráid Marlborough',
 'Lána Mhuire Mhaith, Lána Mhuire',
 'Bóthar Mhuirfean',
 'Cearnóg Mhuirfean',
 'Sráid Mhuirfean',
 'Sráid Theach Laighean',
 'Cearnóg Mhuinseo',
 'Bóthar an Náis',
 'Sráid Thobar Phádraig, Sráid Nassau',
 'Ascaill an Droichid Nua',
 'Cuarbhó

In [12]:
# And a lot of the entries are comma-separated street names. Here I'll split these up into separate entries. 
en_sl = [name.split(",") for name in en_streets_list] 
ei_sl = [name.split(",") for name in ei_streets_list]

In [13]:
# Now flatten the result:
import itertools
en_sl = list(itertools.chain(*en_sl))
ei_sl = list(itertools.chain(*ei_sl))
# And strip out extraneous spaces:
en_streets_list = [el.strip() for el in en_sl]
ei_streets_list = [el.strip() for el in ei_sl]

In [14]:
en_streets_list

['Abbey Street',
 'Ailesbury Road',
 'Amiens Street',
 'Anglesea Road',
 'Anne Street South (South Anne Street)',
 'Baggot Street',
 'Bayside Boulevard',
 'Bridge Street',
 'Capel Street',
 'Carrickbrack Road',
 'Clyde Road',
 'College Green',
 'Dame Street',
 'Dawson Street',
 'Dorset Street',
 'Drumcondra Road',
 'East Wall Road',
 'Eccles Street',
 'Ely Place',
 'Fishamble Street',
 'Fitzwilliam Square',
 'Gardiner Street',
 'Gilford Road',
 'Grafton Street',
 'Greenfield Road',
 'Henrietta Street',
 'Henry Street',
 'Herbert Park',
 'Herbert Road',
 'Heytesbury Street',
 'Howth Road',
 'Kildare Street',
 'Leeson Street',
 'Marlborough Street',
 'Marrowbone Lane',
 'Merrion Road',
 'Merrion Square',
 'Merrion Street',
 'Molesworth Street',
 'Mountjoy Square',
 'Naas Road',
 'Nassau Street',
 'Newbridge Avenue',
 'North Circular Road',
 "North Great George's Street",
 'North Strand Road',
 'Nutley Lane',
 "O'Connell Street",
 'Park Avenue',
 'Parliament Street',
 'Parnell Square',
 '

In [15]:
ei_streets_list

['Sráid na Mainistreach',
 'Bóthar Ailesbury',
 'Bóthar Aelsbaire',
 'Sráid Amiens',
 'Bóthar Mon',
 'Sráid Anna Theas',
 'Sráid Bhagóid',
 'Búlbhard Chois Bá',
 'Sráid an Droichid',
 'Sráid Chéipil',
 'Bóthar Charraig Bhreac',
 'Bóthar Cluaidh',
 'Faiche an Choláiste',
 'Sráid an Dáma',
 'Sráid Dhásain',
 'Sráid Dorset',
 'Bóthar Dhroim Conrach',
 'Bóthar an Phoiirt Thoir',
 'Sráid Eccles',
 'Plás Íle',
 'Sráid Sheamlas an Éisc',
 'Cearnóg Mhic Liam',
 'Cearnóg Fitzwilliam',
 'Sráid Ghairdinéir',
 'Bóthar Gilford',
 'Sráid Grafton',
 'Sráid Henrietta',
 'Sráid Anraí',
 'Páirc Hoirbeaird',
 'Bóthar Hoirbeaird',
 'Sráid Heytesbury',
 'Bóthar Bhinn Éadair',
 'Sráid Chill Dara',
 'Sráid Líosain',
 'Sráid Mhaoilbhríde',
 'Sráid Maoilbhríde',
 'Sráid Marlborough',
 'Lána Mhuire Mhaith',
 'Lána Mhuire',
 'Bóthar Mhuirfean',
 'Cearnóg Mhuirfean',
 'Sráid Mhuirfean',
 'Sráid Theach Laighean',
 'Cearnóg Mhuinseo',
 'Bóthar an Náis',
 'Sráid Thobar Phádraig',
 'Sráid Nassau',
 'Ascaill an Droich

In [24]:
# Finally, get synonyms for "pub":
# This url has good info, but it's not easy to scrape: 
# pub_url = r"https://www.macmillandictionary.com/thesaurus-category/british/bars-pubs-and-clubs"
# Instead, I copied-and-pasted from that page into a local file.
# Looking through that file, I see that though the line-returns are screwy, each synonym is on a line by itself,
# ending with the word "noun". So it's easy to extract the terms themselves....
import sys, os
pub_file_path = os.path.abspath(os.path.join(".", "pub_synonyms.txt"))

In [25]:
pub_syns = []
with open(pub_file_path, 'r') as fp:
    for line in fp:
        if line.endswith("noun\n"):
            pub_syns.append(line.strip("noun\n").strip())
            

In [26]:
# Remove the duplicates (e.g. "shebeen")
pub_syns = list(set(pub_syns))

In [27]:
# Also, I notice that "brewpub", "brewery", "brew house" aren't included...
pub_syns += ["brewpub", "brew pub", "brewery", "brew house", "ale house"]

In [15]:
pub_syns

['saloon',
 'bar',
 'coaching inn',
 'free house',
 'hostelry',
 'alehouse',
 'local',
 'lounge',
 'beer garden',
 'bistro',
 'social club',
 'joint',
 'boozer',
 'gastropub',
 'pub',
 'roadhouse',
 'tavern',
 'shebeen',
 'hotel',
 'wine bar',
 'inn',
 'speakeasy',
 'club',
 'the house',
 'honky-tonk',
 'public house',
 'watering hole',
 'dive',
 'brewpub',
 'brew pub',
 'brewery',
 'brew house',
 'ale house']

In [31]:
# I'll save these lists locally, for easy importation and use later:
with open("names_list.txt", 'w') as file_obj:
    for item in names_list:
        file_obj.write(f"{item}\n")

In [16]:
with open("English_streets_list.txt", 'w') as file_obj:
    for item in en_streets_list:
        file_obj.write(f"{item}\n")
        
with open("Irish_streets_list.txt", 'w') as file_obj:
    for item in ei_streets_list:
        file_obj.write(f"{item}\n")

In [34]:
with open("pubs_list.txt", 'w') as file_obj:
    for item in pub_syns:
        file_obj.write(f"{item}\n")