<a href="https://colab.research.google.com/github/benardt/genealogyKPI/blob/main/genealogy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Genealogy

In [41]:
# configuration data
#
# Credentials are taken from the GENEANET_LOGIN / GENEANET_PASSWORD
# environment variables when they are set, so that real secrets do not have
# to be saved inside the notebook. The 'xxxxxx' placeholders remain the
# fallback and can still be edited by hand.
import os

my_config = {
    'login': os.environ.get('GENEANET_LOGIN', 'xxxxxx'),
    'password': os.environ.get('GENEANET_PASSWORD', 'xxxxxx'),
    # page holding the login form
    'login_page': 'https://www.geneanet.org/connexion/',
    # page that is downloaded and parsed by the following cells
    'page': 'https://gw.geneanet.org/benardt_w?lang=fr&pz=camille+marie+sylvie&nz=benard&ocz=0&m=A&p=camille+marie+sylvie&n=benard&sosab=10&color=&t=N&v=28'
}


In [42]:
# install dependencies
# (Selenium from PyPI, Chromium driver from the Ubuntu package manager)

# NOTE(review): IPython only recognises a cell magic such as %%capture when it
# is the first line of the cell — confirm that this line really opens the
# notebook cell. %%capture hides the output of everything that follows it.
%%capture
# Selenium Python bindings (version is not pinned)
!pip install selenium  --quiet
!apt-get update # to update ubuntu to correctly run apt install
# Chromium driver package
!apt install chromium-chromedriver
# copy the driver binary into /usr/bin
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import math
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
# Chrome is started without a display (headless); the two other flags are
# passed to Chrome unchanged.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# `options=` replaces the deprecated `chrome_options=` keyword, and the
# positional 'chromedriver' path is dropped: it was the default value in
# Selenium 3 and is no longer accepted positionally by recent Selenium 4
# releases. The driver binary is expected to be discoverable (the install
# step copies it into /usr/bin).
wd = webdriver.Chrome(options=chrome_options)

In [43]:
# connection to Geneanet
#
# Elements are located with find_element(By.ID, ...): the older
# find_element_by_id helpers were removed from Selenium 4, whereas this form
# is accepted by both Selenium 3 and Selenium 4.
from selenium.webdriver.common.by import By

wd.get(my_config['login_page'])

# fill in the login form
username = wd.find_element(By.ID, "_username")
password = wd.find_element(By.ID, "_password")

username.send_keys(my_config['login'])
password.send_keys(my_config['password'])

# the submit button is clicked through JavaScript rather than element.click()
element = wd.find_element(By.ID, '_submit')
wd.execute_script("arguments[0].click();", element)


In [44]:
# Load the configured page in the Selenium session and keep its HTML
# source as UTF-8 encoded bytes.
target_url = my_config['page']
wd.get(target_url)

page_html = wd.page_source
r = page_html.encode("utf-8")

# raw dump of the downloaded page
print (r)

b'<html lang="fr" xmlns="http://www.w3.org/1999/xhtml" style="" class=" js flexbox flexboxlegacy canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths"><!-- geneanet page --><head><style type="text/css">@charset "UTF-8";[ng\\:cloak],[ng-cloak],[data-ng-cloak],[x-ng-cloak],.ng-cloak,.x-ng-cloak,.ng-hide:not(.ng-hide-animate){display:none !important;}ng\\:form{display:block;}.ng-animate-shim{visibility:hidden;}.ng-anchor{position:absolute;}</style>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="content-language" content="fr">\n<meta name="description" content="Arbre 

In [45]:

# Parse the downloaded HTML.
# NOTE(review): no parser is named, so BeautifulSoup picks one itself —
# the result may depend on which parsers are installed.
soup = BeautifulSoup(r)

# <li> items carrying the inline style below are used to locate their
# parent list; every <li> child of those parents (styled or not) is kept.
lis = soup.find_all("li", {"style": "list-style-type:disc"})
T = []
for _styled_li in lis:
  for _child in _styled_li.parent:
    if _child.name == "li":
      T.append(_child)

# leaf of the tree? 0 if not a leaf (styled <li>), 1 if leaf (no style)
my_dict_leaf = {None:1, 'list-style-type:disc':0}

_leaf_flags = [my_dict_leaf[tag.get('style')] for tag in T]
# non-breaking spaces are stripped from the item text
_texts = [tag.text.replace(u'\xa0', u'') for tag in T]
# the person's name is the text of the first link of the item
_names = [tag.find_all("a")[0].text for tag in T]

df = pd.DataFrame({"leaf": _leaf_flags, "text": _texts, "name": _names})
df['name'] = df['name'].astype(str)

# The collection above yields the same <li> several times,
# so duplicates (same text) are removed.
# DataFrame is:
# | leaf | text | name |
df = df.drop_duplicates('text').reset_index(drop=True)

# Each item text holds either '<sosa> - ...' or '<sosa> => <other sosa>'.
_has_dash = df['text'].str.contains(' - ' )
_has_arrow = df['text'].str.contains(' => ')

# sosa number: the value before ' - ' or ' => '
_before_dash = df['text'].str.split(' - ' ).str.get(0)
_arrow_parts = df['text'].str.split(' => ')
df.loc[_has_dash, 'sosa'] = _before_dash
df.loc[_has_arrow, 'sosa'] = _arrow_parts.str.get(0)
df['sosa'] = df['sosa'].astype(int)

# doublon case: sosa_id is the value after ' => ',
# and '0' for regular ' - ' rows
df.loc[_has_dash, 'sosa_id'] = '0'
df.loc[_has_arrow, 'sosa_id'] = _arrow_parts.str.get(1)
df['sosa_id'] = df['sosa_id'].astype(int)

# Gather all sosa numbers of one person on a single row.
# Regular rows (sosa_id == 0) start with their own sosa in a list;
# doublon rows get the marker 0 and are dropped below.
_merged = []
for sosa, sosa_id in zip(df['sosa'], df['sosa_id']):
  if sosa_id == 0:
    _merged.append([sosa])
  else:
    _merged.append(0)

# every doublon adds its sosa to the row it points to
for sosa, sosa_id in zip(df['sosa'], df['sosa_id']):
  if sosa_id == 0:
    continue
  idx = df[df['sosa'] == sosa_id].index[0]
  _merged[idx] += [sosa]

df['sosas'] = _merged
# remove the doublon rows so that each person is kept only once
_doublon_rows = df[df['sosas'] == 0].index
df = df.drop(_doublon_rows).reset_index(drop=True)

# build sosa for parent
# parent must exist in DataFrame

# Row label of every primary sosa number, built once instead of filtering
# the whole DataFrame for each lookup. setdefault keeps the first row should
# a sosa number ever appear twice, which is what `.index[0]` returned.
_row_of_sosa = {}
for _row, _sosa in zip(df.index, df['sosa']):
  _row_of_sosa.setdefault(_sosa, _row)

DATA6 = [[] for _ in df['sosa']]
for sosas in df['sosas'][1:]:
  # person with index 0 is sosa 1
  # and sosa 1 is nobody's parent, so the first row is skipped

  for sosa in sosas:
    # the child of sosa n is sosa n // 2; integer division stays exact
    # whatever the size of the number (int(n / 2) goes through a float)
    child = sosa // 2
    # a missing child now raises KeyError (previously IndexError)
    idx = _row_of_sosa[child]
    # the parent is recorded under its first (primary) sosa number
    DATA6[idx] += [sosas[0]]

df['parents'] = DATA6
del DATA6

# generation = floor(log2(sosa)), computed on integers only
df['generation'] = df['sosa'].map(lambda s: int(s).bit_length() - 1)

# DataFrame is:
# | leaf | text | name | sosa | sosa_id | sosas | parents | generation |

In [46]:
# cell to visualize data
# df_print is df without the bulky 'text' column; uncomment the last
# line to print it.
df_print = df.drop(columns=['text'])

# show every row, but only 3 columns
pd.set_option('display.max_columns', 3)
pd.set_option('display.max_rows', None)

#print(df_print)

In [47]:
# BFS algorithm to find all ancestors
# adapted to return the relative sosa number of every ancestor

def bfs(graph, node):
  '''
  Breadth-first walk of the ancestors of `node`.

  graph: dict - all nodes 'child':['sir','dam']; every parent that is
         listed must itself be a key of the dict
  node: str - starting node (de-cujus)

  Returns the list of relative sosa numbers in visiting order: the
  starting node is 1 and a parent reached from relative sosa k gets
  2*k when int(parent) is even, 2*k+1 when it is odd.
  A person reachable through several paths is reported only once,
  for the first path found.
  '''
  from collections import deque

  visited = {node}            # set: constant-time membership test
  queue = deque([(node, 1)])  # (node, relative sosa) pairs
  rel = []                    # relative tree with sosa number

  while queue:
    s, sosa = queue.popleft()
    rel.append(sosa)

    for parent in graph[s]:
      if parent not in visited:
        visited.add(parent)
        queue.append((parent, 2*sosa+int(parent)%2))

  return rel

In [48]:
# Adjacency map for the BFS algorithm: child sosa -> list of parent sosas.
# Keys and values are strings.

graph = {}

for s, ps in zip(df['sosa'], df['parents']):
  # one or two known parents are kept as they are;
  # any other number of parents gives an empty list
  graph[str(s)] = [str(p) for p in ps] if len(ps) in (1, 2) else []

In [49]:
# find all ancestors of each sosa
# ancestors are stored as relative sosa [1, 2, 3, ....]

_ancestors = []
for sosa in df['sosa']:
  _ancestors.append(bfs(graph, str(sosa)))
df['ancetres'] = _ancestors

# reduced copy of the table for debugging, with no display limit
df_print = df.drop(columns=['name','text','leaf','parents'])
for _option in ('display.max_rows', 'display.max_columns'):
  pd.set_option(_option, None)

# print for debug
# print(df_print)

In [50]:
# utilities function

def getSosa_real(sosa,p):
  '''
  give real sosa from root sosa (sosa)
  and relative sosa (p)

  p is the sosa number of an ancestor counted from the person whose own
  sosa number is `sosa` (p == 1 is that person). The result is the sosa
  number of the same ancestor counted from the root of the tree.

  The generation is obtained with integer arithmetic (bit_length), so
  the result stays exact for arbitrarily large p; a float log2 rounds up
  for values just below a large power of two.

  Raises ValueError when p is lower than 1.
  '''
  p = int(p)
  if p < 1:
    raise ValueError('relative sosa must be >= 1')
  # generation of p relative to its root: floor(log2(p))
  cur_gen = p.bit_length() - 1
  # rank of p inside its generation
  q = p - 2**cur_gen
  return sosa * 2**cur_gen + q

def position(sosa):
  '''
  Plot coordinates (x, y) of a sosa number.

  The radius is the generation, int(log2(sosa)); the angle grows with
  the rank of the sosa inside its generation.
  '''
  generation = int(math.log2(sosa))
  # 2**generation - 2**(generation-1) is the same value as 2**(generation-1)
  slots = 2 ** (generation - 1)
  rank = sosa - 2 ** generation
  phi = rank * math.pi / slots
  x = generation * np.cos(phi)
  y = generation * np.sin(phi)
  return x, y

In [69]:

# Parallel lists, one entry per known sosa number:
#   SO   - the sosa number
#   NA   - name of the person
#   REEL - first sosa number of that person
#   UN   - origin tag of the entry
SO, NA, REEL, UN = [], [], [], []
for _person_sosas, _person_name in zip(df['sosas'], df['name']):
  SO.extend(_person_sosas)
  NA.extend(_person_name for _ in _person_sosas)
  REEL.extend(_person_sosas[0] for _ in _person_sosas)
  UN.extend('not doublon' for _ in _person_sosas)


# Extend SO / NA / UN / REEL with the ancestors of every person as seen from
# each of the person's sosa numbers. Passes are repeated until a pass finds
# every ancestor it looks up.
# NOTE(review): if some ancestor can never be found in SO, `stop` never
# returns to 0 and the loop does not end — confirm this cannot happen.
stop = -1
while stop != 0:
  # number of failed lookups during this pass
  stop = 0
  # persons are visited from the last DataFrame row to the first
  s = zip(reversed(df['sosas']),reversed(df['ancetres']))
  for sosas,ancetres in s:
    # ancs[k]: real sosa numbers of the person's ancestors computed from
    # the k-th sosa number of the person (same order as `ancetres`)
    ancs = []
    for sosa in sosas:
      ancs.append([getSosa_real(sosa,sosa_relative) for sosa_relative in ancetres])

    # name and first sosa of each ancestor, looked up through ancs[0] only
    names,reels = [],[]
    for so in ancs[0]:
      if so in SO:
        names.append(NA[SO.index(so)])
        reels.append(REEL[SO.index(so)])
      else:
        # not known yet: request another pass of the 'while' loop
        stop += 1
        names.append(None)
        reels.append(None)

    # register every resolved ancestor under each of its sosa numbers
    # that is not in SO yet
    for ancss in ancs:
      for name,sosa,reel in zip(names,ancss,reels):
        if name != None:
          if sosa not in SO:
            SO.append(sosa)
            NA.append(name)
            UN.append('doublon')
            REEL.append(reel)

# TODO: use REEL to count how many times
# a 'not doublon' sosa is an ancestor

# position() is evaluated once per sosa; the (x, y) pairs are split afterwards
_coords = [position(sosa) for sosa in SO]
X = [x for x, _ in _coords]
Y = [y for _, y in _coords]

# summary: unique persons, total sosa entries, share of repeated entries
print("Nombre d'ancêtres uniques : ",len(df['sosas']))
print("Nombre total d'ancêtres : ",len(SO))
print("Pourcentage d'ancêtres en commun : ",int(100*(1-len(df['sosas'])/len(SO))))

# Number of times a sosa is repeated (number of doublons for one person)
# CC starts at 1 for every entry.
CC = np.ones(len(REEL))
# occurrences of each value stored in REEL
unique, counts = np.unique(REEL, return_counts=True)
for u,c in zip(unique,counts):
    # NOTE(review): SO is a plain list; comparing it with the NumPy value u
    # presumably yields an element-wise boolean mask — confirm. Only the
    # entries whose own sosa number equals u receive the count, the other
    # entries keep 1.
    CC[SO == u] = c

# One row per sosa number for the scatter plot.
dfp = pd.DataFrame(dict(X=X, Y=Y, C=UN, N=NA, S=SO, CC=CC, RE=REEL))

# CC is turned into text so that plotly uses a discrete colour palette
# instead of a continuous colour scale
dfp["CC"] = dfp["CC"].astype(str)

fig = px.scatter(
    data_frame=dfp,
    x="X",
    y="Y",
    color="CC",
    hover_data=['N','S','RE'],
    width=800,
    height=800,
)
# same scale on both axes
fig.update_yaxes(scaleanchor = "x", scaleratio = 1)
fig.show()

Nombre d'ancêtres uniques :  2463
Nombre total d'ancêtres :  4404
Pourcentage d'ancêtres en commun :  44


In [67]:
# Sunburst over every sosa number below 2**N.
N = 6

_known_sosas = df['sosa'].to_list()

def _name_for(sosa_number):
  '''Name stored in df for the row whose 'sosa' equals sosa_number.'''
  row = df[df['sosa']==sosa_number].index[0]
  return df['name'][row]

# root node: sosa 1 has no parent and takes the whole circle
L = [_name_for(1)]   # labels
P = [None]           # parent labels
V = [360]            # sector sizes

# other nodes
for sosa in range(2,2**N):
  generation = int(math.log2(sosa))
  # every sosa of a generation gets the same share of the circle
  V.append(360/(2**generation))
  if sosa in _known_sosas:
    L.append(_name_for(sosa))
    # the child of a sosa is sosa/2 rounded down
    P.append(_name_for(int(sosa/2)))
  else:
    # sosa number missing from the table
    L.append(None)
    P.append(None)

_sunburst = go.Sunburst(
    labels=L,
    parents=P,
    values=V,
    branchvalues="total",
)
fig = go.Figure(_sunburst)

fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))

# fig.show()

In [72]:

# Table of the values of REEL that occur more than once:
# U = sosa, C = number of occurrences, G = generation, N = name
U,C,G,N = [],[],[],[]

unique, counts = np.unique(REEL, return_counts=True)
for u,c in zip(unique,counts):
  if c == 1:
    # present only once: not listed
    continue
  for sosas,name in zip(df['sosas'],df['name']):
    for sosa in sosas:
      if u != sosa:
        continue
      U.append(u)
      C.append(c)
      G.append(int(math.log2(u)))
      N.append(name)

# the 'sosas' column is filled with empty strings
newdf = pd.DataFrame(dict(sosa=U, generation=G, count=C, name=N, sosas=''))

# Show newdf as a plotly table.
# NOTE(review): four header titles are given while the cells hold every
# column of newdf (five, including 'sosas') — confirm this is intended.
_header = dict(
    values=["sosa", "generation", "count", "name"],
    font=dict(size=10),
    align="left",
)
_cells = dict(
    values=[newdf[col].tolist() for col in newdf.columns],
    align = "left",
)

fig = make_subplots(rows=1, cols=1, specs=[[{"type": "table"}]])
fig.add_trace(go.Table(header=_header, cells=_cells), row=1, col=1)
fig.update_layout(
    height=800,
    showlegend=False,
    title_text="Most several anecesters",
)

fig.show()



In [73]:
# inbreeding

# P[k] lists the parents of person k, both counted in reversed row order:
# DataFrame row r becomes index n-1-r, so the last row comes first.
n = len(df['parents'])
P = [
  [n-1-df[df['sosa']==parent].index[0] for parent in parents]
  for parents in df['parents']
][::-1]

# Small worked example of the expected input (0 = unknown parent):
# Animal Sire Dam
#     1    0   0
#     2    1   0
#     3    1   0
#     4    1   0
#     5    3   2
#     6    3   4
#     7    5   6
# which corresponds to this P (0-based indices):
#P = [[],[0],[0],[0],[2,1],[2,3],[4,5]]

# A is a symmetric n x n matrix filled row by row:
#   A[i,i] = 1 + 0.5 * A[p,q]      when i has two known parents p and q
#   A[i,j] = 0.5 * sum(A[i,k])     over the known parents k of j, for i < j
# NOTE(review): this presumably requires every parent index to be lower than
# the index of its child, so that A[i,k] is already final — confirm.
A = np.zeros((n,n))

for i,_ in enumerate(A):
  for j,_ in enumerate(A):
    if i < j:
      # off-diagonal term: half of the values of row i at the parents of j
      for idx in P[j]:
        A[i,j] += 0.5 * A[i,idx]

    elif i == j:
      # diagonal term
      A[i,j] = 1.0
      if len(P[i]) == 2:
        id_p,id_q = P[i][0],P[i][1]
        A[i,j] += 0.5 * A[id_p,id_q]
  # mirror the finished row into column i to keep A symmetric
  A[:,i] = A[i,:].T


In [74]:

# Inbreeding in percent: 100 * (diagonal of A - 1).
I = [100*(val-1) for val in np.diag(A)]
# A is indexed in reversed row order, so the list is flipped back
# to the DataFrame order
I.reverse()
print(I)

df['inbreeding'] = I


[0.0, 0.007321138400584459, 0.0, 0.0007885042577981949, 0.0017342623323202133, 0.0, 0.0, 4.76837158203125e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006103515625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02447068691253662, 1.5625, 0.0, 0.0, 0.0, 0.0, 0.0, 6.594407558441162, 0.0, 1.6845703125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.25, 0.0, 0.0, 0.78125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0244140625, 0.0, 0.0, 0.18854141235351562, 0.0, 0.9765625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.8125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.000667572021484375, 12.6953125, 0.78125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7874488830566406, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3973960876464844, 0.00152587890625, 0.3973960876464844, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.953125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0