<a href="https://colab.research.google.com/github/benardt/JScad2d/blob/master/genealogy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Genealogy

In [1]:
# configuration data
#
# Credentials are deliberately NOT stored in the notebook (it is shared on
# GitHub/Colab). They are read from the GENEANET_LOGIN / GENEANET_PASSWORD
# environment variables when these are set, otherwise asked interactively
# (the password prompt does not echo what is typed).
import os
from getpass import getpass

my_config = {
    'login': os.environ.get('GENEANET_LOGIN') or input('Geneanet login: '),
    'password': os.environ.get('GENEANET_PASSWORD') or getpass('Geneanet password: '),
    # page holding the login form
    'login_page': 'https://www.geneanet.org/connexion/',
    # ancestry page that is scraped once logged in
    'page': 'https://gw.geneanet.org/benardt_w?lang=fr&m=A&p=camille+marie+sylvie&n=benard&t=N&v=100'
}


In [2]:
# code linter
# Optional tooling: installs pycodestyle and the nbpep8 helper so that a cell
# can be checked against PEP8. Nothing else in this notebook uses `pep8`.

!pip install pycodestyle
# NOTE(review): nbpep8 is installed from the TestPyPI index and is not
# version-pinned -- confirm this source is trusted before running.
!pip install --index-url https://test.pypi.org/simple/ nbpep8

from nbpep8.nbpep8 import pep8

# Add pep8(_ih) at the end of the code cell to see PEP8 analysis.

Looking in indexes: https://test.pypi.org/simple/


In [3]:
# install dependencies

%%capture
!pip install selenium  --quiet
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import math
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

In [4]:
# connection to Geneanet
# Fills and submits the login form with the shared browser `wd`, using the
# credentials and login URL stored in `my_config`.
from selenium.webdriver.common.by import By

wd.get(my_config['login_page'])

# the two input fields are located by their HTML element id
username = wd.find_element(By.ID, "_username")
password = wd.find_element(By.ID, "_password")

username.send_keys(my_config['login'])
password.send_keys(my_config['password'])

element = wd.find_element(By.ID, '_submit')
# the submit button is clicked through JavaScript instead of element.click()
# NOTE(review): presumably because the button is not directly clickable in
# headless mode -- confirm.
wd.execute_script("arguments[0].click();", element)


In [5]:

# read the page
# Loads the ancestry page configured in my_config['page'] in the browser.
wd.get(my_config['page'])

# the rendered page source is encoded to UTF-8 bytes before being parsed
r = wd.page_source.encode("utf-8")
# NOTE(review): no parser is named, so BeautifulSoup picks one from what is
# installed; consider naming it explicitly so parsing is the same everywhere.
soup = BeautifulSoup(r)
# print(r)

In [54]:
class Ancestors:
    '''
    Ancestor list read from a parsed ascendancy page, with helpers to detect
    people appearing several times and to compute inbreeding coefficients.

    Each person is identified by a sosa number: the parents of sosa n are
    numbered 2n and 2n+1 (see build_parents / bfs).

    The build steps depend on each other and must be run in this order:
    build_sosas -> build_parents -> build_ancetres -> build_allsosa
    -> count_unique -> inbreeding.
    '''

    # maps the inline ``style`` attribute of a person's <li> to the flag
    # stored in self.leaf (any other style value raises KeyError)
    my_dict_leaf = {None: 1, 'list-style-type:disc': 0}

    def __init__(self, soup):
        '''
        Collect one entry per person found in the page.

        Every <li> whose text contains "Génération" is scanned: each <li>
        found inside its first nested <ul> is read as one person.

        @param: soup - parsed page (BeautifulSoup-like object)
        '''
        self.graph = {}   # str(sosa) -> list of str(parent sosa)

        self.leaf = []    # flag taken from my_dict_leaf, one per person
        self.text = []    # full text of the entry (no-break spaces removed)
        self.name = {}    # sosa -> text of the first link of the entry
        self.sosa = []    # sosa number of the entry

        all_lis = soup.find_all("li")

        for li0 in all_lis:
            if "Génération" in li0.text:
                ul = li0.find("ul")
                for li in ul.find_all("li"):

                    self.leaf.append(self.my_dict_leaf[li.get('style')])

                    name = li.find_all("a")[0].text
                    text = li.text.replace(u'\xa0', u'')
                    sosa = self.get_sosa(text)

                    self.name[sosa] = name
                    self.text.append(text)
                    self.sosa.append(sosa)

    def get_sosa(self, mytext, origin=False):
        '''
        Extract a sosa number from the text of an entry.

        Two formats are understood:
        "<sosa> - ..."         regular entry, its origin sosa is 0
        "<sosa> => <origin>"   entry pointing to another sosa

        @param: str - text of the entry
        @param: bool - when True return the origin sosa instead of the sosa
        @return: int - sosa (or origin sosa) number
        @raise: ValueError - neither separator found, or non numeric value
        '''
        if ' - ' in mytext:
            sosa = mytext.split(' - ')[0]
            sosa_o = '0'
        elif ' => ' in mytext:
            parts = mytext.split(' => ')
            sosa, sosa_o = parts[0], parts[1]
        else:
            # previously ended in an obscure UnboundLocalError
            raise ValueError("no sosa separator (' - ' or ' => ') in %r" % mytext)

        # both values are converted so a malformed entry is always reported
        arr = [int(sosa), int(sosa_o)]

        return arr[1] if origin else arr[0]

    def bfs(self, node):
        '''
        Breadth-first walk of self.graph from node towards the ancestors.

        The starting node gets relative sosa 1; a parent reached from a node
        with relative sosa n gets 2n + (its own sosa modulo 2). A person is
        only visited once (first path found).

        @param: str - starting node, a key of self.graph
        @return: list of int - relative sosa numbers in visiting order
        '''
        visited = {node}   # set: constant time membership test
        queue = [node]     # nodes in visiting order, read with a cursor
        rel = [1]          # relative sosa of queue[k] is rel[k]
        head = 0

        while head < len(queue):
            s = queue[head]
            sosa = rel[head]
            head += 1

            for parent in self.graph[s]:
                if parent not in visited:
                    visited.add(parent)
                    queue.append(parent)
                    rel.append(2 * sosa + int(parent) % 2)

        return rel

    def getSosa_real(self, sosa, p):
        '''
        give real sosa from root sosa (sosa)
        and relative sosa (p)

        @param: int - sosa of the person used as root
        @param: int - sosa relative to that root (>= 1)
        @return: int - sosa relative to the de-cujus
        '''
        # generation of p = position of its highest bit (exact integer math)
        cur_gen = int(p).bit_length() - 1
        q = p - 2**cur_gen
        return sosa * 2**cur_gen + q

    def build_sosas(self):
        '''
        Group the sosa numbers designating the same person.

        self.sosas gets one list per kept person: its own sosa followed by
        the sosa of every "=>" entry pointing to it. The "=>" entries are
        then removed from leaf, text, sosa and sosas.
        An origin has to be listed before the entries pointing to it.
        '''
        self.sosas = []

        for text, sosa in zip(self.text, self.sosa):

            sosa_o = self.get_sosa(text, True)

            if sosa_o == 0:
                self.sosas.append([sosa])
            else:
                # placeholder, keeps self.sosas aligned with self.sosa
                self.sosas.append(0)
                idx = self.sosa.index(sosa_o)
                self.sosas[idx].append(sosa)

        # entries with a non-zero origin sosa are duplicates of somebody
        # else: remove them from all the lists
        idxs = [idx for idx, val in enumerate(self.sosas) if val == 0]

        # from the end, so the remaining indexes stay valid
        for index in sorted(idxs, reverse=True):
            del self.leaf[index]
            del self.text[index]
            del self.sosa[index]
            del self.sosas[index]

    def getName(self, sosa):
        '''
        return name of person identified by sosa number

        Needs build_allsosa() to have been run.

        @param: int - sosa number
        @return: str - name of sosa ("None" for sosa 0)
        '''
        if sosa == 0:
            return "None"

        # uses the instance itself (was reading a global variable `mya`)
        idx = self.all_sosa.index(sosa)
        sosa_r = self.all_base[idx]
        return self.name[sosa_r]

    def build_allsosa(self):
        '''
        Build the flat lists all_sosa / all_base.

        all_sosa: every sosa number known, including the ones deduced for
                  the ancestors of people listed several times
        all_base: for all_sosa[k], the sosa under which that person is
                  stored in self.sosa
        '''
        self.all_sosa = list(self.sosa)
        self.all_base = list(self.sosa)
        known = set(self.all_sosa)

        # Flatten list "self.sosas"
        for sosas, ancs in zip(self.sosas, self.ancetres):
            # There are only duplicate if len(sosas) > 1
            if len(sosas) > 1:
                sosa_r = sosas[0]
                for anc in ancs:
                    new_r = self.getSosa_real(sosa_r, anc)
                    for sosa in sosas[1:]:
                        sosa_new = self.getSosa_real(sosa, anc)
                        if sosa_new not in known:
                            known.add(sosa_new)
                            self.all_sosa.append(sosa_new)
                            self.all_base.append(new_r)

        # follow the chain of bases until a sosa of self.sosa is reached
        first_pos = {}
        for pos, sosa in enumerate(self.all_sosa):
            first_pos.setdefault(sosa, pos)
        listed = set(self.sosa)

        for idx, sosa in enumerate(self.all_base):
            temp = sosa
            while temp not in listed:
                temp = self.all_base[first_pos[temp]]
            self.all_base[idx] = temp

    def build_parents(self):
        '''
        Fill self.parents: for each person of self.sosa, the sosa (first
        number of the group) of each of its known parents.
        '''
        self.parents = [[] for _ in self.sosas]

        # person with index 0 is sosa 1: nobody has it as parent
        for sosas in self.sosas[1:]:
            for sosa in sosas:
                # child of sosa n is n // 2; the parent is recorded under
                # the first sosa of its group
                child = sosa // 2
                idx = self.sosa.index(child)
                self.parents[idx] += [sosas[0]]

    def build_ancetres(self):
        '''
        Build self.graph from self.parents, then self.ancetres: for each
        person, the relative sosa numbers returned by bfs().
        '''
        for s, ps in zip(self.sosa, self.parents):
            # only 1 or 2 parents are accepted, anything else means none
            if len(ps) in (1, 2):
                self.graph[str(s)] = [str(p) for p in ps]
            else:
                self.graph[str(s)] = []

        self.ancetres = [self.bfs(str(sosa)) for sosa in self.sosa]

    def generation(self, idx):
        '''
        @param: int - index in self.sosa
        @return: int - generation of that sosa (0 for sosa 1)
        '''
        return int(math.log2(self.sosa[idx]))

    def length(self):
        '''
        @return: int - number of people kept (length of self.leaf)
        '''
        return len(self.leaf)

    def count_unique(self):
        '''
        Fill self.count: for each entry of all_base, the number of times its
        base sosa appears (number of duplicates of the same person).
        '''
        from collections import Counter

        occurrences = Counter(self.all_base)
        self.count = [occurrences[sosa] for sosa in self.all_base]

    def inbreeding(self):
        '''
        Compute the inbreeding coefficient (in percent) of every person of
        self.sosa with a relationship matrix A built from the oldest person
        to the youngest:
        A[i, i] = 1 + 0.5 * A[p, q]               (p, q: parents of i)
        A[i, j] = 0.5 * (A[i, p] + A[i, q])       (p, q: parents of j)

        The parents of a person must come after it in self.sosa.

        NOTE: the result is stored in self.inbreeding, which replaces this
        method on the instance: it can be called only once per instance.
        Kept as is because the callers read `self.inbreeding` as a list.
        '''
        nb = len(self.parents)

        # parents as indexes in reversed order (oldest person first)
        P = [[nb - 1 - self.sosa.index(parent) for parent in parents]
             for parents in self.parents]
        P.reverse()

        A = np.zeros((nb, nb))

        for i in range(nb):
            # diagonal first: it is read when filling the rest of the line
            A[i, i] = 1.0
            if len(P[i]) == 2:
                id_p, id_q = P[i]
                A[i, i] += 0.5 * A[id_p, id_q]

            for j in range(i + 1, nb):
                for idx in P[j]:
                    A[i, j] += 0.5 * A[i, idx]

            # copy line to column (the matrix is symmetric)
            A[:, i] = A[i, :]

        # inbreeding coefficients are on the diagonal of matrix A;
        # reversed to get data from youngest person to oldest
        coefficients = 100 * (np.diag(A) - 1)

        self.inbreeding = coefficients[::-1].tolist()



In [55]:
# Build every derived structure from the parsed page.
# The order matters: each step reads attributes created by the previous ones.
mya = Ancestors(soup)
mya.build_sosas()      # creates mya.sosas and removes the duplicated entries
mya.build_parents()    # creates mya.parents (reads mya.sosas)
mya.build_ancetres()   # creates mya.graph / mya.ancetres (reads mya.parents)
mya.build_allsosa()    # creates mya.all_sosa / mya.all_base (reads mya.ancetres)
mya.count_unique()     # creates mya.count (reads mya.all_base)
# stores its result in mya.inbreeding, which then replaces the method itself
mya.inbreeding()


In [56]:
# sizes of the main lists: entries kept, groups of sosas, flattened sosas
for size in map(len, (mya.text, mya.sosas, mya.all_sosa)):
    print(size)

# same window of the two flattened lists, to compare a sosa with its base
window = slice(2470, 2550)
print(mya.all_sosa[window])
print(mya.all_base[window])

# first ten people: sosa, name and number of ancestors reached
for idx in range(10):
    sosa = mya.sosa[idx]
    print(sosa, mya.name[sosa], len(mya.ancetres[idx]))


2475
2475
4416
[96120645, 96120646, 96120647, 192241288, 192241289, 652, 1304, 1305, 2608, 2609, 2610, 2611, 5216, 5217, 5218, 5219, 5220, 5221, 10432, 10433, 10434, 10435, 10436, 10437, 10438, 10439, 10440, 10441, 10442, 10443, 20868, 20869, 20870, 20871, 20874, 20875, 20878, 20879, 41736, 41737, 41738, 41739, 41740, 41741, 41742, 41743, 83476, 83477, 83478, 83479, 83484, 83485, 83486, 83487, 166952, 166953, 166956, 166957, 166958, 166959, 166968, 166969, 166972, 166973, 333904, 333905, 333912, 333913, 333914, 333915, 333936, 333937, 333946, 333947, 667828, 667829, 667872, 667873, 653, 1306]
[96120645, 96120646, 96120647, 192241288, 192241289, 322, 644, 645, 1288, 1289, 1290, 1291, 2576, 2577, 2578, 2579, 2580, 2581, 5152, 5153, 5154, 5155, 5156, 5157, 5158, 5159, 5160, 5161, 5162, 5163, 8886, 8887, 10310, 10311, 10314, 10315, 10318, 10319, 17772, 17773, 17774, 17775, 17768, 17769, 20622, 20623, 35548, 35549, 35550, 35551, 41244, 41245, 41246, 41247, 71096, 71097, 71100, 71101, 71102,

In [113]:
# sosa (as stored in mya.sosa) of the person whose descent lines are shown
sosa_target = 278945

# positions of every occurrence of that person in the flattened lists
idxs = [idx for idx, val in enumerate(mya.all_base) if val == sosa_target]

# for each occurrence: the occurrence itself, then every descendant down to
# sosa 2 or 3 (sosa 1 is not included)
family = []
for idx in idxs:
    sosa = mya.all_sosa[idx]
    family.append(sosa)
    # integer division: child of sosa n is n // 2
    child = sosa // 2
    # `> 1` (and not `!= 1`): when sosa is 1 the child is 0 and the loop
    # would otherwise never end
    while child > 1:
        family.append(child)
        child //= 2


def get_polar_coor(sosa):
    '''
    Position of a sosa on a circular chart: the generation is used as the
    radius and the rank inside the generation as the angle.

    @param: int - sosa number (>= 1)
    @return: tuple - (x, y) coordinates
    '''
    radius = int(math.log2(sosa))
    first_of_ring = 2**radius
    # half the size of the generation: the angle covers a full turn
    half_ring = first_of_ring - 2**(radius-1)
    angle = (sosa-first_of_ring) * np.pi / half_ring
    x = radius * np.cos(angle)
    y = radius * np.sin(angle)
    return x, y


def wrap2lines(word):
    '''
    Wrap a text on 2 lines with <br> separator.

    The text is cut at the space closest to its middle (the first one when
    two are equally close); that space stays at the start of the second
    line. A text without any space is returned unchanged.

    @param: str
    @return: str
    '''
    space_positions = [pos for pos, char in enumerate(word) if char == ' ']
    if not space_positions:
        # nothing to cut on (min() of an empty list would raise)
        return word

    middle = len(word) // 2
    cut = min(space_positions, key=lambda pos: abs(pos - middle))
    return word[:cut] + '<br>' + word[cut:]

# build DataFrame to display scatter plot with all sosas
# (one row per entry of mya.all_sosa)

polar_xy = [get_polar_coor(s) for s in mya.all_sosa]
gen_of = [int(math.log2(s)) for s in mya.all_sosa]
child_of = [int(s/2) for s in mya.all_sosa]
angle_share = [360/(2**g) for g in gen_of]
listed_once = set(mya.sosa)

dfp2 = pd.DataFrame({
    # position on the circular chart
    "X": [x for x, _ in polar_xy],
    "Y": [y for _, y in polar_xy],
    # "unique": the sosa is one of the entries kept in mya.sosa
    "U": ["unique" if s in listed_once else "not unique" for s in mya.all_sosa],
    "G": gen_of,
    "R": list(mya.all_base),
    "S": list(mya.all_sosa),
    "Sbin": [bin(s) for s in mya.all_sosa],
    # labels / parent labels / values for the sunburst charts
    "Sun_L": [wrap2lines(str(s) + " - " + mya.getName(s)) for s in mya.all_sosa],
    "Sun_P": [wrap2lines(str(c) + " - " + mya.getName(c)) for c in child_of],
    "Sun_V": angle_share,
    "Child": child_of,
    "V": angle_share,
    # hover text and colour (number of occurrences of the person)
    "T": ["sosa: " + str(base) + " / " + mya.getName(base) + "<br>" + str(nb)
          for base, nb in zip(mya.all_base, mya.count)],
    "C": list(mya.count)
    })

# same layout for the descent lines of the targeted person
family_xy = [get_polar_coor(s) for s in family]

dfp3 = pd.DataFrame({
    "X": [x for x, _ in family_xy],
    "Y": [y for _, y in family_xy],
    "T": ["sosa: " + str(s) + " / " + mya.getName(s) for s in family],
    "C": [1 for _ in family]
    })

unique_rows = dfp2[dfp2["U"] == "unique"]

fig = make_subplots(rows=1, cols=2)

# left: unique people only
fig.add_trace(
    go.Scatter(x=unique_rows["X"], y=unique_rows["Y"],
               mode='markers', text=unique_rows["T"]),
    row=1, col=1
)

# right: everybody, coloured by number of occurrences
fig.add_trace(
    go.Scatter(x=dfp2["X"], y=dfp2["Y"], mode='markers', text=dfp2["T"],
               marker=dict(color=dfp2["C"])),
    row=1, col=2
)

# left again: descent lines of the targeted person drawn over the first trace
fig.add_trace(
    go.Scatter(x=dfp3["X"], y=dfp3["Y"], mode='markers', text=dfp3["T"],
               marker=dict(color=dfp3["C"])),
    row=1, col=1
)

# same scale on both axes; last expression of the cell, so the figure is shown
fig.update_yaxes(scaleanchor="x", scaleratio=1)


In [11]:

# build DataFrame to display scatter plot with all sosas
polar_xy = [get_polar_coor(s) for s in mya.all_sosa]

dfp = pd.DataFrame({
    "X": [x for x, _ in polar_xy],
    "Y": [y for _, y in polar_xy],
    "G": [int(math.log2(s)) for s in mya.all_sosa],
    "N": [mya.getName(base) for base in mya.all_base],
    "S": mya.all_sosa,
    "CC": mya.count,
    "RE": mya.all_base
    }).assign(
        # counts as text: gives one colour per value instead of a linear scale
        CC=lambda frame: frame["CC"].astype(str)
    )

fig = px.scatter(
    data_frame=dfp,
    x="X",
    y="Y",
    color="CC",
    hover_data=['N', 'S', 'RE', 'G'],
    width=800,
    height=800,
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()

In [159]:
from ipywidgets import interact
import ipywidgets as widgets


def draw_sun(gene, root, df):
    '''
    Show a sunburst chart of the ancestors of one person.

    Rows are kept when their generation is below `gene` and when the binary
    form of their sosa starts with the binary form of `root` (the sosa of an
    ancestor of n always starts with the binary digits of n).

    Only `df` is read (the global ancestor object is no longer needed to
    walk the rows).

    @param: int - generation limit (excluded), compared to column 'G'
    @param: int - sosa used as centre of the chart
    @param: DataFrame - columns 'G', 'Sbin', 'Sun_L', 'Sun_P' and 'Sun_V'
    '''
    in_range = df['G'] < gene
    in_branch = df['Sbin'].str.startswith(bin(root))

    # the first matching row is left out, as before
    # NOTE(review): presumably the root itself, whose parent label is not
    # part of the selection -- confirm
    selected = df[in_range & in_branch].iloc[1:]

    fig = go.Figure(go.Sunburst(
        labels=selected['Sun_L'],
        parents=selected['Sun_P'],
        values=selected['Sun_V'],
        branchvalues="total"
    ))
    fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    fig.show()

# input box for the sosa placed at the centre of the sunburst (passed as `root`)
myw1 = widgets.BoundedIntText(
    value=4,
    min=0,
    max=50000000,
    step=1,
    description='Sosa:',
    disabled=False
)

# slider for the generation limit (passed as `gene`); continuous_update=False
# is set so the value is not sent continuously while dragging
myw2 = widgets.IntSlider(
    value=7,
    min=0,
    max=30,
    step=1,
    description='Génération:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

# interact() uses the two widgets given as default values as the controls
@interact
def make_fig(generation=myw2, root=myw1):
    '''Draw the sunburst of dfp2 for the widget values.'''
    return draw_sun(generation, root, dfp2)



interactive(children=(IntSlider(value=7, continuous_update=False, description='Génération:', max=30), BoundedI…

In [13]:

# Table of the people appearing more than twice in the tree
# U: sosa, C: number of occurrences, G: generation, N: name,
# S: every sosa number under which the person appears
U, C, G, N, S = [], [], [], [], []

# every sosa grouped by its base sosa, built in one pass instead of
# scanning mya.all_base again for each person
sosas_by_base = {}
for base, sosa in zip(mya.all_base, mya.all_sosa):
    sosas_by_base.setdefault(base, []).append(sosa)

for sosa in mya.sosa:
    idx = mya.all_sosa.index(sosa)
    c = mya.count[idx]
    if c > 2:
        U.append(sosa)
        C.append(c)
        G.append(int(math.log2(sosa)))
        N.append(mya.getName(sosa))
        S.append(sosas_by_base.get(sosa, []))

newdf = pd.DataFrame({
    "sosa": U,
    "generation": G,
    "count": C,
    "name": N,
    "sosas": S
    })

fig = make_subplots(
    rows=1, cols=1,
    specs=[[{"type": "table"}]]
)

fig.add_trace(
    go.Table(
        header=dict(
            values=["sosa", "generation", "count", "name", "sosas"],
            align="center"
        ),
        cells=dict(
            values=[newdf[k].tolist() for k in newdf.columns],
            align="left"),
        # wider columns for the name and the list of sosas
        columnwidth=[1, 1, 1, 3, 8]
    ),
    row=1, col=1
)

fig.update_layout(
    height=600,
    showlegend=False,
    # title spelling fixed (was "Most several anecesters")
    title_text="Most frequent ancestors",
)

fig.show()


In [14]:
# scatter plot of the people kept in mya.sosa, coloured by inbreeding
polar_xy = [get_polar_coor(s) for s in mya.sosa]

df = pd.DataFrame({
    "x": [x for x, _ in polar_xy],
    "y": [y for _, y in polar_xy],
    "parents": mya.parents,
    "sosa": mya.sosa,
    "name": [mya.getName(s) for s in mya.sosa],
    "inbreeding": mya.inbreeding,
    # log scale for the colour; the small offset keeps log() defined for 0
    "inbreeding_color": [np.log(value+0.00001) for value in mya.inbreeding]
    })

fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="inbreeding_color",
    hover_data=['name', 'sosa', 'inbreeding'],
    width=800,
    height=800,
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()

In [32]:

# Statistics by generation and global charts

# generation of every unique person / of every sosa (duplicates included)
generations = [int(np.log2(sosa)) for sosa in mya.sosa]
gene_tot = [int(np.log2(sosa)) for sosa in mya.all_sosa]

# G: generations present, Th: theoretical number of ancestors (2**generation)
# U: unique people, T: total sosas, C: completion in percent (2 decimals)
G = sorted(int(generation) for generation in dfp2['G'].unique())
Th = [2**generation for generation in G]
U = [generations.count(generation) for generation in G]
T = [gene_tot.count(generation) for generation in G]
C = [int(10000*total/theoretical)/100 for total, theoretical in zip(T, Th)]

fig = make_subplots(
    rows=2, cols=2,
    specs=[[{"type": "xy"}, {"type": "violin"}], [{"type": "table"}, {"type": "sunburst"}]]
)

fig.add_trace(
    go.Table(
        header=dict(
            values=["generation", "unique nb", "total nb", "completion", "theoretical nb"],
            align="center"
        ),
        cells=dict(
            # one list per header column (the table used to show the
            # content of `newdf`, which belongs to another chart)
            values=[G, U, T, C, Th],
            align="left"),
        columnwidth=[1, 1, 1, 1, 1]
    ),
    row=2, col=1
)

fig.add_trace(go.Bar(name='Total', x=T, y=G, orientation='h'), row=1, col=1)
fig.add_trace(go.Bar(name='Unique', x=U, y=G, orientation='h'), row=1, col=1)
# NOTE(review): xaxis="x2" is given together with row/col -- check on which
# axis the completion curve is really drawn
fig.add_trace(go.Scatter(name='Completion', x=C, y=G, xaxis="x2"), row=1, col=1)

# distribution of generations: unique people (left) against all sosas (right)
fig.add_trace(go.Violin(y=dfp2["G"][dfp2['U'] == 'unique'],
                        side='negative', name="Unique",
                        line_color='blue', x0="G"), row=1, col=2)
fig.add_trace(go.Violin(y=dfp2["G"],
                        side='positive', name="Total",
                        line_color='orange', x0="G"), row=1, col=2)

# the sunburst only shows the generations below this limit
# (own name: G keeps holding the list of generations)
SUNBURST_MAX_GENERATION = 9
sunburst_rows = dfp2['G'] < SUNBURST_MAX_GENERATION

# the first selected row is left out of the three columns
fig.add_trace(go.Sunburst(
    labels=dfp2['S'][sunburst_rows][1:],
    parents=dfp2['Child'][sunburst_rows][1:],
    values=dfp2['V'][sunburst_rows][1:],
    branchvalues="total"
), row=2, col=2)

fig.update_xaxes(title="people #", row=1, col=1)
fig.update_yaxes(title="generation #", row=1, col=1)
fig.update_yaxes(title="generation #", row=1, col=2)

# single layout update (an earlier one was entirely overridden by this one)
fig.update_layout(
    height=700,
    width=900,
    showlegend=True,
    title_text="Global charts"
)

fig.show()
