<a href="https://colab.research.google.com/github/benardt/genealogyKPI/blob/main/genealogy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Genealogy

In [1]:
# configuration data
#
# Settings consumed by the cells below:
#   login / password : typed into the Geneanet sign-in form
#   login_page       : URL opened to reach that sign-in form
#   page             : URL of the ancestry page that is downloaded and parsed
# NOTE(review): 'xxxx' are placeholders. Avoid saving real credentials in the
# notebook; consider reading them from os.environ or getpass instead.

my_config = {
    'login': 'xxxx',
    'password': 'xxxx',
    'login_page': 'https://www.geneanet.org/connexion/',
    'page': 'https://gw.geneanet.org/benardt_w?lang=fr&m=A&p=camille+marie+sylvie&n=benard&t=N&v=100'
}


In [2]:
# code linter
#
# Development aid: installs pycodestyle and nbpep8, then imports the `pep8`
# helper. nbpep8 is fetched from the TestPyPI index rather than the main PyPI.
# NOTE(review): neither install is version-pinned.

!pip install pycodestyle
!pip install --index-url https://test.pypi.org/simple/ nbpep8

from nbpep8.nbpep8 import pep8

# Add pep8(_ih) at the end of the code cell to see PEP8 analysis.

Looking in indexes: https://test.pypi.org/simple/


In [3]:
# install dependencies

%%capture
!pip install selenium  --quiet
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import math
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

In [4]:
# connection to Geneanet
#
# Signs the shared Selenium driver `wd` in, using the values stored in
# `my_config`.
from selenium.webdriver.common.by import By

# Open the sign-in page.
wd.get(my_config['login_page'])

# Locate the two form inputs by their element ids.
username = wd.find_element(By.ID, "_username")
password = wd.find_element(By.ID, "_password")

# Type the credentials from the configuration cell into the form.
username.send_keys(my_config['login'])
password.send_keys(my_config['password'])

# Submit by triggering the click from JavaScript on the submit element.
# NOTE(review): nothing waits for, or checks, the outcome of the login before
# the next cell loads the data page; confirm this is reliable.
element = wd.find_element(By.ID, '_submit')
wd.execute_script("arguments[0].click();", element)


In [5]:

# read the page
# Load the configured ancestry page in the browser session opened earlier.
wd.get(my_config['page'])

# Take the page HTML as UTF-8 bytes and parse it into `soup`.
# NOTE(review): no parser is passed to BeautifulSoup, so the parser used
# depends on which parser libraries are installed; consider naming it.
r = wd.page_source.encode("utf-8")
soup = BeautifulSoup(r)
# print(r)

In [6]:
class Ancestors:
    """Ancestor list read from a parsed ancestry page, plus derived indexes.

    Persons are identified by sosa numbers: the child of sosa ``n`` is
    ``n // 2``. A person who appears through several branches is kept once
    under a *base* sosa; the other numbers that designate the same person are
    handled as duplicates ("doublon").

    The build steps depend on each other and are meant to be called in this
    order::

        a = Ancestors(soup)
        a.build_sosas()
        a.build_parents()
        a.build_ancetres()
        a.build_allsosa()
        a.count_unique()
        a.inbreeding()
    """

    # Maps the ``style`` attribute of a person's <li> to the flag stored in
    # ``self.leaf``.
    my_dict_leaf = {None: 1, 'list-style-type:disc': 0}

    def __init__(self, soup):
        """Collect every person listed under a "Génération" item of ``soup``.

        @param soup: parsed page exposing ``find_all`` (BeautifulSoup-like)

        Fills ``self.leaf``, ``self.text`` and ``self.sosa`` (parallel lists,
        one entry per listed line) and ``self.name`` (sosa -> link text).
        """
        self.graph = {}

        self.leaf = []
        self.text = []
        self.name = {}
        self.sosa = []

        for li0 in soup.find_all("li"):
            if "Génération" not in li0.text:
                continue
            ul = li0.find("ul")
            if ul is None:
                # A matching item without a nested list holds no person.
                continue
            for li in ul.find_all("li"):
                self.leaf.append(self.my_dict_leaf[li.get('style')])

                name = li.find_all("a")[0].text
                text = li.text.replace(u'\xa0', u'')
                sosa = self.get_sosa(text)

                self.name[sosa] = name
                self.text.append(text)
                self.sosa.append(sosa)

    @staticmethod
    def _generation_of(sosa):
        """Return floor(log2(sosa)) using exact integer arithmetic.

        @raise ValueError: if ``sosa`` is smaller than 1
        """
        sosa = int(sosa)
        if sosa < 1:
            raise ValueError("sosa numbers start at 1, got %d" % sosa)
        return sosa.bit_length() - 1

    def get_sosa(self, mytext, origin=False):
        """Extract a sosa number from the text of one listed line.

        Two layouts are recognised: ``"<sosa> - ..."`` for a regular entry and
        ``"<sosa> => <origin>"`` for an entry that points to another sosa.

        @param mytext: str - text of the line
        @param origin: bool - when True return the origin sosa (0 for a
                       regular entry) instead of the line's own sosa
        @return: int
        @raise ValueError: if the line matches neither layout or a number
                           cannot be converted
        """
        if ' - ' in mytext:
            sosa = mytext.split(' - ')[0]
            sosa_o = '0'
        elif ' => ' in mytext:
            parts = mytext.split(' => ')
            sosa, sosa_o = parts[0], parts[1]
        else:
            raise ValueError("cannot read a sosa number from %r" % mytext)

        # Both numbers are converted up front so a malformed line is reported
        # regardless of which one is requested.
        numbers = (int(sosa), int(sosa_o))
        return numbers[1] if origin else numbers[0]

    def bfs(self, node):
        '''
        Breadth-first walk of ``self.graph`` starting at ``node``.

        self.graph: dict - all nodes, child key -> list of parent keys
        @param node: str - starting node (de-cujus)
        @return: list of int - sosa numbers relative to ``node`` (which is 1),
                 in visiting order; each person is visited once
        '''
        visited = {node}
        queue = [node]
        rel = [1]          # rel[i] is the relative sosa of queue[i]

        head = 0
        while head < len(queue):
            current, sosa = queue[head], rel[head]
            head += 1

            for parent in self.graph[current]:
                if parent not in visited:
                    visited.add(parent)
                    queue.append(parent)
                    # the parity of the parent's own number selects which of
                    # the two parent slots (2n or 2n+1) it takes
                    rel.append(2 * sosa + int(parent) % 2)

        return rel

    def getSosa_real(self, sosa, p):
        '''
        give real sosa from root sosa (sosa)
        and relative sosa (p)

        @raise ValueError: if ``p`` is smaller than 1
        '''
        p = int(p)
        cur_gen = self._generation_of(p)
        q = p - 2**cur_gen
        return sosa * 2**cur_gen + q

    def build_sosas(self):
        """Group the sosa numbers that designate the same person.

        Builds ``self.sosas``: one list per person, base sosa first, followed
        by the numbers of the lines that point to it. Lines that point to
        another sosa are then removed from ``self.leaf``, ``self.text`` and
        ``self.sosa`` so all lists stay parallel.
        """
        self.sosas = []

        for text, sosa in zip(self.text, self.sosa):

            sosa_o = self.get_sosa(text, True)

            if sosa_o == 0:
                self.sosas.append([sosa])
            else:
                # placeholder keeps the lists aligned until the cleanup below
                self.sosas.append(0)
                idx = self.sosa.index(sosa_o)
                self.sosas[idx].append(sosa)

        # Entries that point to another sosa are not unique persons:
        # remove them from every parallel list (from the end, so the
        # remaining indexes stay valid).
        idxs = [idx for idx, val in enumerate(self.sosas) if val == 0]

        for index in sorted(idxs, reverse=True):
            del self.leaf[index]
            del self.text[index]
            del self.sosa[index]
            del self.sosas[index]

    def getName(self, sosa):
        '''
        return name of person identified by sosa number

        @param: int - sosa number (base or duplicate)
        @return: str - name of sosa
        '''
        # Resolve through this instance, not through a notebook global.
        idx = self.all_sosa.index(sosa)
        sosa_r = self.all_base[idx]
        return self.name[sosa_r]

    def build_allsosa(self):
        """Expand duplicates to the ancestors they imply.

        Builds three parallel lists: ``self.all_sosa`` (every sosa number),
        ``self.all_base`` (base sosa of the person behind it) and
        ``self.all_type`` ('not doublon' for listed numbers, 'doublon' for the
        numbers added here).

        @raise RuntimeError: if some ancestors stay unresolved and a full pass
                             adds nothing (this situation used to loop forever)
        """
        # Flatten list "self.sosas"
        self.all_sosa = [sosa for sosas in self.sosas for sosa in sosas]
        self.all_base = [sosas[0] for sosas in self.sosas for sosa in sosas]
        self.all_type = ['not doublon' for sosas in self.sosas for _ in sosas]

        # sosa -> base lookup mirroring all_sosa/all_base (first match wins)
        base_of = {}
        for sosa, base in zip(self.all_sosa, self.all_base):
            base_of.setdefault(sosa, base)

        while True:
            unresolved = 0
            added = 0
            pairs = zip(reversed(self.sosas), reversed(self.ancetres))
            for sosas, ancetres in pairs:
                # real sosa numbers of the ancestors, once per number the
                # person is known under
                ancs = [[self.getSosa_real(so, sr) for sr in ancetres]
                        for so in sosas]

                # All real sosas are in ancs[0] (= ancetres of sosas[0])
                reels = []
                for so in ancs[0]:
                    if so in base_of:
                        reels.append(base_of[so])
                    else:
                        unresolved += 1
                        reels.append(None)

                for ancss in ancs:
                    for sosa_anc, reel in zip(ancss, reels):
                        if reel is not None and sosa_anc not in base_of:
                            self.all_sosa.append(sosa_anc)
                            self.all_type.append('doublon')
                            self.all_base.append(reel)
                            base_of[sosa_anc] = reel
                            added += 1

            if unresolved == 0:
                break
            if added == 0:
                # The next pass would be identical: stop instead of hanging.
                raise RuntimeError(
                    "%d ancestor number(s) could not be resolved" % unresolved)

    def build_parents(self):
        """Build ``self.parents``: for each person, the base sosas of its parents."""
        self.parents = [[] for _ in self.sosas]

        for sosas in self.sosas[1:]:
            # person with index 0 is sosa 1
            # and no children for sosa 1

            for sosa in sosas:
                # find the child of the current sosa in self.sosa
                # and record the current person as one of its parents
                child = sosa // 2
                idx = self.sosa.index(child)
                self.parents[idx].append(sosas[0])

    def build_ancetres(self):
        """Build ``self.graph`` and ``self.ancetres``.

        ``self.ancetres[i]`` is the result of :meth:`bfs` started at
        ``self.sosa[i]``.
        """
        # start by making graph; only one or two parents are linked
        for s, ps in zip(self.sosa, self.parents):
            if len(ps) in (1, 2):
                self.graph[str(s)] = [str(parent) for parent in ps]
            else:
                self.graph[str(s)] = []

        self.ancetres = [self.bfs(str(sosa)) for sosa in self.sosa]

    def generation(self, idx):
        """Return the generation (floor(log2(sosa))) of ``self.sosa[idx]``."""
        return self._generation_of(self.sosa[idx])

    def length(self):
        """Return the number of persons kept (length of ``self.leaf``)."""
        return len(self.leaf)

    def count_unique(self):
        '''
        Number of times each sosa is repeated (number of duplicates for the
        same person). Stored in ``self.count``, parallel to ``self.all_base``.
        '''
        occurrences = {}
        for base in self.all_base:
            occurrences[base] = occurrences.get(base, 0) + 1
        self.count = [occurrences[base] for base in self.all_base]

    def inbreeding(self):
        """Compute one inbreeding value (in percent) per person.

        The result is stored in ``self.inbreeding``, parallel to
        ``self.sosa``. NOTE: storing it under that name replaces this method
        on the instance, so it can only be called once per object.
        """
        nb = len(self.parents)

        # Work on the persons in reversed order; parent positions are
        # translated to that order as well.
        P = [[nb - 1 - self.sosa.index(parent) for parent in parents]
             for parents in reversed(self.parents)]

        A = np.zeros((nb, nb))

        for i in range(nb):
            A[i, i] = 1.0
            if len(P[i]) == 2:
                id_p, id_q = P[i][0], P[i][1]
                A[i, i] += 0.5 * A[id_p, id_q]

            for j in range(i + 1, nb):
                for idx in P[j]:
                    A[i, j] += 0.5 * A[i, idx]

            # copy line to column (the matrix is symmetric)
            A[:, i] = A[i, :]

        # inbreeding coefficients are on diagonal of matrix A;
        # reverse to get data back in the order of self.sosa
        INBREEDINGS = [100*(val-1) for val in A.diagonal()]
        INBREEDINGS.reverse()

        self.inbreeding = INBREEDINGS



In [7]:
# Parse the page, then run the build steps. Their order matters: each step
# uses attributes created by the previous ones.
mya = Ancestors(soup)

_build_steps = (
    mya.build_sosas,
    mya.build_parents,
    mya.build_ancetres,
    mya.build_allsosa,
    mya.count_unique,
    mya.inbreeding,
)
for _step in _build_steps:
    _step()


In [8]:
# Sanity checks on the structures built above.
print(len(mya.text))
print(len(mya.sosas))
print(len(mya.all_sosa))

# Same window of the two parallel lists: sosa numbers and their base sosa.
print(mya.all_sosa[1000:1200])
print(mya.all_base[1000:1200])

# First persons: sosa number, name, and size of the ancestor list
# (the person itself included). Bounded so short trees do not raise.
for idx in range(min(10, len(mya.sosa))):
    print(mya.sosa[idx], mya.name[mya.sosa[idx]], len(mya.ancetres[idx]))


2475
2475
4416
[2231, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 4488, 2251, 4489, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2268, 2269, 2272, 2273, 2274, 2275, 2276, 2277, 2278, 2279, 2288, 2289, 2290, 2291, 2296, 2297, 2298, 2299, 2300, 2301, 2308, 2309, 2310, 2311, 2320, 2321, 2330, 2331, 2334, 2335, 2348, 2349, 2428, 2429, 2430, 2431, 2432, 2433, 2434, 2435, 2440, 3048, 2441, 3049, 2448, 2449, 2472, 2473, 2476, 2477, 2480, 2481, 2486, 2487, 2498, 2499, 2500, 2501, 2508, 2509, 2510, 2511, 2528, 2529, 2546, 2547, 2560, 2561, 2562, 2563, 2564, 2565, 2566, 2567, 2568, 2569, 2572, 2573, 2574, 2575, 2576, 2577, 2578, 2579, 2580, 2581, 2584, 2585, 2586, 2587, 2588, 2589, 2590, 2591, 2592, 2593, 2594, 2595, 2600, 2601, 2604, 2605, 2606, 2607, 2616, 2617, 2618, 2619, 2620, 2621, 2622, 2623, 2624, 2630, 2625, 2631, 2626, 5286, 2627, 5287, 2628, 2629, 2632, 2633, 2634, 2635, 2636, 2637, 2638, 2639, 2640, 2641, 2642, 2643, 2644, 

In [26]:
# Person to highlight, identified by base sosa number.
sosa_target = 278945

# Positions of every sosa number (base or duplicate) designating that person.
idxs = [idx for idx, val in enumerate(mya.all_base) if val == sosa_target]

# For each of those numbers, collect the number itself and the chain of its
# descendants down to (but excluding) sosa 1.
family = []
for idx in idxs:
    sosa = mya.all_sosa[idx]
    family.append(sosa)
    child = sosa // 2
    # `> 1` rather than `!= 1`: when the target is sosa 1 the child is 0 and
    # the loop must not start.
    while child > 1:
        family.append(child)
        child //= 2


def get_polar_coor(sosa):
    '''
    Place a sosa number on a circular chart.

    The radius is the generation (floor(log2(sosa))) and the angle grows with
    the position of the sosa inside its generation.

    @param: int - sosa number (>= 1)
    @return: tuple - (x, y) cartesian coordinates
    @raise ValueError: if sosa is smaller than 1
    '''
    sosa = int(sosa)
    if sosa < 1:
        raise ValueError("sosa must be >= 1, got %d" % sosa)
    # bit_length is exact for any int, unlike a float log2
    generation = sosa.bit_length() - 1
    n_total = (2**generation - 2**(generation-1))
    phi = (sosa-2**generation) * np.pi / n_total
    return generation * np.cos(phi), generation * np.sin(phi)

# Two-panel scatter figure:
#   left  - unique persons, with the highlighted family drawn on top
#   right - every sosa number, coloured by how often the person appears
def _polar_xy(sosas):
    """Return the list of x and the list of y given by get_polar_coor."""
    points = [get_polar_coor(number) for number in sosas]
    return [pt[0] for pt in points], [pt[1] for pt in points]


_x_unique, _y_unique = _polar_xy(mya.sosa)
dfp1 = pd.DataFrame({
    "X": _x_unique,
    "Y": _y_unique,
    "T": [f"{sosa} / {mya.getName(sosa)}" for sosa in mya.sosa],
})

_x_all, _y_all = _polar_xy(mya.all_sosa)
dfp2 = pd.DataFrame({
    "X": _x_all,
    "Y": _y_all,
    "R": list(mya.all_base),
    "T": [
        f"sosa: {base} / {mya.getName(base)}<br>{times}"
        for base, times in zip(mya.all_base, mya.count)
    ],
    "C": list(mya.count),
})

_x_family, _y_family = _polar_xy(family)
dfp3 = pd.DataFrame({
    "X": _x_family,
    "Y": _y_family,
    "T": [f"sosa: {sosa} / {mya.getName(sosa)}" for sosa in family],
    "C": [1 for _ in family],
})

_trace_unique = go.Scatter(
    x=dfp1["X"], y=dfp1["Y"], mode='markers', text=dfp1["T"])
_trace_all = go.Scatter(
    x=dfp2["X"], y=dfp2["Y"], mode='markers', text=dfp2["T"],
    marker={"color": dfp2["C"]})
_trace_family = go.Scatter(
    x=dfp3["X"], y=dfp3["Y"], mode='markers', text=dfp3["T"],
    marker={"color": dfp3["C"]})

fig = make_subplots(rows=1, cols=2)
for _trace, _column in ((_trace_unique, 1), (_trace_all, 2), (_trace_family, 1)):
    fig.add_trace(_trace, row=1, col=_column)

# Same scale on both axes so the generations are drawn as circles.
# Left as the last expression so the notebook displays the figure.
fig.update_yaxes(scaleanchor="x", scaleratio=1)


In [10]:

# Scatter plot of every sosa number (duplicates included), coloured by the
# number of times the underlying person appears.
_all_points = [get_polar_coor(sosa) for sosa in mya.all_sosa]

# "CC" is cast to str so plotly uses a discrete palette (not a linear scale).
dfp = pd.DataFrame({
    "X": [point[0] for point in _all_points],
    "Y": [point[1] for point in _all_points],
    "N": list(map(mya.getName, mya.all_base)),
    "S": mya.all_sosa,
    "CC": mya.count,
    "RE": mya.all_base,
}).astype({"CC": str})

fig = px.scatter(
    data_frame=dfp,
    x="X",
    y="Y",
    color="CC",
    hover_data=['N', 'S', 'RE'],
    width=800,
    height=800,
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()

In [11]:

def wrap2lines(word):
    '''
    Wrap a text on 2 lines with a <br> separator.

    The break is inserted before the space closest to the middle of the text
    (the first one when two are equally close). A text without any space is
    returned unchanged.

    @param: str
    @return: str
    '''
    perfect_idx = len(word) // 2
    possible_idxs = [pos for pos, char in enumerate(word) if char == ' ']
    if not possible_idxs:
        # nothing to break on: min() below would fail on an empty list
        return word
    final_idx = min(possible_idxs, key=lambda x: abs(x-perfect_idx))
    return word[:final_idx] + '<br>' + word[final_idx:]


# Sunburst chart of the known ancestors of `root`.
N = 7                 # number of ancestor generations drawn around the root
L, P, V = [], [], []  # labels, parent labels, values: parallel lists
root = 4

if root not in mya.all_sosa:
    print("individu do not exist")

_known = set(mya.all_sosa)

# root node
if root in _known:
    L.append(wrap2lines(str(root) + " - " + mya.getName(root)))
    P.append(None)
    V.append(360)

# other nodes
for sosa_rel in range(2, 2**(N+1)):
    sosa = mya.getSosa_real(root, sosa_rel)
    child = sosa // 2
    if sosa in _known and child in _known:
        generation = int(math.log2(sosa_rel))
        L.append(wrap2lines(str(sosa) + " - " + mya.getName(sosa)))
        P.append(wrap2lines(str(child) + " - " + mya.getName(child)))
        # The value is appended together with its label and parent, so the
        # three lists stay aligned when some ancestors are unknown.
        V.append(360/(2**generation))

fig = go.Figure(go.Sunburst(
    labels=L,
    parents=P,
    values=V,
    branchvalues="total"
))

# Left as the last expression so the notebook displays the figure.
fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))


In [12]:

# Table of the persons who appear more than twice in the tree, with every
# sosa number under which they appear.
U, C, G, N, S = [], [], [], [], []

for sosa in mya.sosa:
    c = mya.count[mya.all_sosa.index(sosa)]
    if c <= 2:
        continue
    U.append(sosa)
    C.append(c)
    G.append(int(math.log2(sosa)))
    N.append(mya.getName(sosa))
    S.append([
        number
        for number, base in zip(mya.all_sosa, mya.all_base)
        if base == sosa
    ])

newdf = pd.DataFrame({
    "sosa": U,
    "generation": G,
    "count": C,
    "name": N,
    "sosas": S,
})

_table = go.Table(
    header={
        "values": ["sosa", "generation", "count", "name", "sosas"],
        "align": "center",
    },
    cells={
        "values": [newdf[column].tolist() for column in newdf.columns],
        "align": "left",
    },
    columnwidth=[1, 1, 1, 3, 8],
)

fig = make_subplots(rows=1, cols=1, specs=[[{"type": "table"}]])
fig.add_trace(_table, row=1, col=1)
fig.update_layout(
    height=600,
    showlegend=False,
    title_text="Most several anecesters",
)

fig.show()


In [13]:
# Scatter plot of the unique persons, coloured by their inbreeding value.
_unique_points = [get_polar_coor(sosa) for sosa in mya.sosa]

df = pd.DataFrame({
    "x": [point[0] for point in _unique_points],
    "y": [point[1] for point in _unique_points],
    "parents": mya.parents,
    "sosa": mya.sosa,
    "name": list(map(mya.getName, mya.sosa)),
    "inbreeding": mya.inbreeding,
})

fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="inbreeding",
    hover_data=['name', 'sosa'],
    width=800,
    height=800,
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()

In [14]:
# Table with, per generation: number of unique persons, number of sosa
# numbers (duplicates included) and size of a complete generation.
generations = [int(np.log2(sosa)) for sosa in mya.sosa]
gene_tot = [int(np.log2(sosa)) for sosa in mya.all_sosa]

# distinct generations, in order of first appearance
G = list(dict.fromkeys(generations))

Th = [2**generation for generation in G]
U = [generations.count(generation) for generation in G]
T = [gene_tot.count(generation) for generation in G]

newdf = pd.DataFrame({
    "generation": G,
    "unique nb": U,
    "total nb": T,
    "theoritical nb": Th,
})

_table = go.Table(
    header={
        "values": ["generation", "unique nb", "total nb", "theoritical nb"],
        "align": "center",
    },
    cells={
        "values": [newdf[column].tolist() for column in newdf.columns],
        "align": "left",
    },
    columnwidth=[1, 1, 1, 1],
)

fig = make_subplots(rows=1, cols=1, specs=[[{"type": "table"}]])
fig.add_trace(_table, row=1, col=1)
fig.update_layout(
    height=600,
    showlegend=False,
    title_text="Data by generation",
)

fig.show()