<a href="https://colab.research.google.com/github/benardt/genealogyKPI/blob/main/genealogy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Genealogy

## Configuration file

In [None]:
# configuration data
# NOTE: the 'xxx' values are placeholders for the Geneanet credentials that
# the Geneanet scraping cell types into the login form; avoid committing
# real values with the notebook.

my_config = {
    'login': 'xxx',
    'password': 'xxx',
    # page opened first by the Geneanet scraping cell
    'login_page': 'https://www.geneanet.org/connexion/',
}


## Modules and dependencies

In [None]:
%%capture
!apt-get update
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt install ./google-chrome-stable_current_amd64.deb
!echo $PATH
!google-chrome --product-version
!pip install seleniumbase

In [None]:
import math
import numpy as np
import pandas as pd
import io, zipfile, re

from bs4 import BeautifulSoup

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from _plotly_utils.importers import relative_import

from tqdm.notebook import tqdm

In [None]:
# code linter

!pip install pycodestyle
!pip install --index-url https://test.pypi.org/simple/ nbpep8

from nbpep8.nbpep8 import pep8

# Add pep8(_ih) at the end of the code cell to see PEP8 analysis.

## Main class

In [None]:

class Ancestors:
    """     Global class    """

    def get_polar_coordinates(self, sosa):
        """Convert a sosa number to polar coordinates.

        The radius is the generation of the individual (0 for sosa 1) and
        the angle locates the individual inside its generation.

        Args:
            sosa (int): The sosa number (must be >= 1).

        Returns:
            tuple (int, float): generation (used as radius) and angle phi
            in radians.

        Raises:
            ValueError: If ``sosa`` is smaller than 1.
        """
        sosa = int(sosa)
        if sosa < 1:
            raise ValueError("sosa number must be >= 1")

        # generation = floor(log2(sosa)). bit_length() is exact for any
        # integer, whereas int(np.log2(sosa)) rounds up for large sosas
        # sitting just below a power of two.
        generation = sosa.bit_length() - 1
        # half the size of the generation, so that the 2**generation
        # positions of one generation are spread over a full turn
        n_total = 2**generation - 2**(generation - 1)
        # angle from the rank of the sosa inside its generation
        phi = (sosa - 2**generation) * np.pi / n_total

        return generation, phi


    def get_coordinates(self, sosa):
        """Sosa number to coordinates

        returns the x and y coordinates of the individual from polar coordinates
        based on the his sosa number

        Args:
            param1 (int): The sosa number.

        Returns:
            tuple (float, float): x and y coordinates.

        """
        radius, phi = self.get_polar_coordinates(sosa)
        return radius * np.cos(phi), radius * np.sin(phi)

    def bfs(self, node_id):
        """Build family graph with BFS algorithm

        breadth-first search algorithm for finding all ancestors of a given node
        in a family tree

        @param: str  - node: starting node id (de-cujus)
        """
        # visited = []   # List to keep track of visited nodes.
        # visited.append(node)
        # Initialize an empty queue and adding starting node
        queue = []
        queue.append(node_id)
        # Initialize an empty queue and adding root sosa is sosa = 1
        sosas = []
        sosas.append(1)

        # maps the sosa to its corresponding node
        li = {}

        while queue:
            child_sosa = sosas.pop(0)
            child_id = queue.pop(0)
            li[child_sosa] = child_id

            print("\rBFS: " + str(len(li)), end="")
            # idx = 0 or 1
            #  + index=0 is sir (ie man)
            #  + index=1 is dam (ie woman)
            for idx, parent_id in enumerate(self.id_graph[child_id]):
                # visited is not used because :
                #   - a parent can have several child
                #   - a parent can be multiple ancestor
                # if parent not in visited:
                #     visited.append(parent)
                if parent_id != None:
                    parent_sosa = 2 * child_sosa + idx
                    sosas.append(parent_sosa)
                    queue.append(parent_id)

        return li

    def soup_parser(self, soup):
        """Parse an ascendancy web page into the family graph.

        Every ``<li>`` whose text contains "Génération" is expected to hold
        a ``<ul>`` whose items have one of these forms:

        * ``"<sosa> - <name>"``: an individual,
        * ``"<sosa> - ? ?"``: an unknown individual (ignored),
        * ``"<sosa> => <sosa1>"``: a sosa that is the same individual as the
          already listed ``sosa1`` (multiple ancestor).

        Fills ``self.ids``, ``self.id_names`` and ``self.id_graph``
        (child: [sire, dam]); individuals are identified by their sosa.

        Args:
            soup (BeautifulSoup): complete web page
        """
        print("\nSoup parser: Started!")

        # Raw strings: '\s', '\d', '\?' are invalid escapes in plain literals.
        # A single character that is neither a digit, '^' nor '-' and is
        # enclosed by digits is treated as a thousands separator.
        # NOTE: flagged by the original author as not matching correctly.
        re_separator = re.compile(r'(?<=\d)[^\d^-](?=\d+)')
        re_unknown = re.compile(r'^([0-9]+)\s-\s\?\s\?$')
        re_person = re.compile(r'^([0-9]+)\s-\s(.*)')
        re_same_as = re.compile(r'^([0-9]+)\s=>\s([0-9]+)$')

        texts = []
        for generation_item in soup.find_all("li"):
            if "Génération" not in generation_item.text:
                continue
            ul = generation_item.find("ul")
            if ul is None:
                # item mentioning a generation without a nested list
                continue
            for li in ul.find_all("li"):
                # remove white space thousands separator
                texts.append(re_separator.sub('', li.text))

        print("Soup parser - Number of lines to be processed: ", len(texts))

        # Get all individuals by sosa.
        # self.ids[k] is a sosa number and id_childs[k] the individual
        # (lowest sosa) it stands for.
        id_childs = []
        id_with_noname = []
        for string in texts:
            unknown = re_unknown.match(string)
            if unknown:
                id_with_noname.append(int(unknown.group(1)))
                continue
            person = re_person.match(string)
            if person:
                sosa = int(person.group(1))
                self.ids.append(sosa)
                id_childs.append(sosa)
                self.id_names[sosa] = person.group(2)
            same_as = re_same_as.match(string)
            if same_as:
                self.ids.append(int(same_as.group(1)))
                id_childs.append(int(same_as.group(2)))

        # Build graph - dict child:[sir,dam]
        # Lookup tables replace repeated `in list` / list.index() scans.
        first_index = {}
        for position, sosa in enumerate(self.ids):
            first_index.setdefault(sosa, position)
        individuals = set(id_childs)

        max_elements = len(id_childs)
        for sosa, _ in tqdm(zip(self.ids, id_childs), total=max_elements, desc="Soup parser - Graph of parents being created..."):
            # work only with sosas that are individuals themselves
            if sosa not in individuals:
                continue
            # add a parent only if its sosa is listed
            idx_dad = first_index.get(sosa * 2)
            idx_mom = first_index.get(sosa * 2 + 1)
            husb = id_childs[idx_dad] if idx_dad is not None else None
            wife = id_childs[idx_mom] if idx_mom is not None else None
            self.id_graph[sosa] = [husb, wife]

        # Remove parent with name '? ?'
        unknown_ids = set(id_with_noname)
        for parents in self.id_graph.values():
            for idx, parent in enumerate(parents):
                if parent in unknown_ids:
                    parents[idx] = None

        print("Soup parser: Done!")


    def parse_gedcom_line(self, line):
        """Parse a GEDCOM line into its fields.

        Args:
            line (str): a single line from a GEDCOM file

        Returns:
            tuple: ``(level, tag, pointer_source, pointer_target, value)``.
            ``level`` is the string of digits found at the start of the
            line; pointers are returned without their surrounding "@".
            Fields absent from the line are None (a pointer group can also
            be an empty string). All five items are None when the line does
            not match.
        """
        # Raw strings are required: "\d", "\s" and "\S" are invalid escape
        # sequences in a plain string literal.
        pattern_ged_line = (
            r"^"
            # one or more digits: the level
            r"(?P<level>\d+)"
            # optional "@...@" pointer before the tag (record identifier)
            r"(\s+(?P<pointer_source>@\S+@|))?"
            # mandatory tag made of uppercase letters
            r"(?:\s+(?P<tag>[A-Z]+)){1}"
            # optional value; the lookahead forbids a value starting with "@"
            r"(?:\s+(?!@)(?P<value>.+))?"
            # optional "@...@" pointer after the tag (reference to a record)
            r"(\s+(?P<pointer_target>@\S+@|))?"
            # together with "^", forces the entire line to be matched
            r"$"
        )

        mymatch = re.match(pattern_ged_line, line)
        if mymatch is None:
            return None, None, None, None, None

        ps = mymatch.group("pointer_source")
        pt = mymatch.group("pointer_target")
        pointer_source = ps.strip("@") if ps is not None else None
        pointer_target = pt.strip("@") if pt is not None else None

        return (mymatch.group("level"), mymatch.group("tag"),
                pointer_source, pointer_target, mymatch.group("value"))

    def file_GED_parser(self, file_string):
        """Parse GEDCOM file

        """
        # Initialize variables to hold individual and family data
        individuals = {}
        families = {}

        # Iterate over each line in the GEDCOM data and parse it
        current_tag = None
        current_pointer = None

        # Initialize flag to skip the header
        skip_header = True

        file_array = file_string.splitlines(True)

        tags = {"BIRT": "birth", "DEAT": "death", "HUSB": "husband", "WIFE": "wife"}

        for line in file_array:
            level, tag, pointer_source, pointer_target, value = self.parse_gedcom_line(line)

            # Skip the header
            if tag == "INDI" and skip_header==True:
                skip_header = False
            if skip_header:
                continue

            if tag == "INDI":
                current_tag = "INDI"
                current_pointer = pointer_source
                individuals[current_pointer] = {"name": None, "sex": None, "birth": {}, "death": {}, "fams": [], "famc": None}
            elif tag == "NAME":
                individuals[current_pointer]["name"] = value
            elif tag == "SEX":
                individuals[current_pointer]["sex"] = value
            elif tag == "BIRT":
                current_tag = "BIRT"
            elif tag == "DEAT":
                current_tag = "DEAT"
            elif tag == "FAMS":
                individuals[current_pointer]["fams"].append(pointer_target)
            elif tag == "FAMC":
                individuals[current_pointer]["famc"] = pointer_target
            elif tag == "DATE":
                if current_tag in ["BIRT", "DEAT"]:
                    individuals[current_pointer][tags[current_tag]]["date"] = value
            elif tag == "PLAC":
                if current_tag in ["BIRT", "DEAT"]:
                    individuals[current_pointer][tags[current_tag]]["date"] = value
            elif tag == "FAM":
                current_tag = "FAM"
                current_pointer = pointer_source
                families[current_pointer] = {"husband": None, "wife": None, "marriage": {}, "children": []}
            elif tag in ["HUSB", "WIFE"]:
                families[current_pointer][tags[tag]] = pointer_target
            elif tag == "MARR":
                current_tag = "MARR"
            elif tag == "CHIL":
                families[current_pointer]["children"].append(pointer_target)

        return individuals, families


    def file_parser(self, file_string):
        """Parse GEDCOM file

        """

        file_array = file_string.splitlines(True)

        # get family
        pattern_indi = "^0\s@I[0-9]+@\sINDI$"
        pattern_name = "^1\sNAME\s"
        pattern_sexe = "^1\sSEX\s"
        pattern_fam = "^0\s@F[0-9]+@\sFAM$"
        pattern_chil = "^1 CHIL @I[0-9]+@$"
        pattern_husb = "^1 HUSB @I[0-9]+@$"
        pattern_wife = "^1 WIFE @I[0-9]+@$"
        chi = []
        sir = []
        dam = []

        # Get all families
        for string in file_array:
            if re.match(pattern_fam, string):
                husb, wife = None, None
            if re.match(pattern_husb, string):
                husb = re.search(r'^1 HUSB @I([0-9]+)@$', string).group(1)
            if re.match(pattern_wife, string):
                wife = re.search(r'^1 WIFE @I([0-9]+)@$', string).group(1)
            if re.match(pattern_chil, string):
                chil = re.search(r'^1 CHIL @I([0-9]+)@$', string).group(1)
                chi.append(int(chil))
                sir.append(int(husb) if husb != None else None)
                dam.append(int(wife) if wife != None else None)

        # get all individuals
        is_indi = 1
        for string in file_array:
            if re.match(r'^0\s', string) and is_indi == 0:
                self.ids.append(id)
                self.id_names[id] = nom
                is_indi = 1
            if re.match(pattern_indi, string):
                id = int(re.search(r'^0\s@I([0-9]+)@\sINDI$', string).group(1))
                nom = ""
                is_indi = 0
            if re.match(pattern_name, string):
                nom = re.search(r'^1\sNAME\s(.*)$', string).group(1)


        # Add father and mother to all individuals
        for id in self.ids:
            if id in chi:
                idx = chi.index(id)
                self.id_graph[id] = [sir[idx], dam[idx]]
            else:
                self.id_graph[id] = [None, None]



    def __init__(self, soup=None, file_string='', mode='connect'):

        self.ids = []
        # dict - id (int): name (str)
        self.id_names = {}
        # dict - child: [sir, dam]
        self.id_graph = {}

        if mode == "connect":
            print('=> Geneanet parser')
            if soup:
                self.soup_parser(soup)
            else:
                print("Error: No data")
        elif mode == "file":
            print('=> File parser')
            self.file_parser(file_string)
        else:
            print('Parser error')

        # ids
        # id_names
        # id_graph

    def getName(self, sosa):
        """

        return name of person identified by sosa number

        @param: int - sosa number
        @return: str - name of sosa
        """

        if sosa != 0:
            ids = self.sosa2ids[sosa]
            name = self.id_names[ids]
        else:
            name = "None"
        return name

    def calculate_harmonic_sum(self, n):
        """Return 1/2 + 1/4 + ... + 1/2**(n-1) as a float (0.0 when n <= 1)."""
        accumulated = 0.0
        exponent = 1
        while exponent < n:
            accumulated += 1.0 / (2 ** exponent)
            exponent += 1
        return accumulated

    def build_sosas(self, souche_id):
        """

        Build 4 lists:
            sosas{}:          dict - all sosas (with implex) and unique sosas
            individual_occurrence[]:
                                list - number of occurence for individual (implex)


        """

        # 'all' and 'unique_all' work together, len('all') = len('unique_all')
        # -> 'all' no duplicate
        # -> 'unique_all' some duplicates, sosas from the same indiviual are
        #    replaced by the lower sosa number



        print("\nBuild sosas: Breadth-first search starting!")
        self.sosa2ids = self.bfs(souche_id)
        print("   BFS end!")

        self.sosas = {}

        self.sosas['all'] = np.array(list(self.sosa2ids.keys()), dtype=int)
        self.generation = np.log2(self.sosas['all']).astype(int)
        self.generation[0] = 1
        self.n_total = 2**self.generation - 2**(self.generation - 1)
        self.generation[0] = 0
        # angle phi based on the sosa number and the total number of
        # individuals in the generation
        phi = (self.sosas['all'] - 2**self.generation) * np.pi / self.n_total
        vcalculate_harmonic_sum = np.vectorize(self.calculate_harmonic_sum)
        phi_offset = 0.5 * np.pi * vcalculate_harmonic_sum(self.generation)
        print(vcalculate_harmonic_sum([0,1,2,3,4,5]))
        print(180 * phi_offset[:10] / np.pi)
        self.coord_x = self.generation * np.cos(phi - phi_offset)
        self.coord_y = self.generation * np.sin(phi - phi_offset)


        all_ids = np.array(list(self.sosa2ids.values()), dtype=int)

        unique_ids = np.unique(all_ids)
        all_length = len(self.sosas['all'])

        self.individual_occurrence = np.empty(all_length, dtype=int)
        for idx, sosa in tqdm(enumerate(self.sosas['all']), total=all_length, desc ="Multiple ancestors processing"):
            id = self.sosa2ids[sosa]
            self.individual_occurrence[idx] = np.count_nonzero(all_ids == id)

        self.sosas['unique_all'] = np.empty(all_length, dtype=int)
        for idx, id in tqdm(enumerate(all_ids), total=all_length, desc ="Unique all processing"):
            sosas_id = self.sosas['all'][all_ids==id]
            self.sosas['unique_all'][idx] = np.min(sosas_id)

        self.sosas['unique_only'] = np.unique(self.sosas['unique_all'])

        self.unique = mya.sosas['unique_all'] == mya.sosas['all']


    def build_parents(self):
        """
        Convert the `id_graph` dictionary, which maps a unique ID to a list of its
        parents' IDs, to a list of lists that maps each unique sosa number to its
        parents' sosa numbers.

        Returns:
            None.

        Parameters:
            self (object): An instance of a class with the following attributes:
                - `sosa2ids` (dict): A dictionary mapping unique sosa numbers to
                their corresponding unique IDs.
                - `sosas` (dict): A dictionary mapping different types of sosa
                numbers to lists of sosa numbers.
                - `id_graph` (dict): A dictionary mapping unique IDs to lists of
                two IDs representing the parents of the individual with that ID.

        Notes:
            Assumes that there are no missing sosa numbers in `self.sosas['unique_only']`,
            and that every sosa number in `self.sosas['unique_only']` corresponds to a
            unique individual in the family tree represented by `self.id_graph`.

            source: id_graph{} dict - {child: [sir, dam], ...}
            target: self.parents[] list - [[], [sosa_sir2, sosa_dam2], [sosa_sir3], ...]

        """

        sosas = list(self.sosa2ids.keys())
        ids = list(self.sosa2ids.values())

        self.parents = [[] for _ in  self.sosas['unique_only']]

        # only parents of sosas are needed. Not all individuals in id_graph{}
        for idx, sosa in enumerate(self.sosas['unique_only']):
            id = self.sosa2ids[sosa]
            for parent in self.id_graph[id]:
                if parent != None:
                    self.parents[idx] += [sosas[ids.index(parent)]]


    def longest_ancestral_path(self, idx_parents):
        """Return 1D array of 'longest ancestral path'

        the "longest ancestral path" refers to the maximum number of
        generations that exist between an individual and their most distant
        known ancestor within a given genealogical tree or lineage.

        """

        nb_individuals = len(self.parents)
        queue_of_individuals = np.linspace(
            0, nb_individuals-1, nb_individuals, dtype=int).tolist()
        laps = np.zeros(nb_individuals, dtype=int) - 1

        while queue_of_individuals:
            for individual in queue_of_individuals:
                lap_of_parents = []
                if not self.parents[individual]:
                    laps[individual] = 0
                    queue_of_individuals.remove(individual)
                else:
                    for parent in idx_parents[individual]:
                        lap_of_parents.append(laps[parent])
                    if -1 not in lap_of_parents:
                        laps[individual] = max(lap_of_parents) + 1
                        queue_of_individuals.remove(individual)

        print("max pseudo generation: ", np.max(laps))

        return laps


    def inbreeding_preparation(self):

        idx_parents = []
        for current_parents in self.parents:
            current_idx_parents = []
            for parent in current_parents:
                idx_sosa = np.where(self.sosas['unique_only'] == parent)[0][0]
                current_idx_parents.append(idx_sosa)
            idx_parents.append(current_idx_parents)

        nb_parents = len(idx_parents)
        laps = self.longest_ancestral_path(idx_parents)

        # build 2D array for parents

        P = np.zeros((nb_parents, 4), dtype=int) - 1
        for individual, current_parents in enumerate(idx_parents):
            for idx_parent, single_parent in enumerate(current_parents):
                P[individual][idx_parent+1] = single_parent
            P[individual][3] = laps[individual]
            P[individual][0] = individual

        return P


    def inbreeding_fast(self, P):
        """Compute inbreeding with 'fast method'

        Individuals are processed by increasing value of column 3 of ``P``
        and, for each of them, contributions ``d[j] * l[j]**2`` are
        accumulated over the individual and the ancestors reached through
        columns 1 and 2.

        Args:
            P (numpy 2D int array): one row per individual; columns 1 and 2
                hold the row indices of the parents (-1 when missing) and
                column 3 the pseudo generation used for the processing
                order. Presumably the array returned by
                ``inbreeding_preparation`` - TODO confirm.

        Returns:
            numpy 1D float array: ``100 * f`` for every individual
            (presumably the inbreeding coefficient in percent - TODO
            confirm).
        """

        nb_parents = P.shape[0]

        max_lap = np.amax(P, axis=0)[3]

        # f is shifted by one: f[k + 1] belongs to individual k, so that a
        # missing parent (index -1) reads f[0]
        f = np.zeros((nb_parents+1), dtype=float)
        # l[j]: weight accumulated for ancestor j while processing one
        # individual (reset to 0 once used); d[j]: term computed for j from
        # the f values of its parents
        l = np.zeros((nb_parents), dtype=float)
        d = np.zeros((nb_parents), dtype=float)

        # value read for a missing parent
        f[0] = - 1

        for pseudo_generation in range(max_lap+1):

            # rows whose column 3 equals the current pseudo generation
            idx_indi_list = np.where(P[:, 3] == pseudo_generation)[0]

            for i in idx_indi_list:

                ANC_i = [i]  # ancestors still to be processed (may hold duplicates)
                fi = -1.0
                l[i] = 1.0

                # sir and dam at index i
                si, di = P[i, 1], P[i, 2]
                d[i] = 0.5 - 0.25 * (f[si+1] + f[di+1])

                while ANC_i:
                    # take the smallest index first
                    # NOTE(review): original comment says "youngest
                    # individual"; this assumes a lower index means a
                    # younger individual - confirm
                    j = min(ANC_i)
                    # sir and dam at index j
                    sj, dj = P[j, 1], P[j, 2]

                    # half of the weight of j is passed to each known parent
                    R = 0.5 * l[j]
                    if sj != -1:
                        ANC_i.append(sj)
                        l[sj] += R
                    if dj != -1:
                        ANC_i.append(dj)
                        l[dj] += R

                    fi += d[j]*l[j]**2
                    # reset so that a duplicate entry of j contributes 0
                    l[j] = 0.0

                    # removes a single occurrence of j
                    ANC_i.remove(j)

                f[i+1] = fi

        return 100*f[1:]


    def get_indices_lap(self, P):
        """Reorder individuals for the tabular inbreeding method.

        Individuals are sorted by increasing longest ancestral path
        (column 3 of ``P``); inside one value, rows are taken by decreasing
        index.

        Args:
            P (numpy 2D int array): pedigree table, column 3 holds the
                longest ancestral path of each row.

        Returns:
            tuple (list, list): ``new2old[new] = old`` and
            ``old2new[old] = new`` index mappings.

        Raises:
            ValueError: if some row has a negative value in column 3 and so
                cannot be placed.
        """
        max_lap = np.amax(P, axis=0)[3]
        nb_parents = P.shape[0]

        new2old = []
        for pseudo_gene in range(max_lap + 1):
            idx_indi_list = np.where(P[:, 3] == pseudo_gene)[0]
            new2old.extend(np.flip(idx_indi_list).tolist())

        if len(new2old) != nb_parents:
            raise ValueError("some individuals have no longest ancestral path")

        # invert the permutation in one pass instead of list.index() per row
        old2new = [0] * nb_parents
        for new_idx, old_idx in enumerate(new2old):
            old2new[old_idx] = new_idx

        return new2old, old2new


    def inbreeding_tabular(self, PA):
        """Compute inbreeding with 'tabular method'

        Fills a square matrix ``A`` (one row and column per individual, in
        the order given by ``get_indices_lap``) and reads the result on its
        diagonal.

        Args:
            PA (numpy 2D int array): pedigree table; columns 1 and 2 hold
                the row indices of the parents (-1 when missing).

        Returns:
            numpy 1D float array: ``100 * (diagonal of A - 1)``, put back in
            the row order of ``PA``.
        """

        nb_parents = PA.shape[0]

        new2old, old2new = self.get_indices_lap(PA)

        # identity matrix
        A = np.diag(nb_parents * [1.0])

        print("Diag matrix done!")

        # np.ndenumerate visits the cells row by row; only the (i, j)
        # position is used. The cells read below rely on this order.
        for (i, j), _ in tqdm(np.ndenumerate(A), total=nb_parents*nb_parents):
            # row of PA corresponding to column j
            j_p = new2old[j]
            if i == j:
                # diagonal: 1 + half the value between the two parents
                has_2_parents = PA[j_p][1] != -1 and PA[j_p][2] != -1
                if has_2_parents:
                    id_sir_dam = old2new[PA[j_p][1]], old2new[PA[j_p][2]]
                    A[i, j] += 0.5 * A[id_sir_dam]

            elif i < j:
                # upper triangle: half the sum of the values between i and
                # the known parents of j
                current_parents = []
                if PA[j_p][1] != -1:
                    current_parents.append(old2new[PA[j_p][1]])
                if PA[j_p][2] != -1:
                    current_parents.append(old2new[PA[j_p][2]])

                for idx_tab_parent in current_parents:
                    A[i, j] += 0.5 * A[i, idx_tab_parent]
                    # keep the matrix symmetric
                    A[j, i] = A[i, j]

        # flip matrix A to have youngest at bottom and left
        #A = np.flip(A)
        print('\nFirst step done!')

        # inbreeding coefficients are on diagonal of matrix A : 100 * (value - 1)
        f = 100 * (A.diagonal() - 1)

        # back to the row order of PA
        f_true = np.zeros((nb_parents), dtype=float)
        for i in range(0, nb_parents):
            j = new2old[i]
            f_true[j] = f[i]

        return f_true


    def inbreeding(self, mode='fast', n=4):
        """Compute inbreeding coefficients

        Compute inbreeding coefficient for all individuals from parents data.
        Two individuals can have the same parents. This is the starting point
        for implex.

        Returns:
            numpy 1D array: inbreeding coefficients

        """
        print("Number of individuals to be processed: ", len(self.parents))
        Z = self.inbreeding_preparation()

        if mode == 'fast':
            INBREEDINGS = self.inbreeding_fast(Z)
        elif mode == 'tabular':
            INBREEDINGS = self.inbreeding_tabular(Z)
        else:
            INBREEDINGS = [0]
            print('Error: Inbredding')


        df = pd.DataFrame({
            "x": self.coord_x[self.unique][:n],
            "y": self.coord_y[self.unique][:n],
            "parents": self.parents[:n],
            "sosa": self.sosas['unique_only'][:n],
            "name": [self.getName(sosa) + "<br>" + str(inbreed) for sosa, inbreed in zip(self.sosas['unique_only'][:n],INBREEDINGS[:n])],
            "inbreeding": INBREEDINGS[:n],
            "inbreeding_color": [np.log(inbreeding+0.00001) for inbreeding in INBREEDINGS[:n]]
            })

        return df





## Roglo scraping

In [None]:
# Get data by connecting on http://roglo.eu/roglo website
# Authentication: Digest
# NOTE(review): the user name and password are passed in the query string of
# a plain http URL below.


from seleniumbase import SB

with SB(browser="chrome", chromium_arg="--no-sandbox, --disable-dev-shm-usage") as sb:
    sb.open("http://roglo.eu/roglo?lang=fr;w=f;username=BenardT;password=xxxx")
    #sb.type("#_username", my_config['login'])
    #sb.type("#_password", my_config['password'])
    #sb.click("#_submit")
    #sb.open("https://www.geneanet.org/")
    #suffixe = '&m=A&t=N&v=50&lang=fr'
    #r2 = sb.get_page_source()
    #soup2 = BeautifulSoup(r2, "html.parser")
    #td_tag_list = soup2.find_all("a", attrs={"gaq-event": "show-souche"})
    #print(r2)
    # HTML of the page reached after opening the URL
    r = sb.get_page_source()

soup = BeautifulSoup(r, "html.parser")

# Root of the tree

print(soup)

## Geneanet scraping

In [None]:
# Get data by connecting on geneanet.org website
# Logs in with the values of my_config, looks for the link to the root of the
# tree on the home page, then downloads the ascendancy page of that root.

from seleniumbase import SB

with SB(browser="chrome", chromium_arg="--no-sandbox, --disable-dev-shm-usage") as sb:
    sb.open(my_config['login_page'])
    sb.type("#_username", my_config['login'])
    sb.type("#_password", my_config['password'])
    sb.click("#_submit")
    sb.open("https://www.geneanet.org/")
    # query string appended to the link of the root of the tree
    # NOTE(review): presumably selects the ascendancy list over 50
    # generations in French - confirm against the site
    suffixe = '&m=A&t=N&v=50&lang=fr'
    r2 = sb.get_page_source()
    soup2 = BeautifulSoup(r2, "html.parser")
    # links to the root of the tree; td_tag_list[0] raises IndexError when
    # the page holds no such link
    td_tag_list = soup2.find_all("a", attrs={"gaq-event": "show-souche"})
    print('https:'+td_tag_list[0]['href']+suffixe)
    sb.open('https:'+td_tag_list[0]['href']+suffixe)
    r = sb.get_page_source()

# page parsed later by Ancestors(soup, mode='connect')
soup = BeautifulSoup(r, "html.parser")

# Root of the tree (same search as above, on the home page)

td_tag_list = soup2.find_all("a", attrs={"gaq-event": "show-souche"})
print(td_tag_list[0]['href'])

## Instantiation and basic checks

In [None]:
# Get data from a local GEDCOM file (zip)
# Parser of GEDCOM file
def gedcom(zip_path='/content/drive/MyDrive/data/benardt_2023-01-09.zip',
           member='base.ged'):
    """Build an ``Ancestors`` instance from a zipped GEDCOM file.

    Args:
        zip_path (str): path of the zip archive.
        member (str): name of the GEDCOM file inside the archive.

    Returns:
        Ancestors: instance created with mode='file'.
    """
    # the context manager closes the archive, even when reading fails
    with zipfile.ZipFile(zip_path, 'r') as archive:
        file_string = archive.read(member).decode("utf-8")

    return Ancestors(file_string=file_string, mode='file')

# Case: ancestors downloaded from Roglo
def file(zip_path='/content/drive/MyDrive/data/roglo1.zip'):
    """Build an ``Ancestors`` instance from a zipped, saved web page.

    The first member of the archive is read as an HTML page.

    Args:
        zip_path (str): path of the zip archive.

    Returns:
        Ancestors: instance created with mode='connect' from the page.
    """
    # the context manager closes the archive, even when reading fails
    with zipfile.ZipFile(zip_path, 'r') as roglo_archive:
        first_member = roglo_archive.namelist()[0]
        roglo_file_string = roglo_archive.read(first_member).decode("utf-8")

    soup = BeautifulSoup(roglo_file_string, "html.parser")

    # same parser as for a page downloaded from the website
    return Ancestors(soup, mode='connect')


# Alternative source: GEDCOM file, with individual 288 as root
#mya = gedcom()
#mya.build_sosas(288)
# Current source: saved web page; individuals are identified by their sosa,
# so the root is 1
mya = file()
mya.build_sosas(1)
mya.build_parents()

print("Number of individuals: ", len(mya.id_graph))

# data aligned on the unique individuals
print("sosas unique", len(mya.sosas['unique_only']), mya.sosas['unique_only'])
print("parents", len(mya.parents), mya.parents)
print("---------")
# data aligned on all the sosas
print("sosas all", len(mya.sosas['all']), mya.sosas['all'])
print("generation", len(mya.generation), mya.generation)
print("count", len(mya.individual_occurrence), mya.individual_occurrence)
print("unique", len(mya.unique), mya.unique)


# sosa number and name of the first 11 sosas
# (raises IndexError when fewer than 11 sosas exist)
for idx in range(11):
    print(mya.sosas['all'][idx], mya.id_names[mya.sosa2ids[mya.sosas['all'][idx]]])


In [None]:
# Families parser does not work
# NOTE(review): `file_string` is not defined at module level in the visible
# code (it is a local variable of gedcom()), so this cell presumably raises
# NameError unless it is defined elsewhere - confirm.

print(mya.file_GED_parser(file_string)[0]['I2343'])
mya.file_GED_parser(file_string)[1]

In [None]:
import networkx as nx

# Directed graph of the unique individuals: one node per unique sosa and one
# edge from each individual to each of its known parents.
G = nx.DiGraph()
G.add_nodes_from(mya.sosas['unique_only'])

# mya.parents is aligned on mya.sosas['unique_only']
for idx, parents in enumerate(mya.parents):
    for parent in parents:
        G.add_edge(mya.sosas['unique_only'][idx], parent)

print(nx.degree(G))
print(nx.density(G))

pos=nx.spring_layout(G)
print(pos)

## Inbreedings

In [None]:
# table of the first 7000 unique individuals (n caps the number of rows)
dfi = mya.inbreeding(mode='fast', n=7000)

In [None]:
# Figure
# Two panels are declared; only the left one receives a trace (the heatmap
# of the right panel is commented out).
fig = make_subplots(rows=1, cols=2, column_titles=["Inbreeding coefficient for each individual [log scale]", "Matrix"])
# fig.add_trace(go.Heatmap(z=H),
#     row=1, col=2
# )

# one marker per individual, colored by the logarithm of its coefficient
fig.add_trace(go.Scatter(x=dfi["x"], y=dfi["y"], mode='markers',
    marker=dict(color=dfi["inbreeding_color"], size=4, colorscale="Viridis_r"),
    text=dfi["name"]),
    row=1, col=1
)

# same scale on both axes of each panel
fig.update_layout(
    yaxis=dict(scaleanchor = "x", scaleratio = 1),
    yaxis2=dict(scaleanchor = "x2", scaleratio = 1),
)

fig.show()

## Main scatter with sosas

### Unique vs All / Individuals occurrence

In [None]:

# build DataFrame to display scatter plot with all sosas
def wrap2lines(word):
    '''
    Wrap a text on 2 lines with a <br> separator

    The separator is inserted before the space closest to the middle of the
    text (the first one in case of a tie). A text without any space is
    returned unchanged.

    @param: str
    @return: str
    '''
    possible_idxs = [pos for pos, char in enumerate(word) if char == ' ']
    if not possible_idxs:
        # nothing to split on (min() of an empty list would raise)
        return word

    perfect_idx = len(word) // 2
    final_idx = min(possible_idxs, key=lambda x: abs(x - perfect_idx))
    return word[:final_idx] + '<br>' + word[final_idx:]

# build DataFrame to display scatter plot with all sosas
# One row per sosa number (all the columns are aligned on mya.sosas['all']).

dfp2 = pd.DataFrame({
    "X": mya.coord_x,
    "Y": mya.coord_y,
    "Generation": mya.generation,
    "Sosa": mya.sosas['all'],
    "Nb of sosas": mya.individual_occurrence,
    "Unique sosa": mya.sosas['unique_all'],
    "U": mya.unique,
    "Name": [mya.getName(sosa) for sosa in mya.sosas['unique_all']],
    "Sbin": [bin(sosa) for sosa in mya.sosas['all']],
    # label of the sosa and label of its child (sosa / 2); for sosa 1 the
    # child is 0, for which getName() returns "None"
    "Sun_L": [wrap2lines(str(sosa) + " - " + mya.getName(sosa)) for sosa in mya.sosas['all']],
    "Sun_P": [wrap2lines(str(int(sosa/2)) + " - " + mya.getName(int(sosa/2))) for sosa in mya.sosas['all']],
    # 360 divided by the number of positions of the generation
    "Sun_V": [360/(2**int(math.log2(sosa))) for sosa in mya.sosas['all']],
    "Child": [int(sosa/2) for sosa in mya.sosas['all']],
    "V": [360/(2**int(math.log2(sosa))) for sosa in mya.sosas['all']],
    # hover text: sosa, name and number of sosas of the individual
    "T": ["sosa: " + str(sosa) + " / " + mya.getName(sosa) + "<br>" + str(c) for sosa,c in zip(mya.sosas['all'],mya.individual_occurrence)],
    "C": [color for color in mya.individual_occurrence]
    })

# convert column in str to get palette color (not linear color)
dfp2["Nb of sosas_c"] = dfp2["Nb of sosas"].astype(str)

title2 = "Unique: " + str(mya.sosas['unique_only'].shape[0]) + " / All: " + str(mya.sosas['all'].shape[0])

fig = make_subplots(rows=1, cols=2, column_titles=["Occurence", title2])

# left panel: color = number of sosas of the individual
fig.add_trace(
    go.Scatter(x=dfp2["X"], y=dfp2["Y"], mode='markers', text=dfp2["T"],
                   marker=dict(color=dfp2["C"], colorscale="Viridis_r", size=4)),

    row=1, col=1
)

# right panel: color = 1 when the sosa is the lowest one of its individual
fig.add_trace(
    go.Scatter(x=dfp2["X"], y=dfp2["Y"], mode='markers', text=dfp2["T"],
                   marker=dict(color=1*dfp2["U"], colorscale="Bluered_r", size=4)),

    row=1, col=2
)

# same scale on both axes of each panel
# NOTE(review): no fig.show() in this cell - confirm the figure is displayed
fig.update_layout(
    yaxis=dict(scaleanchor = "x", scaleratio = 1),
    yaxis2=dict(scaleanchor = "x2", scaleratio = 1),
)



### Descendant way for a multiple ancestor

In [None]:


def get_all_childs(sosa_target):
    """Collect the descendant lines of an individual, one per occurrence.

    Every position where ``mya.sosas['unique_all']`` equals ``sosa_target``
    is treated as one occurrence of the individual; the entry of
    ``mya.sosas['all']`` at the same position is the sosa number of that
    occurrence. From each occurrence the sosa is halved repeatedly, which
    walks down the tree towards sosa 1 (the de cujus).

    Args:
        sosa_target (int): value looked up in ``mya.sosas['unique_all']``

    Returns:
        tuple (list, list): ``family`` holds each occurrence followed by its
        successive halves (sosa 1 excluded, duplicates kept when several
        lines share descendants); ``sosas_identical`` holds the sosa number
        of each occurrence.
    """

    family = []
    sosas_identical = []
    for unique_sosa, sosa in zip(mya.sosas['unique_all'], mya.sosas['all']):
        if unique_sosa != sosa_target:
            continue
        sosas_identical.append(sosa)
        family.append(sosa)
        # Integer floor division avoids the float round trip of int(sosa/2).
        child = int(sosa) // 2
        # Stop before sosa 1. Testing "> 1" rather than "!= 1" also ends the
        # loop when the occurrence itself is sosa 1 (its half is 0), which
        # previously looped forever.
        while child > 1:
            family.append(child)
            child //= 2

    return family, sosas_identical

# Ancestor to highlight; other values that have been tried here:
# 278944, 266370, 33296
sosa_target = 224551

descendants, identicals = get_all_childs(sosa_target)


def _sosa_points(sosas, color):
    """Build the plotting table (position, hover text, colour) for sosas."""
    return pd.DataFrame({
        "X": [mya.get_coordinates(s)[0] for s in sosas],
        "Y": [mya.get_coordinates(s)[1] for s in sosas],
        "T": [f"sosa: {s} / " + mya.getName(s) for s in sosas],
        "C": [color] * len(sosas),
    })


dfp3 = _sosa_points(descendants, 1)
dfp4 = _sosa_points(identicals, 0)

fig = make_subplots(
    rows=1, cols=2,
    column_titles=[
        "All descendants of " + mya.getName(sosa_target) + f" [{sosa_target}]",
        "Number of times a sosa is an ancestor",
    ],
)

# left panel, background layer: rows of dfp2 whose "U" flag is True
unique_rows = dfp2[dfp2["U"].eq(True)]
fig.add_trace(
    go.Scatter(
        x=unique_rows["X"], y=unique_rows["Y"], mode='markers',
        text=unique_rows["T"], name="all unique sosas",
    ),
    row=1, col=1,
)

# right panel: every row of dfp2, coloured by the "C" column
fig.add_trace(
    go.Scatter(
        x=dfp2["X"], y=dfp2["Y"], mode='markers', text=dfp2["T"],
        marker={"color": dfp2["C"]}, name="all sosas",
    ),
    row=1, col=2,
)

# drawn on top of the left panel: all the descendants of sosa_target
fig.add_trace(
    go.Scatter(
        x=dfp3["X"], y=dfp3["Y"], mode='markers', text=dfp3["T"],
        name="descendants", marker={"color": dfp3["C"]},
    ),
    row=1, col=1,
)

# drawn on top of the left panel: every occurrence of sosa_target itself
fig.add_trace(
    go.Scatter(
        x=dfp4["X"], y=dfp4["Y"], mode='markers', text=dfp4["T"],
        name="identical",
        marker={"color": dfp4["C"], "colorscale": "Viridis_r"},
    ),
    row=1, col=1,
)

# kept as the last expression of the cell so that the notebook displays the
# returned figure
fig.update_yaxes(scaleanchor="x", scaleratio=1)


### Sunburst

In [None]:
import ipywidgets as widgets

# Integer entry box; wired to the `root` argument of draw_sun.
myw1 = widgets.BoundedIntText(
    value=4, min=0, max=50000000, step=1,
    description='Sosa:', disabled=False,
)

# Slider wired to the `gene` argument of draw_sun. With
# continuous_update=False the value is only sent once the drag is released.
myw2 = widgets.IntSlider(
    value=7, min=0, max=30, step=1,
    description='Génération:', disabled=False,
    continuous_update=False, orientation='horizontal',
    readout=True, readout_format='d',
)

def draw_sun(gene, root):
    """Display a sunburst chart of one branch of the tree.

    Rows of the global ``dfp2`` table are kept when their 'Generation' is
    strictly below ``gene`` and their 'Sbin' string starts with
    ``bin(root)``, i.e. when the binary form of the sosa extends the binary
    form of ``root``. The chart uses the 'Sosa', 'Child', 'V' and 'Name'
    columns as labels, parents, values and hover text.

    Args:
        gene (int): exclusive upper bound on the 'Generation' column
        root (int): sosa number whose binary form selects the branch

    Returns:
        None: the figure is displayed with ``show()``
    """

    in_branch = (
        (dfp2['Generation'] < gene)
        & dfp2['Sbin'].str.startswith(bin(root))
    )
    # The first selected row is left out of the chart (presumably the root
    # itself, whose parent lies outside the selection - TODO confirm).
    branch = dfp2[in_branch].iloc[1:]

    sunburst = go.Figure(go.Sunburst(
        labels=branch['Sosa'],
        parents=branch['Child'],
        values=branch['V'],
        branchvalues="total",
        hovertext=branch['Name'],
    ))

    # Remove the margins of this chart. The layout update used to be applied
    # to the global `fig` of another cell, leaving the sunburst untouched.
    sunburst.update_layout(margin=dict(t=0, l=0, r=0, b=0))
    sunburst.show()

# Redraw the sunburst whenever a control changes; the dict keys are the
# parameter names of draw_sun.
out = widgets.interactive_output(draw_sun, dict(gene=myw2, root=myw1))
# Both controls side by side, followed by the chart output.
ui = widgets.HBox([myw1, myw2])
display(ui, out)



### Multiple ancestors table

In [None]:

# Table of the individuals found several times among the ancestors, with all
# the sosa numbers under which each of them appears.

# an individual is listed when its occurrence value is above
# mini_nbr_doublon - 1
mini_nbr_doublon = 3

U, C, G, N, S = [], [], [], [], []

for sosa in mya.sosas['unique_only']:
    # occurrence value stored at the first position of 'all' holding this sosa
    first_row = np.where(mya.sosas['all'] == sosa)[0][0]
    c = mya.individual_occurrence[first_row]
    if not c > mini_nbr_doublon - 1:
        continue
    U.append(sosa)
    C.append(c)
    G.append(int(math.log2(sosa)))
    N.append(mya.getName(sosa))
    # every entry of 'all' whose 'unique_all' counterpart is this sosa
    S.append([
        duplicate
        for reference, duplicate
        in zip(mya.sosas['unique_all'], mya.sosas['all'])
        if reference == sosa
    ])

newdf = pd.DataFrame({
    "sosa": U,
    "generation": G,
    "count": C,
    "name": N,
    "sosas": S,
})

fig = make_subplots(rows=1, cols=1, specs=[[{"type": "table"}]])

fig.add_trace(
    go.Table(
        # header cells are the column names of newdf, in order
        header={"values": list(newdf.columns), "align": "center"},
        cells={
            "values": [newdf[column].tolist() for column in newdf.columns],
            "align": "left",
        },
        # wider columns for the name and for the list of sosas
        columnwidth=[1, 1, 1, 3, 8],
    ),
    row=1, col=1,
)

fig.update_layout(
    height=600, showlegend=False, title_text="Most several anecesters",
)

fig.show()


## Global statistics

In [None]:

# Per-generation statistics: counts of unique and of all sosas, compared with
# the 2**generation slots of a full generation, then a 2x2 figure summing
# them up.

# generation (integer part of log2) of every unique sosa and of every sosa
generations_sosa = [int(np.log2(sosa)) for sosa in mya.sosas['unique_only']]
gene_tot = [int(np.log2(sosa)) for sosa in mya.sosas['all']]

generations = sorted(dfp2['Generation'].unique())

# Th: slots in a full generation, U: unique sosas, T: all sosas
Th = [2**generation for generation in generations]
U = [generations_sosa.count(generation) for generation in generations]
T = [gene_tot.count(generation) for generation in generations]
# C: T as a percentage of Th, truncated to two decimals
C = [int(10000*total/slots)/100 for total, slots in zip(T, Th)]
# I: U as a percentage of T, truncated to an integer
I = [int(100*unique/total) for unique, total in zip(U, T)]

newdf = pd.DataFrame({
    "generation": generations,
    "unique nb": U,
    "total nb": T,
    "Implex": I,
    "completion": C,
    "theoritical nb": Th,
})

fig = make_subplots(
    rows=2, cols=2,
    specs=[
        [{"type": "xy"}, {"type": "violin"}],
        [{"type": "table"}, {"type": "sunburst"}],
    ],
)

# top left: horizontal bars of the total and unique counts
fig.add_trace(
    go.Bar(name='Total', x=T, y=generations, orientation='h'), row=1, col=1)
fig.add_trace(
    go.Bar(name='Unique', x=U, y=generations, orientation='h'), row=1, col=1)

# the two percentage curves share the panel but each gets its own x axis
# (x5, x6) laid over the bar axis and drawn at the top
fig.add_trace(
    go.Scatter(name='Completition', x=C, y=generations), row=1, col=1)
fig.data[-1].update(xaxis='x5')
fig.update_layout(xaxis5=dict(anchor='y', overlaying='x', side='top'))

fig.add_trace(go.Scatter(name='Implexe', x=I, y=generations), row=1, col=1)
fig.data[-1].update(xaxis='x6')
fig.update_layout(xaxis6=dict(anchor='y', overlaying='x', side='top'))

# top right: split violin, rows flagged "U" on the negative side and all rows
# on the positive side
unique_generations = dfp2.loc[dfp2['U'].eq(True), "Generation"]
fig.add_trace(
    go.Violin(
        y=unique_generations, side='negative', name="Unique",
        scalegroup='Unique', line_color='blue', x0=0,
    ),
    row=1, col=2,
)
fig.add_trace(
    go.Violin(
        y=dfp2["Generation"], side='positive', name="Total",
        scalegroup='Total', line_color='orange', x0=0,
    ),
    row=1, col=2,
)

# bottom left: the statistics table, headed by the column names of newdf
fig.add_trace(
    go.Table(
        header={"values": list(newdf.columns), "align": "center"},
        cells={
            "values": [newdf[column].tolist() for column in newdf.columns],
            "align": "left",
        },
        columnwidth=[1] * 6,
    ),
    row=2, col=1,
)

# bottom right: sunburst of the rows below generation G, first row left out
G = 9
first_generations = dfp2[dfp2['Generation'] < G].iloc[1:]

fig.add_trace(
    go.Sunburst(
        labels=first_generations['Sosa'],
        parents=first_generations['Child'],
        values=first_generations['V'],
        branchvalues="total",
        hovertext=first_generations['Name'],
    ),
    row=2, col=2,
)

for top_col in (1, 2):
    fig.update_xaxes(title="people #", row=1, col=top_col)
for top_col in (1, 2):
    fig.update_yaxes(title="generation #", row=1, col=top_col)

fig.update_layout(
    height=800, width=900, showlegend=True, title_text="Global charts",
)

fig.show()


# Test JavaScript

In [None]:
%%javascript

// Output containers
// Two <div> elements ('graph' and 'message') are appended to the element
// whose id is "output-area", then looked up again through selectors.
// NOTE(review): "#output-area" is presumably the output container of the
// notebook cell - confirm for the front-end in use; querySelector returns
// null when no such element exists.

let my_output_area = document.querySelector("#output-area");

// container that receives the computed result
let iDiv1 = document.createElement('div');
iDiv1.id = 'graph';
my_output_area.appendChild(iDiv1);

// container that receives the closing message
let iDiv2 = document.createElement('div');
iDiv2.id = 'message';
my_output_area.appendChild(iDiv2);

// querySelector returns the first matching element in the document
let my_output_graph = document.querySelector("div#output-area div#graph");
let my_output_message = document.querySelector("div#output-area div#message");

// function

/**
 * Combine two values with `+` and format the result for display.
 *
 * @param a - left operand
 * @param b - right operand
 * @returns {string} the text 'Res = ' followed by the value of a + b
 */
function calc(a, b) {
    const total = a + b;
    return 'Res = ' + total;
}

// Print: write the result of calc(1, 3.3) into the 'graph' container and a
// closing marker into the 'message' container, both as plain text nodes.

my_output_graph.appendChild(document.createTextNode(calc(1, 3.3)));
my_output_message.appendChild(document.createTextNode('end --'));


# Bibliography

Georgelis, A. (2018). Multiperspective visualization of genealogy data.
https://www.diva-portal.org/smash/get/diva2:1242034/FULLTEXT01.pdf



Ball, R., & Cook, D. (2014, February). A family-centric genealogy visualization paradigm. In Proceedings of 14th Annual Family History Technology Workshop.
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.589.6435&rep=rep1&type=pdf


Köhle, D. Spatio-Temporal Genealogy Visualization with WorldLines.
https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.645.7667&rep=rep1&type=pdf



 http://www.aviz.fr/geneaquilts



 Calculation of inbreeding and relationship, the tabular method http://www.ihh.kvl.dk/htm/kc/popgen/genetics/4/5.htm

B Tier. Computing inbreeding coefficients quickly. Genetics Selection Evolution, 1990, 22 (4), pp.419-430. hal-00893856 https://hal.archives-ouvertes.fr/hal-00893856/


Meuwissen, T., Luo, Z. Computing inbreeding coefficients in large populations. Genet Sel Evol 24, 305-313 (1992). https://doi.org/10.1186/1297-9686-24-4-305 https://gsejournal.biomedcentral.com/counter/pdf/10.1186/1297-9686-24-4-305.pdf


Mehdi Sargolzaei and Hiroaki Iwaisaki, An Efficient Algorithm for Computing Inbreeding Coefficients in Large Populations, Japanese Journal of Biometrics, vol. 25, pp. 25-36 (2004) https://www.jstage.jst.go.jp/article/jjb/25/1/25_1_25/_pdf


COLLEAU, J., & SARGOLZAEI, M. (2008). A proximal decomposition of inbreeding, coancestry and contributions. Genetics Research, 90(2), 191-198. doi:10.1017/S0016672307009202 https://www.cambridge.org/core/services/aop-cambridge-core/content/view/168DF022E465AC477BB39465227CCDAD/S0016672307009202a.pdf/div-class-title-a-proximal-decomposition-of-inbreeding-coancestry-and-contributions-div.pdf


