In [1]:
import pandas as pd
import sqlalchemy as sa
import os
from pathlib import Path

In [2]:
paper_to_delete = pd.read_excel("./data_commit/delete_list.xlsx")

paper_to_delete

Unnamed: 0,publication_id,title,first_author,doi,resource_uri,doi_hyperlink,delete_reason
0,827,Polymorphic ventricular tachycardia and non-co...,"Pérez Díaz, P",10.1016/j.rccar.2019.09.008,,https://doi.org/10.1016/j.rccar.2019.09.008,Not in English
1,1096,Genetics of ion channels in sudden unexplained...,"Koo, S.H.",10.2174/1875692110806030185,https://www.embase.com/records?id=L354806040,https://doi.org/10.2174/1875692110806030185,Cannot access article
2,690,Catecholaminergic Polymorphic Ventricular Tach...,"Hasdemir, C",,https://pubmed.ncbi.nlm.nih.gov/18849218,,Cannot access article
3,406,Multigene panel testing for arrhythmias: Diagn...,"Johnston, T.",,https://www.embase.com/records?id=L72283837,,No specific RYR2 mutation mentioned
4,121,Fatal arrhythmias associated with genetic vari...,"Horie, M.",10.1007/s10840-018-0338-y,,,
5,1343,Novel mutations in arrhythmogenic right ventri...,"Pamuru, PR",10.4103/0971-6866.86182,,,


# Delete the papers from the database

1. Create a folder called `init` in the root of the project
2. Download the backups from Zenodo into the `init` folder:

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14056683.svg)](https://doi.org/10.5281/zenodo.14056683)

3. Start the docker container with the database (password/username: `postgres` unless you changed it)

In [3]:
from contextlib import contextmanager
from sqlalchemy import Engine, Connection, bindparam


@contextmanager
def get_engine():
    _engine: Engine | None = None

    try:
        _engine = sa.create_engine(
            "postgresql+psycopg://postgres:postgres@localhost:5432/postgres"
        )
        yield _engine
    except Exception as e:
        print("Error")
        raise e
    finally:
        if _engine:
            _engine.dispose()

# Get the Patients who are in the papers to delete

In [4]:
from sqlalchemy import text

with get_engine() as engine:
    qry = text(f"""
    SELECT * FROM publication
    WHERE publication_id IN :paper_ids
    """).bindparams(
        bindparam(
            "paper_ids", expanding=True
        )
    )

    papers_del = pd.read_sql(qry, engine, params={
        "paper_ids": paper_to_delete["publication_id"].tolist()})

papers_del

Unnamed: 0,publication_id,title,first_author,reference,doi,year,created_at,updated_at,publication_type_id,rob_publication_type_id
0,121,Fatal arrhythmias associated with genetic vari...,"Horie, M.",Horie M. Fatal arrhythmias associated with gen...,10.1007/s10840-018-0338-y,2018,2024-08-07 14:42:27.974705+00:00,2024-08-07 14:47:19.255467+00:00,,
1,1343,Novel mutations in arrhythmogenic right ventri...,"Pamuru, PR",,10.4103/0971-6866.86182,2011,2024-08-07 14:42:52.333719+00:00,2024-08-07 14:47:19.255467+00:00,,


In [5]:
with get_engine() as engine:
    qry = text("""
    SELECT 
        itp.individual_id,
        itp.publication_id,
        v.hgvs_string,
        ie.original_row
    FROM individual_to_publication itp
    JOIN individual_variant iv ON itp.individual_id = iv.individual_id
    JOIN variant v ON iv.variant_id = v.variant_id
    JOIN individual_original_excel_row ie ON itp.individual_id = ie.individual_id
    WHERE publication_id IN :paper_ids
    """).bindparams(
        bindparam(
            "paper_ids", expanding=True
        )
    )

    ind_to_papers_del = pd.read_sql(qry, engine, params={
        "paper_ids": paper_to_delete["publication_id"].tolist()})

ind_to_papers_del

Unnamed: 0,individual_id,publication_id,hgvs_string,original_row


# Delete the papers

Note since the original db had no delete cascades on the foreign keys we need to manually delete every row that is related to the paper, all the individuals related to the paper, etc... :(

In [6]:
from sqlalchemy import Sequence, Row


def get_id(lt_of_ids: Sequence[Row]):
    return [x[0] for x in lt_of_ids]


def delete_paper_and_related(paper_id: int, conn: Connection):
    individuals_ids = get_id(conn.execute(
        sa.text("""
        SELECT individual_id
        FROM individual_to_publication
        WHERE publication_id = :paper_id
        """),
        {"paper_id": paper_id}
    ).fetchall())

    print(f"Deleting paper {paper_id} with individuals {individuals_ids}")

    for individual_id in individuals_ids:
        delete_family_history(individual_id, conn)
        delete_variant_info(individual_id, conn)
        delete_others(individual_id, conn)

        # now can actually delete the individual
        conn.execute(sa.text(
            """
            DELETE FROM individual
            WHERE individual_id = :individual_id
            """
        ), {"individual_id": individual_id})

        print(f"Successfully deleted individual {individual_id}")

        print("-" * 10)

    unlink_pub(paper_id, conn)
    conn.execute(sa.text(
        """
        DELETE FROM publication
        WHERE publication_id = :paper_id
        """
    ), {"paper_id": paper_id})

    print(f"Successfully deleted paper {paper_id}")

    print("=" * 10)


def delete_family_history(individual_id: int, conn: Connection):
    fh_id = conn.execute(
        sa.text("""
        SELECT family_history_record_id, individual_id, condition_id
        FROM family_history_record
        WHERE individual_id = :individual_id
        """),
        {"individual_id": individual_id}
    ).fetchall()

    if not fh_id:
        print(f"Individual {individual_id} has no family history records")
        return

    print(
        f"Deleting family history records (record_id, individual_id, condition_id) {fh_id} ({len(fh_id)} records)")

    delete_family_member_history_record(get_id(fh_id), conn)

    conn.execute(sa.text(
        """
        DELETE FROM family_history_record
        WHERE individual_id = :individual_id
        """
    ), {"individual_id": individual_id})


def delete_family_member_history_record(fh_id: list[int], conn: Connection):
    fmh_id = conn.execute(
        sa.text(
            """
            SELECT family_history_record_id, kinship_name_id
            FROM family_member_history
            WHERE family_history_record_id IN :fh_id
            """
        ).bindparams(
            bindparam("fh_id", expanding=True)
        ),
        {"fh_id": fh_id}
    ).fetchall()

    if not fmh_id:
        print(
            f"Family history record {fh_id} has no individual family members with records")
        return

    print(
        f"The family history for this condition has {len(fmh_id)} individual members with records that will be deleted")

    conn.execute(sa.text(
        """
        DELETE FROM family_member_history
        WHERE family_history_record_id IN :fh_id
        """
    ).bindparams(
        bindparam("fh_id", expanding=True)
    ), {"fh_id": fh_id})

    print(f"Successfully deleted family member history records {fmh_id}")


def delete_variant_info(individual_id: int, conn: Connection):
    ivcl_deleted = conn.execute(sa.text(
        """
        DELETE FROM individual_variant_condition_link ivcl
        WHERE individual_id = :individual_id
        RETURNING individual_id, variant_id, condition_id
        """,
    ), {"individual_id": individual_id}).fetchall()

    print(f"Deleted individual variant condition link(s) {ivcl_deleted}")

    iv_deleted = conn.execute(
        sa.text(
            """
            DELETE FROM individual_variant iv
            WHERE individual_id = :individual_id
            RETURNING individual_id, variant_id
            """
        ),
        {
            "individual_id": individual_id
        }
    ).fetchall()

    print(f"Deleted individual variant(s) {iv_deleted}")


def delete_others(individual_id: int, conn: Connection):
    def _delete_others(sa_text: str, table_name: str):
        deleted = conn.execute(sa.text(sa_text),
                               {"individual_id": individual_id}).fetchall()
        print(f"From {table_name} deleted {deleted}")

    # conditions
    _delete_others(
        """
        DELETE FROM individual_condition ic
        WHERE individual_id = :individual_id
        RETURNING individual_id, condition_id
        """, "individual_condition"
    )

    # individual_treatment
    _delete_others(
        """
        DELETE FROM treatment_record it
        WHERE patient_id = :individual_id
        RETURNING patient_id, treatment_id
        """, "individual_treatment"
    )

    # org_excel_row
    _delete_others(
        """
        DELETE FROM individual_original_excel_row
        WHERE individual_id = :individual_id
        RETURNING individual_id
        """,
        "individual_original_excel_row"
    )

    # UNLINK PUBLICATION
    _delete_others(
        """
        DELETE FROM individual_to_publication
        WHERE individual_id = :individual_id
        RETURNING individual_id, publication_id
        """,
        "individual_to_publication"
    )


def unlink_pub(paper_id: int, conn: Connection):
    # unlink publication from other tables
    conn.execute(sa.text(
        """
        DELETE FROM publication_to_database
        WHERE publication_id = :paper_id
        """
    ), {"paper_id": paper_id})


for paper_id in paper_to_delete["publication_id"].tolist():
    with get_engine() as engine:
        with engine.connect() as conn:
            delete_paper_and_related(paper_id, conn)

            conn.commit()


Deleting paper 827 with individuals []
Successfully deleted paper 827
Deleting paper 1096 with individuals []
Successfully deleted paper 1096
Deleting paper 690 with individuals []
Successfully deleted paper 690
Deleting paper 406 with individuals []
Successfully deleted paper 406
Deleting paper 121 with individuals []
Successfully deleted paper 121
Deleting paper 1343 with individuals []
Successfully deleted paper 1343


In [7]:
with get_engine() as engine:
    qry = text(f"""
    SELECT * FROM publication
    WHERE publication_id IN :paper_ids
    """).bindparams(
        bindparam(
            "paper_ids", expanding=True
        )
    )

    papers_del_after = pd.read_sql(qry, engine, params={
        "paper_ids": paper_to_delete["publication_id"].tolist()})

papers_del_after

Unnamed: 0,publication_id,title,first_author,reference,doi,year,created_at,updated_at,publication_type_id,rob_publication_type_id


# DB dump the database

(do this in the terminal)

Recreate the database in docker

In [9]:
with get_engine() as engine:
    qry = text(f"""
    SELECT * FROM publication
    WHERE publication_id IN :paper_ids
    """).bindparams(
        bindparam(
            "paper_ids", expanding=True
        )
    )

    papers_del_after2 = pd.read_sql(qry, engine, params={
        "paper_ids": paper_to_delete["publication_id"].tolist()})

papers_del_after2

Unnamed: 0,publication_id,title,first_author,reference,doi,year,created_at,updated_at
