# Signature mutations in patient samples

Notebook to assess the prevalence of the signature mutations defining the lineages B.1.1.7, P.1 and 501.V2 in all non-B.1.1.7, non-P.1 and non-501.V2 consensus sequences from GISAID obtained from clinical samples collected in Switzerland before December 24, 2020.

In [None]:
import pandas as pd
import yaml

## Input YAML file of the variant

In [None]:
# Read the variant definition. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
# NOTE(review): FullLoader is not the restricted loader; if this YAML file can
# ever come from an untrusted source, consider yaml.safe_load instead.
with open("../voc/br_mutations.yaml") as yaml_file:
    parsed_yaml_file = yaml.load(yaml_file, Loader=yaml.FullLoader)

In [None]:
# Merge all mutation sections of the variant definition into one mapping.
# Sections are applied in the order "mut", "extra", "subset", "shared", so a
# later section overrides an earlier one when both define the same key.
# The "mut" section is copied first so that the parsed YAML structure itself is
# not modified, and a missing "mut" section yields an empty mapping.
dict_mut = dict(parsed_yaml_file.get("mut") or {})
for section in ("extra", "subset", "shared"):
    section_mutations = parsed_yaml_file.get(section)
    if section_mutations is not None:
        dict_mut.update(section_mutations)

### Collect Swiss patient samples from sars_cov_2-ETHZ-database 

In [None]:
from getpass import getpass

import psycopg2

db_host = "id-hdb-psgr-cp61.ethz.ch"
db_name = "sars_cov_2"
db_user = input("Enter username for database " + db_name + ":\n")
# getpass keeps the password from being echoed into the notebook output.
db_password = getpass("Enter password for user " + db_user + ":\n")

# Connect to database. Keyword arguments are used instead of a hand-built
# connection string so that quotes or spaces in the credentials cannot break
# (or alter) the connection parameters.
try:
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        host=db_host,
        password=db_password,
    )
except Exception as e:
    raise Exception("I am unable to connect to the database.", e) from e

# Fetch all Swiss GISAID sequences. The cursor is closed by its context
# manager and the connection is closed in `finally`, so neither leaks if the
# query fails.
try:
    with conn.cursor() as cursor:
        cursor.execute(
            "select gs.strain, gs.date, gs.date_str, gs.division, gs.pangolin_lineage, gs.originating_lab, gs.submitting_lab, gs.date_submitted, gs.aligned_seq from gisaid_sequence gs where gs.country ='Switzerland'"
        )
        data = cursor.fetchall()
finally:
    conn.close()

### Summarize data in dataframe df

In [None]:
# Column labels for the fetched rows, one per field of each record in `data`.
gisaid_columns = [
    "strain",
    "date",
    "date_str",
    "division",
    "pangolin_lineage",
    "originating_lab",
    "submitting_lab",
    "date_submitted",
    "aligned_seq",
]

# One row per fetched record.
df = pd.DataFrame(data, columns=gisaid_columns)

### Exclude sequences that are already identified as UK, SA or BR variants from the analysis

In [None]:
# Keep only rows whose assigned lineage is none of the three variants.
# NOTE(review): "501.V2" may not be the label used in pangolin_lineage for the
# SA variant (its Pango name is B.1.351) - confirm against the database.
lineage = df["pangolin_lineage"]
is_other_lineage = (
    (lineage != "B.1.1.7")  # UK variants
    & (lineage != "501.V2")  # SA variants
    & (lineage != "P.1")  # BR variants
)
df = df[is_other_lineage]

### Analysis is only performed on sequences with a sample date before 24 Dec 2020

In [None]:
# Parse the sample dates once, store them as a column and use them to drop
# later samples. The comparison is inclusive (<=).
sample_dates = pd.to_datetime(df["date"])
df["date_datetime"] = sample_dates
df = df[sample_dates <= "2020-12-24"]

### Number of samples analysed 

In [None]:
# Number of rows (samples) left after filtering
len(df)

### Prevalence of the signature mutations of the XX lineage summarized in df_counts

In [None]:
def parse2colname(mut_pos, mut):
    """Build the column label used for one signature mutation.

    mut_pos: position of the mutation
    mut:     mutation string; "REF>ALT" for a substitution, a string starting
             with "-" for a deletion or with "+" for an insertion
    return:  "del_<mut_pos>" for a deletion, "insertion" for an insertion
             (independent of the position), "<REF><mut_pos><ALT>" otherwise
    """
    parts = mut.split(">")

    # Only a string without ">" can denote a deletion or an insertion.
    if len(parts) == 1:
        first_char = parts[0][0]
        if first_char == "-":
            return f"del_{mut_pos}"
        if first_char == "+":
            return "insertion"

    return f"{parts[0]}{mut_pos}{parts[1]}"

In [None]:
def count_mutations(sequence):
    """
    Check one sequence for every signature mutation listed in dict_mut.

    sequence: sequence as string
    return:   dict with one 0/1 entry per mutation column name (as given by
              parse2colname), plus
              "sum":    number of mutations found in the sequence
              "co_occ": string form of the list of found mutation names if
                        more than one was found, otherwise "[]"
    TODO: case for insertions (they are always reported as 0)
    """
    mutation_flags = {}

    for position, change in dict_mut.items():
        column = parse2colname(position, change)

        # positions in dict_mut count from one, string indices from zero
        start = position - 1

        if change.startswith("-"):
            # deletion: the sequence must show the same run of "-" here
            found = sequence[start : start + len(change)] == change
        elif change.startswith("+"):
            # TODO : count insertions
            found = False
        else:
            # substitution: accept the variant as written or in lower case
            alt = change.split(">")[1]
            found = sequence[start : start + len(alt)] in (alt, alt.lower())

        mutation_flags[column] = int(found)

    # mutations found in the current consensus sequence
    present = [name for name, flag in mutation_flags.items() if flag == 1]
    n_found = sum(mutation_flags.values())

    mutation_flags["sum"] = n_found
    # a co-occurrence needs at least two mutations in the same sequence
    if n_found > 1 and len(present) > 1:
        mutation_flags["co_occ"] = str(present)
    else:
        mutation_flags["co_occ"] = str([])

    return mutation_flags

In [None]:
# define the dataframe where we count the observed mutations
# Columns of df_counts: sample metadata first, then one 0/1 column per
# signature mutation. parse2colname can return the same label for several
# mutations (every insertion maps to "insertion"); dict.fromkeys drops such
# repeats while keeping the order, so no duplicate column labels are created.
columns_df_counts = list(
    dict.fromkeys(
        ["id", "division", "date", "sum", "co_occ", "n_sample"]
        + [parse2colname(mut_pos, dict_mut[mut_pos]) for mut_pos in dict_mut]
    )
)

# Go through each consensus sequence of the clinical samples and check if the
# mutations are found. The rows are collected in a list and the dataframe is
# built once at the end: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
rows_df_counts = []
for _, row in df.iterrows():
    info_dict = {
        "id": row["strain"],
        "division": row["division"],
        "date": row["date"],
        "n_sample": 1,
    }
    info_dict.update(count_mutations(row["aligned_seq"]))
    rows_df_counts.append(info_dict)

df_counts = pd.DataFrame(rows_df_counts, columns=columns_df_counts)

In [None]:
# Show the samples in which at least one signature mutation was found
has_signature_mutation = df_counts["sum"] > 0
df_counts[has_signature_mutation]

### Prevalence of the XX signature mutations before and after 2020-10-23

In [None]:
cut_off_date = "2020-10-23"

# Split the samples into those dated before the cut-off ("early") and those
# dated on or after it ("late").
df_counts["date_datetime"] = pd.to_datetime(df_counts["date"])
df_counts_before_oct = df_counts[df_counts["date_datetime"] < cut_off_date]
df_counts_after_oct = df_counts[df_counts["date_datetime"] >= cut_off_date]

print("# early seq ", df_counts_before_oct.shape[0])
print("# late seq ", df_counts_after_oct.shape[0])


def _relative_freq(counts, mutation):
    """Return the percentage of rows in `counts` that carry `mutation`.

    Returns NaN when `counts` has no rows, so an empty early or late subset
    does not cause a division by zero.
    """
    n_rows = counts.shape[0]
    if n_rows == 0:
        return float("nan")
    return float(counts[mutation].sum()) / n_rows * 100


# Column labels of the signature mutations, without repeats (parse2colname
# gives every insertion the same label).
list_mutations = list(
    dict.fromkeys(parse2colname(mut_pos, dict_mut[mut_pos]) for mut_pos in dict_mut)
)

# One row per mutation; the frame is built in one go because DataFrame.append
# was removed in pandas 2.0.
df_counts_freq = pd.DataFrame(
    [
        {
            "mutation": mut,
            "abs_freq": df_counts[mut].sum(),
            "relativ_freq": _relative_freq(df_counts, mut),
            "abs_freq_early": df_counts_before_oct[mut].sum(),
            "abs_freq_late": df_counts_after_oct[mut].sum(),
            "relativ_freq_early": _relative_freq(df_counts_before_oct, mut),
            "relativ_freq_late": _relative_freq(df_counts_after_oct, mut),
        }
        for mut in list_mutations
    ],
    columns=[
        "mutation",
        "abs_freq",
        "relativ_freq",
        "abs_freq_early",
        "abs_freq_late",
        "relativ_freq_early",
        "relativ_freq_late",
    ],
)

df_counts_freq

In [None]:
# save dataframe
# df_counts_freq.to_csv('df_counts_freq.csv')

### Grouping the co-occurring mutations together

In [None]:
# Besides the per-mutation columns, these columns are dropped as well
list_mutations.extend(["id", "sum", "date_datetime"])

# Keep the samples whose "sum" is larger than one, without the dropped columns
multiple_hits = df_counts["sum"] > 1
df_co_occ = df_counts[multiple_hits].drop(columns=list_mutations)

In [None]:
# Make sure the co_occ column holds strings before it is used as a group key
df_co_occ = df_co_occ.astype({"co_occ": str})

In [None]:
# Sum the remaining columns per combination of mutations, division and date
co_occ_group_keys = ["co_occ", "division", "date"]
df_co_occ1 = df_co_occ.groupby(co_occ_group_keys).sum()
df_co_occ1

In [None]:
# save dataframe
# df_co_occ1.to_csv('df_co_occ.csv')