In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
def get_data(path="../OMSK_new.tsv"):

    data = pd.read_csv(
        path,
        sep="\t",
        names=[
            "strain",
            "strain2",
            "gisaid_epi_isl",
            "genbank_accession",
            "date",
            "region",
            "country",
            "division",
            "location",
            "region_exposure",
            "country_exposure",
            "division_exposure",
            "segment",
            "length",
            "host",
            "age",
            "sex",
            "Nextstrain_clade",
            "pangolin_lineage",
            "GISAID_clade",
            "originating_lab",
            "submitting_lab",
            "authors",
            "url",
            "?.1",
            "?.2",
            "Unnamed: 26",
            "minimap2, samtools",
            "Unnamed: 28",
            "Nasopharyngeal swab",
            "B.1.1.163.1",
            "Nanopore MinION",
            "GR.1",
            "None",
            "Original",
            "None.1",
            "Europe / Russia / Omsk / Omsk.1",
            "Unnamed: 37",
            "Female.1",
            "65.1",
            "unknown",
            "Unnamed: 41",
            "seq",
            "muts",
        ],
    )
    needed_cols = [
    "strain",
    "gisaid_epi_isl",
    "date",
    "region",
    "country",
    "division",
    "location",
    "region_exposure",
    "country_exposure",
    "division_exposure",
    "segment",
    "length",
    "host",
    "age",
    "sex",
    "Nextstrain_clade",
    "pangolin_lineage",
    "GISAID_clade",
    "seq",
    "muts",
    ]
    good_data = data[needed_cols]
    good_data = good_data.set_index("gisaid_epi_isl")
#     display(good_data.head())
    return good_data
    
def string_to_dict(string):
    string = string.replace('{', '').replace('}', '').replace('\'', '')
    elements=string.split(', ')
    dictionary = [el.split(': ') for el in elements]
    dictionary = {k: int(v) for k, v in dictionary}
    return dictionary


def get_data_for_mut(good_data, mut2strain, mut):
    strain_list = mut2strain[mut]
    return good_data.loc[strain_list]

def get_cols(data, cols):
    return data[cols]

In [3]:
import plotly
import plotly.plotly as py


def genSankey(df, cat_cols=[], value_cols="", title="Sankey Diagram"):
    # maximum of 6 value cols -> 6 colors
    colorPalette = [
        "orangered",
        "orangered",
        "orangered",
        "orangered",
        "orangered",
    ]
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp = list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp

    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))

    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        colorList = colorList + [colorPalette[idx]] * colorNum

    # transform df into a source-target pair
    for i in range(len(cat_cols) - 1):
        if i == 0:
            sourceTargetDf = df[[cat_cols[i], cat_cols[i + 1], value_cols]]
            sourceTargetDf.columns = ["source", "target", "count"]
        else:
            tempDf = df[[cat_cols[i], cat_cols[i + 1], value_cols]]
            tempDf.columns = ["source", "target", "count"]
            sourceTargetDf = pd.concat([sourceTargetDf, tempDf])
        sourceTargetDf = (
            sourceTargetDf.groupby(["source", "target"])
            .agg({"count": "sum"})
            .reset_index()
        )

    # add index for source-target pair
    sourceTargetDf["sourceID"] = sourceTargetDf["source"].apply(
        lambda x: labelList.index(x)
    )
    sourceTargetDf["targetID"] = sourceTargetDf["target"].apply(
        lambda x: labelList.index(x)
    )

    # creating the sankey diagram
    data = dict(
        type="sankey",
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="indianred", width=0.5),
            label=labelList,
            color=colorList,
            fontsize=15,
        ),
        link=dict(
            source=sourceTargetDf["sourceID"],
            target=sourceTargetDf["targetID"],
            value=sourceTargetDf["count"],
            color="salmon",
        ),
    )

    layout = dict(title=title, font=dict(size=20))

    fig = dict(data=[data], layout=layout)
    return fig

ImportError: 
The plotly.plotly module is deprecated,
please install the chart-studio package and use the
chart_studio.plotly module instead. 


In [5]:
import plotly
import chart_studio.plotly as py


def genSankey(df, cat_cols=[], value_cols="", title="Sankey Diagram"):
    # maximum of 6 value cols -> 6 colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp = list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp

    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))

    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        colorList = colorList + [colorPalette[idx]] * colorNum

    # transform df into a source-target pair
    for i in range(len(cat_cols) - 1):
        if i == 0:
            sourceTargetDf = df[[cat_cols[i], cat_cols[i + 1], value_cols]]
            sourceTargetDf.columns = ["source", "target", "count"]
        else:
            tempDf = df[[cat_cols[i], cat_cols[i + 1], value_cols]]
            tempDf.columns = ["source", "target", "count"]
            sourceTargetDf = pd.concat([sourceTargetDf, tempDf])
        sourceTargetDf = (
            sourceTargetDf.groupby(["source", "target"])
            .agg({"count": "sum"})
            .reset_index()
        )

    # add index for source-target pair
    sourceTargetDf["sourceID"] = sourceTargetDf["source"].apply(
        lambda x: labelList.index(x)
    )
    sourceTargetDf["targetID"] = sourceTargetDf["target"].apply(
        lambda x: labelList.index(x)
    )

    # creating the sankey diagram
    data = dict(
        type="sankey",
        node=dict(
            pad=15,
            thickness=20,
            line=dict(width=0.5),
            label=labelList,
            color=colorList,
            fontsize=15,
        ),
        link=dict(
            source=sourceTargetDf["sourceID"],
            target=sourceTargetDf["targetID"],
            value=sourceTargetDf["count"],
        ),
    )

    layout = dict(title=title, font=dict(size=20))

    fig = dict(data=[data], layout=layout)
    return fig

In [6]:
def prepare_and_draw(df1, col_array, name = ''):
    df1["exist"] = 1
    # df1 = df1.sort_values(by='date')
    df_pull = []
    master_df = None
    for i in range(len(col_array) - 1):
        df_pull.append(
            df1.groupby([col_array[i], col_array[i + 1]], as_index=False)["exist"].count()
        )
        df_pull[i].columns = ["a", "b", "Quantity"]
        if i == 0:
            master_df = df_pull[i].copy(deep=True)
        else:
            master_df = master_df.append(df_pull[i].copy(deep=True))
        display(master_df)


    fig = genSankey(
        master_df, cat_cols=["a", "b"], value_cols="Quantity", title=name
    )
    plotly.offline.plot(fig, validate=False)

In [7]:
def string_to_dict(string):
    string = string.replace('{', '').replace('}', '').replace('\'', '')
    elements=string.split(', ')
    dictionary = [el.split(': ') for el in elements]
    dictionary = {k: int(v) for k, v in dictionary}
    return dictionary


def get_data_for_mut(good_data, mut2strain, mut):
    strain_list = mut2strain[mut]
    return good_data.loc[strain_list]

def get_cols(data, cols):
    return data[cols]

In [8]:
good_data = get_data()
good_data.head()

Unnamed: 0_level_0,strain,date,region,country,division,location,region_exposure,country_exposure,division_exposure,segment,length,host,age,sex,Nextstrain_clade,pangolin_lineage,GISAID_clade,seq,muts
gisaid_epi_isl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
EPI_ISL_639915,Russia/OMS-ORINFI-1068S/2020,2020-06-26,Europe,Russia,Omsk,Europe / Russia / Omsk / Omsk,Europe,Russia,Omsk,genome,29837,Human,65,Female,20B,B.1.1.163,GR,ACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACT...,"{'241_C_T': 405965, '3037_C_T': 415594, '3177_..."
EPI_ISL_569837,Russia/OMS-ORINFI-106S/2020,2020-05-13,Europe,Russia,Omsk,Europe / Russia / Omsk / Omsk,Europe,Russia,Omsk,genome,29873,Human,43,Female,20B,B.1.1.119,GR,TTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTA...,"{'241_C_T': 405965, '3037_C_T': 415594, '14408..."
EPI_ISL_639912,Russia/OMS-ORINFI-1123S/2020,2020-06-26,Europe,Russia,Omsk,Europe / Russia / Omsk / Omsk,Europe,Russia,Omsk,genome,29849,Human,65,Female,20B,B.1.1.119,GR,TCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTT...,"{'241_C_T': 405965, '1469_C_T': 296, '3037_C_T..."
EPI_ISL_569752,Russia/OMS-ORINFI-1129S/2020,2020-05-24,Europe,Russia,Omsk,Europe / Russia / Omsk / Russkaya Polyana,Europe,Russia,Omsk,genome,29556,Human,33,Male,20B,B.1.1.119,GR,CAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTT...,"{'3037_C_T': 415594, '4451_A_G': 14, '14408_C_..."
EPI_ISL_569836,Russia/OMS-ORINFI-1141S/2020,2020-08-18,Europe,Russia,Omsk,Europe / Russia / Omsk / Omsk,Europe,Russia,Omsk,genome,29860,Human,62,Female,20B,B.1.1.119,GR,TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...,"{'241_C_T': 405965, '3037_C_T': 415594, '3631_..."


In [9]:
mut2amount = defaultdict(int)
mut2strain = defaultdict(list)
strain2mut = defaultdict(list)

for strain, row in good_data.iterrows():
    try:
        cur_dict = string_to_dict(good_data.at[strain, 'muts'])
        good_data.at[strain, 'muts'] = cur_dict
        strain2mut[strain].append(cur_dict.keys())
        for mut in cur_dict:
            mut2strain[mut].append(strain)
            mut2amount[mut] = cur_dict[mut]
    except ValueError:
        pass
    
sankey_cols = [
    "sex",
    "age",
    "date",
    "pangolin_lineage",
    "Nextstrain_clade",
    "GISAID_clade",
]
col_to_prefix = {
    "sex": "Sex",
    "age": "Age",
    "date": "Date",
    "pangolin_lineage": "PANGO lineage",
    "Nextstrain_clade": "Nextstrain clade",
    "GISAID_clade": "GISAID clade",
}
sankey_cols = sankey_cols[::-1]

df1 = get_cols(get_data_for_mut(good_data, mut2strain, "4451_A_G"), sankey_cols)

In [10]:
for col in col_to_prefix:
    df1[col] = col_to_prefix[col] + ": " + df1[col].astype(str)
# df1
prepare_and_draw(df1, sankey_cols, "Sankey for 4451_A_G in Omsk")

Unnamed: 0,a,b,Quantity
0,GISAID clade: GR,Nextstrain clade: 20B,11
1,GISAID clade: O,Nextstrain clade: 20B,3


Unnamed: 0,a,b,Quantity
0,GISAID clade: GR,Nextstrain clade: 20B,11
1,GISAID clade: O,Nextstrain clade: 20B,3
0,Nextstrain clade: 20B,PANGO lineage: B.1.1.119,3
1,Nextstrain clade: 20B,PANGO lineage: B.1.1.129,5
2,Nextstrain clade: 20B,PANGO lineage: B.1.1.163,2
3,Nextstrain clade: 20B,PANGO lineage: B.1.1.292,2
4,Nextstrain clade: 20B,PANGO lineage: B.1.1.31,1
5,Nextstrain clade: 20B,PANGO lineage: B.1.1.70,1


Unnamed: 0,a,b,Quantity
0,GISAID clade: GR,Nextstrain clade: 20B,11
1,GISAID clade: O,Nextstrain clade: 20B,3
0,Nextstrain clade: 20B,PANGO lineage: B.1.1.119,3
1,Nextstrain clade: 20B,PANGO lineage: B.1.1.129,5
2,Nextstrain clade: 20B,PANGO lineage: B.1.1.163,2
3,Nextstrain clade: 20B,PANGO lineage: B.1.1.292,2
4,Nextstrain clade: 20B,PANGO lineage: B.1.1.31,1
5,Nextstrain clade: 20B,PANGO lineage: B.1.1.70,1
0,PANGO lineage: B.1.1.119,Date: 2020-05-24,1
1,PANGO lineage: B.1.1.119,Date: 2020-06-05,1


Unnamed: 0,a,b,Quantity
0,GISAID clade: GR,Nextstrain clade: 20B,11
1,GISAID clade: O,Nextstrain clade: 20B,3
0,Nextstrain clade: 20B,PANGO lineage: B.1.1.119,3
1,Nextstrain clade: 20B,PANGO lineage: B.1.1.129,5
2,Nextstrain clade: 20B,PANGO lineage: B.1.1.163,2
3,Nextstrain clade: 20B,PANGO lineage: B.1.1.292,2
4,Nextstrain clade: 20B,PANGO lineage: B.1.1.31,1
5,Nextstrain clade: 20B,PANGO lineage: B.1.1.70,1
0,PANGO lineage: B.1.1.119,Date: 2020-05-24,1
1,PANGO lineage: B.1.1.119,Date: 2020-06-05,1


Unnamed: 0,a,b,Quantity
0,GISAID clade: GR,Nextstrain clade: 20B,11
1,GISAID clade: O,Nextstrain clade: 20B,3
0,Nextstrain clade: 20B,PANGO lineage: B.1.1.119,3
1,Nextstrain clade: 20B,PANGO lineage: B.1.1.129,5
2,Nextstrain clade: 20B,PANGO lineage: B.1.1.163,2
3,Nextstrain clade: 20B,PANGO lineage: B.1.1.292,2
4,Nextstrain clade: 20B,PANGO lineage: B.1.1.31,1
5,Nextstrain clade: 20B,PANGO lineage: B.1.1.70,1
0,PANGO lineage: B.1.1.119,Date: 2020-05-24,1
1,PANGO lineage: B.1.1.119,Date: 2020-06-05,1
