In [1]:
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats
from SiFoN import clinical_analysis as clin
import pandas as pd
import numpy as np
import re

### Converting Clinical Data into a VCF format for Sei

In [2]:
# Source table -- DOI: 10.1200/PO.17.00108 Suppl. Table 14.
pten = pd.read_csv("test_input_data/brown_suppl4.csv")
# Preview only the first record to check the column layout.
pten.head(n=1)

Unnamed: 0,RefSeqID,Alteration,Protein Change,Classification,PromoterRegion,Breast Cancer Case Alleles,Breast Cancer Case MAF,Control Alleles,Control MAF,OR,95% CI 1,95% CI 2,Pvalue
0,NM_000314,c.-976G>C,,VUS,Promoter,46,0.04%,40,0.08%,0.581,0.372,0.911,0.016


In [3]:
# Keep the variant identifier (`Alteration`) together with whichever metadata
# columns are wanted for the later analysis; every other column is dropped.
pten = pten.loc[
    :,
    ["Alteration", "Breast Cancer Case Alleles", "Control Alleles", "Classification"],
]
pten.head(n=2)

Unnamed: 0,Alteration,Breast Cancer Case Alleles,Control Alleles,Classification
0,c.-976G>C,46,40,VUS
1,c.-835C>T,14,10,VUS


At this point, you can save this metadata for future analysis. We still need to convert the `Alteration` column into a format that Sei can understand. The function below converts alterations from the c-dot format (c. pos ref > alt, e.g. `c.-976G>C`) into a VCF-style table. It expects a `chrm` parameter, since the chromosome name is not available in the c-dot format. If the positions are offsets from a particular genomic position, pass that position as the `start` parameter. The default of `start` is zero, which you can use if the positions are raw. Once you run this function, you can save the resulting VCF and use it as input to Sei.

In [4]:
# c-dot notation carries no chromosome name, so it is supplied through `chrm`.
# `start` is the coordinate the c-dot positions are offset from: in the row
# displayed with this cell, c.-976G>C is reported at POS 89623251, which is
# 89624227 - 976.
pten_VCF = clin.cdot_to_VCF_format(
    pten,
    chrm="chr10",
    start=89624227,
)
pten_VCF.head(n=1)

Unnamed: 0,#CHROM,POS,ID,REF,ALT
0,chr10,89623251,c.-976G>C,G,C


There is also another function that converts alterations written in the following format: `chrom_position_alt_ref`.

In [None]:
 # ccvs = pd.read_excel("supp13.xlsx", "BCAC_OVERALL")
# print(ccvs.shape)
# ccvs.head(3)

### Load in and combine the data

In [67]:
# Description of SNPs; the first line of the file is used as the header row.
row_labels = pd.read_csv("test_input_data/PTEN_row_labels", header=0)
# Report how many SNP rows were loaded before previewing the first one.
print(row_labels.shape[0])
row_labels.head(n=1)

203


Unnamed: 0,chrom,pos,Alteration,ref,alt,strand,ref_match,contains_unk
0,chr10,89623251,c.-976G>C,G,C,+,True,False


In [76]:
# Sequence class scores.
# NOTE: the path ends in ".npy", but the file is parsed here as delimited text
# by read_csv; the frame displayed with this cell has one named column per
# sequence class.
scores = pd.read_csv("test_input_data/PTEN_sequence_class_scores.npy")
scores.head(n=1)

Unnamed: 0,PC1 Polycomb / Heterochromatin,L1 Low signal,TN1 Transcription,TN2 Transcription,L2 Low signal,E1 Stem cell,E2 Multi-tissue,E3 Brain / Melanocyte,L3 Low signal,E4 Multi-tissue,...,E10 Brain,TF4 OTX2,HET4 Heterochromatin,L7 Low signal,PC4 Polycomb / Bivalent stem cell Enh,HET5 Centromere,E11 T-cell,TF5 AR,E12 Erythroblast-like,HET6 Centromere
0,0.036797,0.016578,0.015117,0.015232,0.010343,0.024441,0.021501,0.035103,0.033337,0.092842,...,0.066348,0.017301,0.002037,0.009142,0.025761,0.021403,0.089531,0.025894,0.009224,0.00617


In [77]:
dis_score = pd.read_csv("test_input_data/PTEN_DIS.csv")
# Append the Disease Impact Score as an extra column of the sequence class
# scores, then put the SNP description columns in front of them.
# Both the column assignment and `join` align rows on the index, so this
# assumes the three files list the variants in the same order -- TODO confirm.
scores = row_labels.join(
    scores.assign(**{"Disease Impact Score": dis_score["Disease Impact Score"]})
)
scores.head(n=1)

Unnamed: 0,chrom,pos,Alteration,ref,alt,strand,ref_match,contains_unk,PC1 Polycomb / Heterochromatin,L1 Low signal,...,TF4 OTX2,HET4 Heterochromatin,L7 Low signal,PC4 Polycomb / Bivalent stem cell Enh,HET5 Centromere,E11 T-cell,TF5 AR,E12 Erythroblast-like,HET6 Centromere,Disease Impact Score
0,chr10,89623251,c.-976G>C,G,C,+,True,False,0.036797,0.016578,...,0.017301,0.002037,0.009142,0.025761,0.021403,0.089531,0.025894,0.009224,0.00617,-0.310427


In [78]:
# Clinical metadata for each alteration (case/control allele counts and
# classification, per the columns displayed with this cell).
clinical_data = pd.read_csv("test_input_data/PTEN_Brown_Suppl4.csv")
clinical_data.head(n=1)

Unnamed: 0,Alteration,Breast Cancer Case Alleles,Control Alleles,Classification
0,c.-976G>C,46,40,VUS


In [79]:
# Clinical data connects to the Sei results by the `Alteration` column.
# The key is named explicitly: without `on=`, merge joins on every column name
# the two frames happen to share, so an extra same-named column in either
# frame would silently change the join key.
# The default inner join is kept, so alterations present in only one of the
# two frames are dropped from the result.
scores = clinical_data.merge(scores, on="Alteration")
scores.head(1)

Unnamed: 0,Alteration,Breast Cancer Case Alleles,Control Alleles,Classification,chrom,pos,ref,alt,strand,ref_match,...,TF4 OTX2,HET4 Heterochromatin,L7 Low signal,PC4 Polycomb / Bivalent stem cell Enh,HET5 Centromere,E11 T-cell,TF5 AR,E12 Erythroblast-like,HET6 Centromere,Disease Impact Score
0,c.-976G>C,46,40,VUS,chr10,89623251,G,C,+,True,...,0.017301,0.002037,0.009142,0.025761,0.021403,0.089531,0.025894,0.009224,0.00617,-0.310427


In [None]:
# NOTE(review): `st_small` is not defined in any cell visible above, and this
# cell has no execution count. It is presumably the frame that supplies the
# allele-count and "Odds Ratio" columns used below -- confirm before running.
dis_score = dis_score.merge(st_small)

# Total number of observed alleles per variant (cases plus controls).
dis_score["Case + Control Counts"] = (
    dis_score["Breast Cancer Case Alleles"] + dis_score["Control Alleles"]
)

# Label a variant "Case" when its odds ratio is above 1 + diff, "Control" when
# it is below 1 - diff, and "Equal" otherwise.
diff = 0.5
odds_ratio = dis_score["Odds Ratio"]
dis_score["Case/Control"] = np.select(
    [odds_ratio > 1 + diff, odds_ratio < 1 - diff],
    ["Case", "Control"],
    default="Equal",
)
dis_score.head(3)

In [None]:
# cases, controls, offset = 52065, 28549, 0.1
# st["Odds Ratio"] = [((case + offset)/cases)/((control+offset)/controls)
#                      for case, control in zip(st["Breast Cancer Case Alleles"], st["Control Alleles"])]