In [1]:
# Reload edited modules automatically before each cell runs (autoreload mode 2).
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Put the parent directory on the import path; presumably this is where the
# local talus_data_analysis package lives — TODO confirm.
sys.path.append('..')

In [117]:
from talus_data_analysis.plot import histogram
from talus_data_analysis.elib import Elib
from talus_data_analysis.load import read_excel_from_gdrive, read_df_from_s3
from talus_data_analysis.save import write_df_to_s3
from dotenv import load_dotenv
import tempfile
import sqlite3
import math
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

In [4]:
# Read key=value pairs from a .env file into the process environment (python-dotenv).
load_dotenv()

True

In [5]:
# Build a PyDrive auth object from the settings file one directory up.
# NOTE(review): gauth is not referenced by any other cell visible in this notebook — confirm it is still needed.
gauth = GoogleAuth(settings_file="../settings.yaml")

In [6]:
# S3 bucket used for every read_df_from_s3 / write_df_to_s3 call in this notebook.
ENCYCLOPEDIA_BUCKET = "talus-data-pipeline-encyclopedia-bucket"
# Local folder for the CSV files written and read by this notebook.
DATA_FOLDER = "../data/210308_MLLtx"
# Key prefix inside ENCYCLOPEDIA_BUCKET for this experiment.
S3_FOLDER = "wide/210308_MLLtx"
PROJECT_NAME = "MLLtx"

# Quant library file name (the peptides table loaded from S3 is named after it).
ELIB_FILE = "RESULTS-quant.elib"
# MSstats input table: written to DATA_FOLDER as CSV and uploaded to S3 as parquet.
peptide_protein_file = "peptide_proteins_results.csv"
# Result tables read back from DATA_FOLDER and uploaded to S3 as parquet.
peptide_protein_norm_output = "peptide_proteins_normalized.csv"
msstats_groupcompare_output = "msstats_groupcompare.csv"
# Contrast matrix written to DATA_FOLDER as CSV.
comparison_matrix_file = "comparison_matrix.csv"

In [156]:
# Load the peptide-level quant table for this experiment from S3.
# The key is derived from ELIB_FILE so the file name is defined in one place only.
peptide_df = read_df_from_s3(
    bucket=ENCYCLOPEDIA_BUCKET,
    key=f"{S3_FOLDER}/{ELIB_FILE}.peptides.txt",
    inputformat="txt",
)
peptide_df = peptide_df.drop(columns="numFragments")

# Make sure there is one protein per row: split the ";"-separated Protein
# field and repeat the peptide's remaining columns for every protein it lists.
protein_per_row = (
    peptide_df["Protein"]
    .str.split(";", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .rename("Protein")
)
peptide_df = peptide_df.drop(columns="Protein").join(protein_per_row)

# Melt the dataframe so that each source file is in a separate row instead of a column.
peptide_df = peptide_df.melt(
    id_vars=["Peptide", "Protein"],
    var_name="SourceFile",
    value_name="TotalIntensity",
)

# Run label: text before the first "." in the file name, then its last
# "_"-separated token, e.g. "210308_talus_01.mzML" -> "01".
peptide_df["Run"] = (
    peptide_df["SourceFile"].str.split(".").str[0].str.split("_").str[-1]
)

In [157]:
# Inspect the long-format peptide table (one row per peptide / protein / source file).
peptide_df

Unnamed: 0,Peptide,Protein,SourceFile,TotalIntensity,Run
0,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_01.mzML,2.403030e+07,01
1,AAAAAAAAAAAAAAAGAGAGAK,sp|P55011|S12A2_HUMAN,210308_talus_01.mzML,1.323888e+06,01
2,AAAAAAAAAPAAAATAPTTAATTAATAAQ,sp|P37108|SRP14_HUMAN,210308_talus_01.mzML,1.465808e+09,01
3,AAAAAAAAAVSR,sp|Q96JP5|ZFP91_HUMAN,210308_talus_01.mzML,2.784300e+08,01
4,AAAAAAAAGAFAGR,sp|Q8N697|S15A4_HUMAN,210308_talus_01.mzML,2.485319e+07,01
...,...,...,...,...,...
1008112,YYYDKNIMTK,sp|P11308|ERG_HUMAN,210308_talus_12b.mzML,6.919218e+05,12b
1008113,YYYDKNIMTK,sp|Q01543|FLI1_HUMAN,210308_talus_12b.mzML,6.919218e+05,12b
1008114,YYYIPQYK,sp|Q8N183|NDUF2_HUMAN,210308_talus_12b.mzML,9.711452e+06,12b
1008115,YYYQGC[+57.021464]ASWK,sp|Q9H0D6|XRN2_HUMAN,210308_talus_12b.mzML,4.785607e+07,12b


## Template DF

In [158]:
# One row per source file, in order of first appearance in peptide_df.
sample_df = peptide_df[["SourceFile"]].drop_duplicates().reset_index(drop=True)

# Hand-curated annotation, aligned by position with the rows of sample_df.
# "Comparison" names the condition each row's condition is contrasted against.
bio_replicates = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
conditions = ["DRUG2", "DRUG3", "DRUG4", "DRUG2", "DMSO", "DMSO", "DRUG5", "DRUG1", "DRUG1", "DRUG3", "DRUG5"]
comparisons = ["DMSO", "DMSO", "DMSO", "DMSO", "Control", "Control", "DMSO", "DMSO", "DMSO", "DMSO", "DMSO"]

# Fail loudly if the number of runs no longer matches the annotation.
# Assigning a fixed-length pd.Series would align on the index and silently
# leave NaN (too many runs) or drop annotation entries (too few runs).
annotation_lengths = {len(bio_replicates), len(conditions), len(comparisons)}
if annotation_lengths != {len(sample_df)}:
    raise ValueError(
        f"Sample annotation lists have lengths {sorted(annotation_lengths)} "
        f"but peptide_df contains {len(sample_df)} source files."
    )

sample_df["BioReplicate"] = bio_replicates
sample_df["Condition"] = conditions
sample_df["Comparison"] = comparisons
# Run label: last "_"-separated token of the file name, up to its first ".".
sample_df["Run"] = sample_df["SourceFile"].str.split("_").str[-1].str.split(".").str[0]
sample_df = sample_df[["Run", "BioReplicate", "Condition", "Comparison"]]

In [159]:
# Inspect the per-run sample annotation.
sample_df

Unnamed: 0,Run,BioReplicate,Condition,Comparison
0,01,1.0,DRUG2,DMSO
1,01b,2.0,DRUG3,DMSO
2,02,3.0,DRUG4,DMSO
3,02b,4.0,DRUG2,DMSO
4,03,5.0,DMSO,Control
5,03b,6.0,DMSO,Control
6,10,7.0,DRUG5,DMSO
7,10b,8.0,DRUG1,DMSO
8,11b,9.0,DRUG1,DMSO
9,12,10.0,DRUG3,DMSO


In [179]:
# Attach the sample annotation to every peptide row via the shared "Run" label.
# The right join keeps every run listed in sample_df.
msstats_df = peptide_df.merge(sample_df, on="Run", how="right")

In [180]:
# Inspect the merged table (peptide rows plus their run annotation).
msstats_df

Unnamed: 0,Peptide,Protein,SourceFile,TotalIntensity,Run,BioReplicate,Condition,Comparison
0,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_01.mzML,2.403030e+07,01,1.0,DRUG2,DMSO
1,AAAAAAAAAAAAAAAGAGAGAK,sp|P55011|S12A2_HUMAN,210308_talus_01.mzML,1.323888e+06,01,1.0,DRUG2,DMSO
2,AAAAAAAAAPAAAATAPTTAATTAATAAQ,sp|P37108|SRP14_HUMAN,210308_talus_01.mzML,1.465808e+09,01,1.0,DRUG2,DMSO
3,AAAAAAAAAVSR,sp|Q96JP5|ZFP91_HUMAN,210308_talus_01.mzML,2.784300e+08,01,1.0,DRUG2,DMSO
4,AAAAAAAAGAFAGR,sp|Q8N697|S15A4_HUMAN,210308_talus_01.mzML,2.485319e+07,01,1.0,DRUG2,DMSO
...,...,...,...,...,...,...,...,...
1008112,YYYDKNIMTK,sp|P11308|ERG_HUMAN,210308_talus_12b.mzML,6.919218e+05,12b,11.0,DRUG5,DMSO
1008113,YYYDKNIMTK,sp|Q01543|FLI1_HUMAN,210308_talus_12b.mzML,6.919218e+05,12b,11.0,DRUG5,DMSO
1008114,YYYIPQYK,sp|Q8N183|NDUF2_HUMAN,210308_talus_12b.mzML,9.711452e+06,12b,11.0,DRUG5,DMSO
1008115,YYYQGC[+57.021464]ASWK,sp|Q9H0D6|XRN2_HUMAN,210308_talus_12b.mzML,4.785607e+07,12b,11.0,DRUG5,DMSO


In [181]:
## Drop the helper columns, add a few required constant columns and rename
## the headers to match the MSstats convention.
# NOTE(review): this consumes the output of the merge cell. Re-running it on its
# own raises KeyError, because "Comparison" has already been dropped.
# NOTE(review): ProductCharge is the string "1" while PrecursorCharge is the
# integer 2 — confirm the mixed types are intended.
msstats_df = (
    msstats_df
    .drop(columns=["Run", "Comparison"])
    .assign(
        PrecursorCharge=2,
        IsotopeLabelType="L",
        FragmentIon="y0",
        ProductCharge="1",
    )
    .rename(
        columns={
            "Peptide": "PeptideSequence",
            "Protein": "ProteinName",
            "SourceFile": "Run",
            "TotalIntensity": "Intensity",
        }
    )
)

In [182]:
# Order rows by peptide sequence, then by ascending intensity, and renumber the index.
sort_keys = ["PeptideSequence", "Intensity"]
msstats_df = msstats_df.sort_values(by=sort_keys)
msstats_df = msstats_df.reset_index(drop=True)

In [183]:
# Inspect the final MSstats-formatted table.
msstats_df

Unnamed: 0,PeptideSequence,ProteinName,Run,Intensity,BioReplicate,Condition,PrecursorCharge,IsotopeLabelType,FragmentIon,ProductCharge
0,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_12.mzML,13823692.0,10.0,DRUG3,2,L,y0,1
1,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_12b.mzML,18324944.0,11.0,DRUG5,2,L,y0,1
2,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_01.mzML,24030302.0,1.0,DRUG2,2,L,y0,1
3,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_03b.mzML,25332058.0,6.0,DMSO,2,L,y0,1
4,AAAAAAAAAAAAAAAASAGGK,sp|P0CG40|SP9_HUMAN,210308_talus_11b.mzML,26111934.0,9.0,DRUG1,2,L,y0,1
...,...,...,...,...,...,...,...,...,...,...
1008112,YYYQLNSK,sp|O00257|CBX4_HUMAN,210308_talus_02b.mzML,19630538.0,4.0,DRUG2,2,L,y0,1
1008113,YYYQLNSK,sp|O00257|CBX4_HUMAN,210308_talus_02.mzML,20652764.0,3.0,DRUG4,2,L,y0,1
1008114,YYYQLNSK,sp|O00257|CBX4_HUMAN,210308_talus_01b.mzML,20824640.0,2.0,DRUG3,2,L,y0,1
1008115,YYYQLNSK,sp|O00257|CBX4_HUMAN,210308_talus_03.mzML,22253328.0,5.0,DMSO,2,L,y0,1


In [184]:
# Save the MSstats-formatted table locally; presumably this is the input of the
# R script run later in the notebook — TODO confirm.
# to_csv keeps its default index=True, so the row index is written as the first, unnamed column.
peptide_protein_path = f"{DATA_FOLDER}/{peptide_protein_file}"
msstats_df.to_csv(peptide_protein_path)

In [185]:
# Upload the same table to S3 as parquet; the key reuses the CSV file name
# with its extension swapped.
peptide_protein_key = f"{S3_FOLDER}/{peptide_protein_file.replace('.csv', '.parquet')}"
write_df_to_s3(
    dataframe=msstats_df,
    bucket=ENCYCLOPEDIA_BUCKET,
    key=peptide_protein_key,
    outputformat="parquet",
)

In [186]:
def get_comparison_matrix(df, filter_target_func=lambda x:x):
    """Build a contrast matrix with one row per condition-vs-baseline comparison.

    Args:
        df: DataFrame with a "Condition" column and a "Comparison" column
            naming the baseline each condition is compared against.
        filter_target_func: Callable applied to a condition before it is used
            in the row label. Defaults to the identity.

    Returns:
        DataFrame whose rows are labelled "<condition>/<baseline>" and whose
        integer-labelled columns follow the sorted unique conditions. In each
        row the condition's column holds 1, the baseline's column holds -1 and
        every other column holds 0. Conditions whose baseline is not itself a
        condition produce no row.
    """
    ordered = df.sort_values(by="Condition")
    baseline_of = dict(zip(ordered["Condition"], ordered["Comparison"]))

    conditions = sorted(baseline_of)
    column_of = {name: position for position, name in enumerate(conditions)}

    rows = []
    labels = []
    for condition in conditions:
        baseline = baseline_of[condition]
        if baseline not in baseline_of:
            # The baseline has no column of its own, so no contrast can be built.
            continue

        contrast = [0] * len(conditions)
        # The baseline gets -1 first; the condition's own +1 is written last.
        contrast[column_of[baseline]] = -1
        contrast[column_of[condition]] = 1

        rows.append(contrast)
        labels.append(f"{filter_target_func(condition)}/{baseline}")

    matrix = pd.DataFrame(rows)
    matrix.index = labels

    return matrix

In [187]:
def _condition_label(condition):
    """Keep only the text before the first " - " separator in a condition name."""
    return condition.split(" - ")[0]


# Build the contrast matrix from the sample annotation; row labels use the
# shortened condition name.
comp_matrix = get_comparison_matrix(df=sample_df, filter_target_func=_condition_label)

In [188]:
# Inspect the contrast matrix (rows: comparisons; columns: sorted conditions by position).
comp_matrix

Unnamed: 0,0,1,2,3,4,5
DRUG1/DMSO,-1,1,0,0,0,0
DRUG2/DMSO,-1,0,1,0,0,0
DRUG3/DMSO,-1,0,0,1,0,0
DRUG4/DMSO,-1,0,0,0,1,0
DRUG5/DMSO,-1,0,0,0,0,1


In [189]:
# Save the contrast matrix locally; the comparison labels are written as the index column.
comparison_matrix_path = f"{DATA_FOLDER}/{comparison_matrix_file}"
comp_matrix.to_csv(comparison_matrix_path)

# Run R Script (MSstats) ...

## Write MSstats-normalized peptide/protein DataFrame to S3

In [152]:
# Load the normalized peptide/protein table from DATA_FOLDER; presumably it is
# produced by the R script step — TODO confirm.
peptide_protein_norm_path = f"{DATA_FOLDER}/{peptide_protein_norm_output}"
msstats_df_norm = pd.read_csv(peptide_protein_norm_path)

In [154]:
# Inspect the normalized table.
# NOTE(review): the saved output lists originalRUN values such as
# 210511_Talus_Run4.mzML, which do not match the 210308_talus_* runs processed
# earlier in this notebook — confirm this file belongs to this experiment
# before it is uploaded.
msstats_df_norm

Unnamed: 0,PROTEIN,PEPTIDE,TRANSITION,FEATURE,LABEL,GROUP_ORIGINAL,SUBJECT_ORIGINAL,RUN,GROUP,SUBJECT,INTENSITY,SUBJECT_NESTED,ABUNDANCE,FRACTION,originalRUN,censored
0,sp|A0A1W2PR48|TLE7_HUMAN,LSGLEAPSLQK_2,y0_1,LSGLEAPSLQK_2_y0_1,L,ARN-505 - 24h,Gryder-4,1,1,4,1.000000e+00,1.4,0.000000,1,210511_Talus_Run4.mzML,True
1,sp|A0AVT1|UBA6_HUMAN,FISADVHGIWSR_3,y0_1,FISADVHGIWSR_3_y0_1,L,ARN-505 - 24h,Gryder-4,1,1,4,1.085368e+06,1.4,20.915179,1,210511_Talus_Run4.mzML,False
2,sp|A0AVT1|UBA6_HUMAN,NCFLNLAIPIVVFTETTEVR_3,y0_1,NCFLNLAIPIVVFTETTEVR_3_y0_1,L,ARN-505 - 24h,Gryder-4,1,1,4,3.750285e+05,1.4,19.382067,1,210511_Talus_Run4.mzML,False
3,sp|A0AVT1|UBA6_HUMAN,TVFFESLER_2,y0_1,TVFFESLER_2_y0_1,L,ARN-505 - 24h,Gryder-4,1,1,4,2.064677e+06,1.4,21.842911,1,210511_Talus_Run4.mzML,False
4,sp|A0AVT1|UBA6_HUMAN,YVDLTVSFAPDIDGDEDLPGPPVR_3,y0_1,YVDLTVSFAPDIDGDEDLPGPPVR_3_y0_1,L,ARN-505 - 24h,Gryder-4,1,1,4,1.000000e+00,1.4,0.000000,1,210511_Talus_Run4.mzML,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109843,sp|Q9Y6Q5|AP1M2_HUMAN,VLFELTGR_2,y0_1,VLFELTGR_2_y0_1,L,R1881 - 48h,Gryder-8,12,12,8,5.530902e+05,12.8,17.649025,1,210511_Talus_Run14.mzML,False
109844,sp|Q9Y6X4|F169A_HUMAN,DFGLHMLEDFVDSFTEDALGLR_3,y0_1,DFGLHMLEDFVDSFTEDALGLR_3_y0_1,L,R1881 - 48h,Gryder-8,12,12,8,1.000000e+00,12.8,0.000000,1,210511_Talus_Run14.mzML,True
109845,sp|Q9Y6X4|F169A_HUMAN,GKDFGLHMLEDFVDSFTEDALGLR_3,y0_1,GKDFGLHMLEDFVDSFTEDALGLR_3_y0_1,L,R1881 - 48h,Gryder-8,12,12,8,1.000000e+00,12.8,0.000000,1,210511_Talus_Run14.mzML,True
109846,sp|Q9Y6Y8|S23IP_HUMAN,EVLTLQETLEALSLSEYFSTFEK_3,y0_1,EVLTLQETLEALSLSEYFSTFEK_3_y0_1,L,R1881 - 48h,Gryder-8,12,12,8,5.637411e+04,12.8,14.354615,1,210511_Talus_Run14.mzML,False


In [155]:
# Upload the normalized table to S3 as parquet; the key reuses the CSV file
# name with its extension swapped.
peptide_protein_norm_key = f"{S3_FOLDER}/{peptide_protein_norm_output.replace('.csv', '.parquet')}"
write_df_to_s3(
    dataframe=msstats_df_norm,
    bucket=ENCYCLOPEDIA_BUCKET,
    key=peptide_protein_norm_key,
    outputformat="parquet",
)

## Write MSstats group-comparison DataFrame to S3

In [156]:
# Load the group-comparison results from DATA_FOLDER; presumably they are
# produced by the R script step — TODO confirm.
groupcompare_path = f"{DATA_FOLDER}/{msstats_groupcompare_output}"
msstats_groupcompare = pd.read_csv(groupcompare_path)

In [157]:
# Inspect the group-comparison results.
# NOTE(review): the saved output shows labels such as "ARN-505/DMSO - 24h",
# which do not match the DRUGn/DMSO comparisons built earlier in this notebook —
# confirm this file belongs to this experiment before it is uploaded.
msstats_groupcompare

Unnamed: 0,Protein,Label,log2FC,SE,Tvalue,DF,pvalue,adj.pvalue,issue,MissingPercentage,ImputationPercentage
0,sp|A0A1W2PR48|TLE7_HUMAN,ARN-505/DMSO - 24h,-inf,,,,,0.0,oneConditionMissing,0.500,0.000
1,sp|A0AVT1|UBA6_HUMAN,ARN-505/DMSO - 24h,0.218510,,,0.0,,,,0.250,0.250
2,sp|A0FGR8|ESYT2_HUMAN,ARN-505/DMSO - 24h,-0.457277,,,0.0,,,,0.000,0.000
3,sp|A0MZ66|SHOT1_HUMAN,ARN-505/DMSO - 24h,-0.689042,,,0.0,,,,0.500,0.500
4,sp|A1L0T0|ILVBL_HUMAN,ARN-505/DMSO - 24h,inf,,,,,0.0,oneConditionMissing,0.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...
21385,sp|Q9Y6I4|UBP3_HUMAN,R1881/DMSO - 48h,0.920673,,,0.0,,,,0.000,0.000
21386,sp|Q9Y6M1|IF2B2_HUMAN,R1881/DMSO - 48h,1.344610,,,0.0,,,,0.125,0.125
21387,sp|Q9Y6Q5|AP1M2_HUMAN,R1881/DMSO - 48h,1.468264,,,0.0,,,,0.000,0.000
21388,sp|Q9Y6X4|F169A_HUMAN,R1881/DMSO - 48h,,,,,,,completeMissing,1.000,0.000


In [158]:
# Upload the group-comparison results to S3 as parquet; the key reuses the CSV
# file name with its extension swapped.
groupcompare_key = f"{S3_FOLDER}/{msstats_groupcompare_output.replace('.csv', '.parquet')}"
write_df_to_s3(
    dataframe=msstats_groupcompare,
    bucket=ENCYCLOPEDIA_BUCKET,
    key=groupcompare_key,
    outputformat="parquet",
)