# Intro & Loading 

In [None]:
from __future__ import print_function
import pandas as pd

import os 

from genepy.utils import helper as h
from depmapomics import terra as myterra
from depmapomics.config import *
from gsheets import Sheets
from taigapy import TaigaClient
from bokeh.plotting import output_notebook

# Notebook setup: IPython extensions plus the shared clients used by the cells below.
# autoreload in mode 2, so edited modules are re-imported without restarting the kernel.
%load_ext autoreload
%autoreload 2
# rpy2's IPython extension.
%load_ext rpy2.ipython
# Taiga client shared by every `tc.get` / `tc.update_dataset` call in this notebook.
tc = TaigaClient()
output_notebook()

# Google Sheets client; credentials come from these two files in the home directory.
sheets = Sheets.from_files('~/.client_secrets.json', '~/.storage.json')

# Lower-case alias of VIRTUAL; the upload cells index it by dataset name.
virtual = VIRTUAL

In [None]:
# Display VIRTUAL for a visual check before anything is uploaded.
# NOTE(review): VIRTUAL is not defined in this notebook; presumably it comes from
# the star-import of depmapomics.config - confirm.
VIRTUAL

## Making the virtuals

In [None]:
# Read the first tab of the POTENTIAL_LIST spreadsheet and collect, for each
# release group, the non-blank entries of its column (blank cells show up as NaN).
gsheets = sheets.get(POTENTIAL_LIST).sheets[0].to_frame()
new = {
    group: {line for line in gsheets[column].values.tolist() if str(line) != "nan"}
    for group, column in (
        ("internal", "Internal"),
        ("dmc", "DMC"),
        ("ibm", "IBM"),
        ("public", "Public"),
    )
}

# Nest the groups: every line in a narrower release also belongs to each wider
# one. Walking from narrow to wide lets each step reuse the previous union.
for wider, narrower in (("dmc", "public"), ("ibm", "dmc"), ("internal", "ibm")):
    new[wider] = new[wider] | new[narrower]

## Getting what was released before

In [None]:
# For every dataset, record which lines had mutation, expression and
# segment-level copy-number data in the previous release (PREV_VIRTUAL).
prevmut, prevrna, prevcn, prevwes, prev = {}, {}, {}, {}, {}
for val in datasets:
    print(val)
    release = PREV_VIRTUAL[val]
    mutation_lines = set(tc.get(name=release, file='CCLE_mutations').DepMap_ID)
    expression_lines = set(tc.get(name=release, file='CCLE_expression').index)
    segment_lines = set(tc.get(name=release, file='CCLE_segment_cn').DepMap_ID)

    prevmut[val] = mutation_lines
    prevrna[val] = expression_lines
    prevcn[val] = segment_lines
    # Any data type at all.
    prev[val] = mutation_lines | expression_lines | segment_lines
    # Mutation or copy-number data only.
    prevwes[val] = mutation_lines | segment_lines

In [None]:
# Fold each narrower release into the next wider one: public into dmc, then
# dmc into ibm, then ibm into internal. The order matters, because every step
# builds on the union produced by the step before it.
for wider, narrower in (("dmc", "public"), ("ibm", "dmc"), ("internal", "ibm")):
    for released in (prevmut, prevrna, prevcn, prev, prevwes):
        released[wider] = released[wider] | released[narrower]

# Create INFO

In [None]:
# Segment-level copy-number table for the current sample set.
segmentcn = pd.read_csv('temp/{}/achilles_segment.csv'.format(SAMPLESETNAME))

In [None]:
# Release notes per dataset, keyed by dataset name. This cell writes the
# DNAseq section; the RNAseq cell further down appends to the same entries.
INFO = {}
# Lines found in the segment table that are in neither the previous release
# nor the new list. The set is carried over from one dataset to the next.
blacklist = set()

for val in datasets:
    profiled = set(segmentcn.DepMap_ID)
    # Previously released lines that are absent from the current table.
    removed = set(prevcn[val]) - profiled
    newlines = set(new[val])
    # Lines expected to be new that are absent from the current table.
    missing = newlines - profiled
    blacklist = blacklist | (profiled - (prevcn[val] | newlines))

    INFO[val] = "# " + val + """ dataset:
                
## DNAseq Omics:

NEW LINES:
""" + str(newlines) + """

BLACKLIST:
""" + str(blacklist) + """

MISSING:
""" + str(missing) + """

REMOVED:
""" + str(removed)


In [None]:
# Gene-level expression table for the current sample set; the first column
# becomes the index, which the next cells compare against line IDs.
genes_tpm = pd.read_csv(
    'temp/{}/genes_tpm_logp1.csv'.format(SAMPLESETNAME),
    index_col=0,
)

In [None]:
# Lines reported under "REMOVED FOR QC REASONS" in the RNAseq section of INFO.
# Left empty here; edit by hand when there are QC failures to report.
rnafailed = []

In [None]:
# Append the RNAseq section to each dataset's release notes in INFO.
# The blacklist starts from one hard-coded line and is carried over from one
# dataset to the next.
blacklist = {'ACH-000658'}
for val in datasets:
    profiled = set(genes_tpm.index)
    # Previously released RNA lines that are absent from the current table.
    # (An earlier assignment computed from prev[val] was overwritten right
    # away and never read, so it has been dropped.)
    removed = set(prevrna[val]) - profiled
    newlines = set(new[val])
    # Lines expected to be new that are absent from the current table.
    missing = newlines - profiled
    # Lines in the current table that were neither released before nor listed as new.
    blacklist = (profiled - (prevrna[val] | newlines)) | blacklist

    INFO[val] += """


## RNAseq Omics:

NEW LINES:
"""+str(newlines)+"""

BLACKLIST:
"""+str(blacklist)+"""

MISSING:
"""+str(missing)+"""

REMOVED:
"""+str(removed)+"""
                
REMOVED FOR QC REASONS:
"""+str(rnafailed)


# Fusions

In [None]:
# Unfiltered and filtered fusion calls for the current sample set.
fusion_dir = 'temp/{}'.format(SAMPLESETNAME)
fusions = pd.read_csv(fusion_dir + '/fusions_latest.csv')
filtered = pd.read_csv(fusion_dir + '/filteredfusions_latest.csv')

In [None]:
# Lines printed under "removed for QC reasons" by the fusion cell below.
# Left empty here; edit by hand when there are QC failures to report.
failed = []

In [None]:
# For each dataset: report how the fusion calls differ from the previous
# release, drop blacklisted lines from both fusion tables, write them to temp/
# and upload them to that dataset's virtual Taiga dataset.
blacklist = {'ACH-000658'}
for val in datasets:
    print('_________________________________________________')
    print(val)

    print('not present')
    called = set(fusions.DepMap_ID)
    removed = set(prev[val]) - called
    print(removed)

    print('removed for QC reasons')
    print(failed)

    print('removed')
    removed = set(prevrna[val]) - called
    print(removed)

    newlines = set(new[val])
    missing = newlines - called
    # Lines with fusion calls that were neither released before nor listed as
    # new; carried over from one dataset to the next.
    blacklist = blacklist | (called - (prevrna[val] | newlines))
    print('missing')
    print(missing)
    print('blacklist')
    print(len(blacklist), blacklist)

    # Strip blacklisted lines from the unfiltered and the filtered table,
    # printing the row count before and after, then write each to disk.
    for table, path in ((fusions, 'temp/fusions.csv'),
                        (filtered, 'temp/filtered_fusions.csv')):
        print(len(table))
        kept = table[~table.DepMap_ID.isin(blacklist)]
        print(len(kept))
        kept.to_csv(path, index=False)

    # uploading to taiga
    uploads = [
        {"path": path, "name": name, "format": "TableCSV", "encoding": "utf-8"}
        for path, name in (
            ("temp/fusions.csv", "CCLE_fusions_unfiltered"),
            ("temp/filtered_fusions.csv", "CCLE_fusions"),
        )
    ]
    tc.update_dataset(
        virtual[val],
        changes_description='adding fusions',
        upload_files=uploads,
        dataset_description=INFO[val],
        add_all_existing_files=True,
    )

# Updating eternal

In [None]:
def findLatestVersion(dataset, approved_only=True, client=None):
    """Return ``"<permaname>.<version>"`` for the highest version of a dataset.

    Args:
        dataset: identifier passed to ``get_dataset_metadata``.
        approved_only: when True, only versions whose ``state`` is
            ``"approved"`` are considered; when False, every version is.
        client: object exposing ``get_dataset_metadata(dataset)``. Defaults to
            the notebook-level ``tc`` so existing calls keep working.

    Returns:
        The first entry of the metadata's ``permanames`` joined by a dot with
        the highest eligible version number.

    Raises:
        ValueError: if no eligible version with a positive number exists.
    """
    if client is None:
        client = tc
    data = client.get_dataset_metadata(dataset)
    eligible = [
        int(version['name'])
        for version in data['versions']
        if not approved_only or version['state'] == "approved"
    ]
    # The leading 0 covers "nothing eligible" and keeps non-positive version
    # numbers from ever being selected.
    latest_version = max([0] + eligible)
    if latest_version == 0:
        raise ValueError('could not find a version')
    return data['permanames'][0] + '.' + str(latest_version)

In [None]:
# Link the files of the latest approved internal release into the eternal dataset.
latest_version = findLatestVersion(virtual['internal'])

files = [
    # copy number
    "CCLE_gene_cn",
    "CCLE_segment_cn",
    # mutations
    "CCLE_mutations",
    "CCLE_mutations_bool_damaging",
    "CCLE_mutations_bool_nonconserving",
    "CCLE_mutations_bool_otherconserving",
    "CCLE_mutations_bool_hotspot",
    # expression
    "CCLE_expression_full",
    "CCLE_RNAseq_transcripts",
    "CCLE_RNAseq_reads",
    "CCLE_expression",
    "CCLE_expression_proteincoding_genes_expected_count",
    "CCLE_expression_transcripts_expected_count",
    # fusions
    "CCLE_fusions_unfiltered",
    "CCLE_fusions",
]

# One link per file, pointing at "<permaname>.<version>/<file name>".
eternal_links = [
    {"taiga_id": latest_version + "/" + name, "name": name}
    for name in files
]
tc.update_dataset(
    eternal_dataset,
    changes_description='new ' + SAMPLESETNAME + " omics dataset.",
    add_taiga_ids=eternal_links,
    add_all_existing_files=True,
)

# saving the current release version

In [None]:
! cd ../ccle_processing && git add . && git commit -m "depmap omics $samplesetname final" && git push

In [None]:
# Lists the current directory when run under IPython.
# NOTE(review): looks like a leftover scratch cell - confirm before removing.
ls 

In [None]:
# NOTE(review): `x` is not defined anywhere in the visible notebook, so this
# cell presumably raises NameError - possibly a deliberate stop for "Run All",
# possibly leftover scratch; confirm.
x