# 1. Explore Meta



In [None]:
import subprocess
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
def print_version(package):
    """Print a package's name and version on one aligned line.

    The name is left-aligned in a 10-character column and the version is
    right-aligned in a 15-character column, so consecutive calls line up.

    Args:
        package: Object exposing ``__name__`` and ``__version__``
            (typically an imported module).

    Raises:
        AttributeError: If either attribute is missing.
    """
    name = package.__name__
    version = package.__version__
    print("{:10} {:>15}".format(name, version))

In [None]:
def show_direc_tree(path: Path):
    """Print the directory tree rooted at ``path`` using the external ``tree`` tool.

    The path is handed to ``tree`` as a single argument, so directories whose
    names contain whitespace are passed through intact.

    Args:
        path: Directory to display.

    Raises:
        subprocess.CalledProcessError: If ``tree`` exits with a non-zero status.
        FileNotFoundError: If the ``tree`` executable cannot be found.
    """
    # Build the argv list directly; splitting a formatted command string on
    # whitespace would break a path that contains spaces into several arguments.
    subprocess.check_call(["tree", str(path)])

In [None]:
# The data directory is a sibling of the current working directory.
DATA_FOLDER = Path.cwd().parent / "data"
show_direc_tree(DATA_FOLDER)

In [None]:
# Record the versions of the third-party libraries used below.
for library in (sns, pd):
    print_version(library)

## Explore mutation

__NOTE__: `ccle_mutations.csv` is not an SV (structural variant) result.

In [None]:
# The path is already rooted at DATA_FOLDER, so it is passed to read_csv directly.
ccle_mutations_path = DATA_FOLDER / "meta/ccle_mutations.csv"
ccle_mutations = pd.read_csv(ccle_mutations_path)

In [None]:
# Preview the mutation table.
ccle_mutations

In [None]:
# List the column labels of the mutation table.
ccle_mutations.columns

In [None]:
# Distinct values found in the "Variant_Type" column.
ccle_mutations["Variant_Type"].unique()

In [None]:
# Keep only the rows whose "Variant_Type" is "DNP".
is_dnp = ccle_mutations["Variant_Type"] == "DNP"
ccle_mutations[is_dnp]

## Explore sample info

In [None]:
# Load the sample information table from the "meta" sub-folder.
sample_info_path = DATA_FOLDER.joinpath("meta/sample_info.csv")
sample_info_df = pd.read_csv(filepath_or_buffer=sample_info_path)

In [None]:
# Preview the sample information table.
sample_info_df

In [None]:
# Load "meta/table.csv".
# NOTE: the same file is read again further down into `rna_seq`.
all_sample_rna_seq_path = DATA_FOLDER.joinpath("meta/table.csv")
all_sample_rna_seq = pd.read_csv(filepath_or_buffer=all_sample_rna_seq_path)

In [None]:
# Preview the table loaded from "meta/table.csv".
all_sample_rna_seq

## Explore Svaba data

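[source](https://depmap.org/portal/download/?releasename=CCLE+2019&filename=Cell_lines_annotations_20181226.txt) (note: this link targets `Cell_lines_annotations_20181226.txt`, the annotations file loaded in the next section)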

In [None]:
# Location of the SvABA translocation spreadsheet.
svaba_path = DATA_FOLDER.joinpath("meta/ccle_translocations_svaba_20181221.xlsx")

In [None]:
# Parse the spreadsheet at `svaba_path` into a DataFrame.
svaba = pd.read_excel(svaba_path)

In [None]:
# Preview the SvABA table.
svaba

In [None]:
# List the column labels of the SvABA table.
svaba.columns

In [None]:
# Print a concise summary of the SvABA table.
svaba.info()

In [None]:
# Distinct values found in the "class" column.
svaba["class"].unique()

## Explore Svaba annotations info

In [None]:
# The annotation file is tab-delimited text.
cell_line_annotations_path = DATA_FOLDER.joinpath("meta/cell_lines_annotations_20181226.txt")
cell_line_annotations = pd.read_csv(cell_line_annotations_path, delimiter="\t")

In [None]:
# Preview the cell line annotation table.
cell_line_annotations

In [None]:
# List the column labels of the cell line annotation table.
cell_line_annotations.columns

## Explore RNA-seq sample info

In [None]:
# Despite its name, `rna_seq_info` holds the path to the table, not the table itself.
rna_seq_info = DATA_FOLDER.joinpath("meta/table.csv")

In [None]:
# Load the CSV at `rna_seq_info` into a DataFrame.
rna_seq = pd.read_csv(rna_seq_info)

In [None]:
# Preview the RNA-seq sample table.
rna_seq

In [None]:
# List the column labels of the RNA-seq sample table.
rna_seq.columns

In [None]:
# Print a concise summary of the RNA-seq sample table.
rna_seq.info()

In [None]:
# Distinct sample identifiers from each table, held as sets.
rna_seq_sample_name = set(rna_seq["Sample Name"])
svaba_ccle_name = set(svaba["CCLE_name"])

In [None]:
# Number of distinct "Sample Name" values in the RNA-seq table.
len(rna_seq_sample_name)

In [None]:
# Number of distinct "CCLE_name" values in the SvABA table.
len(svaba_ccle_name)

### Find RNA-seq download links for SvABA samples

In [None]:
# Note: "SKMEL2_SKIN" is not in the RNA-seq dataset, so it is removed before the
# label lookup (`.loc` raises KeyError when a requested label is missing).
# The labels are sorted so the resulting row order is reproducible; iterating a
# set of strings directly can yield a different order in each interpreter session.
svaba_sample_names = sorted(svaba_ccle_name - {"SKMEL2_SKIN"})
svaba_rna_seq_data = rna_seq.set_index("Sample Name").loc[svaba_sample_names, :].reset_index()

In [None]:
# `base_url(acc)` substitutes `acc` for the "{}" placeholder of the template
# and returns the resulting URL string.
_FASTQ_URL_TEMPLATE = "https://trace.ncbi.nlm.nih.gov/Traces/sra-reads-be/fastq?acc={}"
base_url = _FASTQ_URL_TEMPLATE.format

In [None]:
# Collect the "Run" column as a plain Python list.
run_column = svaba_rna_seq_data["Run"]
tmp = run_column.tolist()

In [None]:
# Inspect the list of "Run" values.
tmp

In [None]:
# Output file for the download list.
svaba_rna_seq_data_file = DATA_FOLDER.joinpath("meta/svaba_rna_seq_download.txt")

In [None]:
# One line per run: "<run>.fq.gz", a tab, then the URL built by `base_url`.
with open(svaba_rna_seq_data_file, "w") as out_file:
    out_file.writelines(f"{run}.fq.gz\t{base_url(run)}\n" for run in tmp)