In [1]:
import pathlib
cwd = pathlib.Path.cwd()
import pandas
import re

In [2]:
min_pos = 250
min_ant = 250

ut_cancer_file = cwd.parent / "Data_Files" / "UTData_fs.csv"
ut_cancer_raw = cwd.parent / "Data_Files" / "Cancer_UTData_fs_RAW.xlsx"
ut_cancer_clean = cwd.parent / "Data_Files" / f"Cancer_UTData_fs_{min_pos}-{min_ant}.xlsx"

uh_cancer_file = cwd.parent / "Data_Files" / "Fusion_AllConfidence.pkl"
uh_cancer_raw = cwd.parent / "Data_Files" / "Cancer_UHData_RAW.xlsx"
uh_cancer_clean = cwd.parent / "Data_Files" / f"Cancer_UHData_{min_pos}-{min_ant}.xlsx"

In [3]:
def import_data_and_save(file_path: pathlib.Path, new_file_path: pathlib.Path, *args, **kwargs) -> pandas.DataFrame:
    '''
    '''
    data: pandas.DataFrame
    if file_path.suffix in ".csv":
        data = pandas.read_csv(file_path, *args, **kwargs)
        data.to_excel(new_file_path, sheet_name = "Sheet 1")

    elif file_path.suffix in ".xlsx":
        data = pandas.read_excel(file_path, *args, **kwargs)

    elif file_path.suffix in ".pkl":
        data = pandas.read_pickle(file_path, *args, **kwargs)
        data.to_excel(new_file_path, sheet_name = "Sheet 1")

    else:
        data = None

    return data

In [4]:
def filter_length(data: pandas.DataFrame, min_pos: int = 6, min_ant: int = 6):
    '''
    filters the data by length.
    '''
    data["Ant_Seq_Len"] = data["Anterior_Seq"].apply(lambda x: len(x) if isinstance(x, str) else pandas.NA)
    data["Pos_Seq_Len"] = data["Posterior_Seq"].apply(lambda x: len(x) if isinstance(x, str) else pandas.NA)
    data = data[(data["Ant_Seq_Len"] > 0) & (data["Pos_Seq_Len"] > 0)]

    data[f"Anterior_{min_pos}"] = data["Anterior_Seq"].apply(lambda x: x[0: min_ant] if len(x) >= min_ant else pandas.NA)
    data[f"Posterior_{min_ant}"] = data["Posterior_Seq"].apply(lambda x: x[len(x) - min_pos: len(x)] if len(x) >= min_pos else pandas.NA)

    data.dropna()
    data = data.reset_index()

    return data

In [5]:
ut_cancer = import_data_and_save(ut_cancer_file, ut_cancer_raw)

print(list(ut_cancer.columns))

['Henst', 'Tenst', 'DataType', 'Source', 'Ctype', 'Hgene', 'Hchr', 'Hbp', 'Hstrand', 'Tgene', 'Tchr', 'Tbp', 'Tstrand', 'BreakPoint', 'Seq', 'Hseq', 'Tseq', 'Length', 'Unnamed: 18', 'Unnamed: 19']


In [6]:
uh_cancer = import_data_and_save(uh_cancer_file, uh_cancer_raw)

print(list(uh_cancer.columns))
print(uh_cancer.shape)

['CANCER TYPE', 'Shipment Number', 'XS_MODEL', '#gene1', 'gene2', 'strand1(gene/fusion)', 'strand2(gene/fusion)', 'breakpoint1', 'breakpoint2', 'site1', 'site2', 'type', 'split_reads1', 'split_reads2', 'discordant_mates', 'coverage1', 'coverage2', 'confidence', 'reading_frame', 'tags', 'retained_protein_domains', 'closest_genomic_breakpoint1', 'closest_genomic_breakpoint2', 'gene_id1', 'gene_id2', 'transcript_id1', 'transcript_id2', 'direction1', 'direction2', 'filters', 'fusion_transcript', 'peptide_sequence', 'read_identifiers', 'Tail', 'Head']
(17377, 35)


In [7]:
new_columns = ["Posterior_ENST", "Anterior_ENST", "Cancer_Type", "Posterior_Name", "Anterior_Name", "Posterior_Strand", "Anterior_Strand", "Posterior_Chrm", "Anterior_Chrm", "Posterior_Seq", "Anterior_Seq", "Full_Seq"]
old_columns_ut = ["Henst", "Tenst", "Ctype", "Hgene", "Tgene", "Hstrand", "Tstrand", "Hchr", "Tchr", "Hseq", "Tseq", "Seq"]
old_columns_uh = ["transcript_id1", "transcript_id2", "CANCER TYPE", '#gene1', "gene2", "strand1(gene/fusion)", "strand2(gene/fusion)", "Head_Chrm", "Tail_Chrm", "Head", "Tail", "fusion_transcript"]

new_columns_ut = {old: new_columns[i] for i, old in enumerate(old_columns_ut)}
new_columns_uh = {old: new_columns[i] for i, old in enumerate(old_columns_uh)}

In [8]:
print(new_columns_ut)

{'Henst': 'Posterior_ENST', 'Tenst': 'Anterior_ENST', 'Ctype': 'Cancer_Type', 'Hgene': 'Posterior_Name', 'Tgene': 'Anterior_Name', 'Hstrand': 'Posterior_Strand', 'Tstrand': 'Anterior_Strand', 'Hchr': 'Posterior_Chrm', 'Tchr': 'Anterior_Chrm', 'Hseq': 'Posterior_Seq', 'Tseq': 'Anterior_Seq', 'Seq': 'Full_Seq'}


In [9]:
print(new_columns_uh)

{'transcript_id1': 'Posterior_ENST', 'transcript_id2': 'Anterior_ENST', 'CANCER TYPE': 'Cancer_Type', '#gene1': 'Posterior_Name', 'gene2': 'Anterior_Name', 'strand1(gene/fusion)': 'Posterior_Strand', 'strand2(gene/fusion)': 'Anterior_Strand', 'Head_Chrm': 'Posterior_Chrm', 'Tail_Chrm': 'Anterior_Chrm', 'Head': 'Posterior_Seq', 'Tail': 'Anterior_Seq', 'fusion_transcript': 'Full_Seq'}


In [10]:
ut_cancer_claned = ut_cancer.rename(columns = new_columns_ut)

ut_cancer_claned = filter_length(ut_cancer_claned, min_ant = min_ant, min_pos = min_pos)

print(ut_cancer_claned.shape)
print(list(ut_cancer_claned.columns))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Anterior_{min_pos}"] = data["Anterior_Seq"].apply(lambda x: x[0: min_ant] if len(x) >= min_ant else pandas.NA)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"Posterior_{min_ant}"] = data["Posterior_Seq"].apply(lambda x: x[len(x) - min_pos: len(x)] if len(x) >= min_pos else pandas.NA)


(56458, 25)
['index', 'Posterior_ENST', 'Anterior_ENST', 'DataType', 'Source', 'Cancer_Type', 'Posterior_Name', 'Posterior_Chrm', 'Hbp', 'Posterior_Strand', 'Anterior_Name', 'Anterior_Chrm', 'Tbp', 'Anterior_Strand', 'BreakPoint', 'Full_Seq', 'Posterior_Seq', 'Anterior_Seq', 'Length', 'Unnamed: 18', 'Unnamed: 19', 'Ant_Seq_Len', 'Pos_Seq_Len', 'Anterior_250', 'Posterior_250']


In [11]:
uh_cancer_claned = uh_cancer.rename(columns = new_columns_uh)
uh_cancer_claned["Posterior_Chrm"] = "Unknown"
uh_cancer_claned["Anterior_Chrm"] = "Unknown"
uh_cancer_claned = filter_length(uh_cancer_claned, min_pos = min_pos, min_ant = min_ant)
print(uh_cancer_claned.shape)


(17377, 42)


In [12]:
new_columns.append(f"Posterior_{min_pos}")
new_columns.append(f"Anterior_{min_pos}")

In [13]:
ut_cancer_claned = ut_cancer_claned[new_columns]

In [14]:
uh_cancer_claned = uh_cancer_claned[new_columns]

In [15]:
print(ut_cancer_claned.head())

    Posterior_ENST    Anterior_ENST Cancer_Type Posterior_Name Anterior_Name  \
0  ENST00000370243  ENST00000370225        UCEC          BCAR3         ABCA4   
1  ENST00000370247  ENST00000370225        UCEC          BCAR3         ABCA4   
2  ENST00000379446  ENST00000340645        UCEC          NEDD9        GOLGB1   
3  ENST00000379446  ENST00000393667        UCEC          NEDD9        GOLGB1   
4  ENST00000379433  ENST00000340645        UCEC          NEDD9        GOLGB1   

  Posterior_Strand Anterior_Strand Posterior_Chrm Anterior_Chrm  \
0                -               -           chr1          chr1   
1                -               -           chr1          chr1   
2                -               -           chr6          chr3   
3                -               -           chr6          chr3   
4                -               -           chr6          chr3   

                                       Posterior_Seq  \
0  GAGCGCGCTGGGACTCCCCGAGACGTGGCCCACGGCGGGGAGCGCT...   
1  A

In [16]:
ut_cancer_claned.to_excel(ut_cancer_clean, sheet_name = "Sheet 1")

In [17]:
uh_cancer_claned.to_excel(uh_cancer_clean, sheet_name = "Sheet 1")

: 