# R72

In [None]:
# R72:
    # HCMI -> CPTAC, 2 cancer types
    # cell line -> CPTAC, 5 cancer types

#2024-03-12
    # Devel on Tybalt

In [1]:
%whos

Interactive namespace is empty.


In [2]:
# Data handling & comparision setup
import pandas as pd
import glob
import itertools

# Feature selection, RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Import 3 SciKit Learn classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

# SVM
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Classification setup and eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import statistics

# Plotting
import matplotlib.pyplot as plt

### Read data

#### Input read

In [3]:
# phase = 'Input'
inp_pths = sorted(
    glob.glob('../data/r72/*.tsv'), reverse = True)
inp_pths

['../data/r72/nc5_transcriptomics_cell-line+CPTAC.tsv',
 '../data/r72/nc2_transcriptomics_HCMI+CPTAC.tsv']

##### Check label capitalization on input paths

In [None]:
for inp_pth in inp_pths:
    df = pd.read_csv(
        inp_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

In [None]:
# Conclusion - MBatch rebuild script converted to caps
    # archive caps versions
    # conver to lowercase and re-write

#### MBatch

In [None]:
# phase = 'MBatch'
mb_pths = sorted(
    glob.glob('../results/r72/mbatch/*.tsv'), reverse = True)
mb_pths

##### MBatch i-o de-capitilize

In [None]:
for mb_pth in mb_pths:
    df = pd.read_csv(
        mb_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

In [None]:
mb_pths = sorted(
    glob.glob('../results/r72/mbatch/cap_lbls_archive/*.tsv'), reverse = True)
for mb_pth in mb_pths:
    # print(mb_pth)
    f_nm = mb_pth.split('/')[-1]
    df = pd.read_csv(
        mb_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

    df['System'] = df['System'].str.lower()
    
    print(df.System.unique())

    df.to_csv(
        '../results/r72/mbatch/'+f_nm,
        sep = '\t')
    print(' ')
    # break

In [None]:
f_nm = mb_pth.split('/')[-1]

In [None]:
f_nm

#### VAE

In [None]:
ls ../results/r72/vae/

In [None]:
# phase = 'VAE'
vae_pths = sorted(
    glob.glob('../results/r72/vae/*.tsv'), reverse = True)
vae_pths

#### Tybalt

In [None]:
ls ../results/r72/tybalt/

In [6]:
tb_pths = sorted(
    glob.glob('../results/r72/tybalt/*.tsv'), reverse = True)
tb_pths

['../results/r72/tybalt/nc5_transcriptomics_cell-line+CPTAC.250-ltnt-dim_12-epchs.tsv',
 '../results/r72/tybalt/nc2_transcriptomics_HCMI+CPTAC.250-ltnt-dim_12-epchs.tsv']

In [None]:
# skip RFE for now, start with 3 clf on raw and mbatch

### RFE

In [None]:
for pth in inp:
    df = pd.read_csv(
        pth, sep = '\t', index_col = 0)
    X = df.iloc[:, 2:]
    break

In [None]:
df

In [None]:
estimator = SVR(kernel="linear")
rfe = RFE(estimator=estimator, n_features_to_select=10, step=0.5)
file_n = pd.read_csv(pth, sep = '\t', index_col = 0) # read validation split file
X = file_n.iloc[:, 1:]
y = file_n.iloc[:, 0]
rfe.fit(X, y)
mask = rfe.support_
vs_dict[vs] = json.dumps(list(X.columns[mask]))

### Subsample cross val

In [None]:
# Sub-sample input at 80%
# match row index on corr df

In [7]:
inp_pths

['../data/r72/nc5_transcriptomics_cell-line+CPTAC.tsv',
 '../data/r72/nc2_transcriptomics_HCMI+CPTAC.tsv']

In [8]:
tb_pths

['../results/r72/tybalt/nc5_transcriptomics_cell-line+CPTAC.250-ltnt-dim_12-epchs.tsv',
 '../results/r72/tybalt/nc2_transcriptomics_HCMI+CPTAC.250-ltnt-dim_12-epchs.tsv']

In [None]:
vae_pths

In [None]:
# Questions - sub-sample on only the input?
# Predict on entire CPTAC?

# split before subsample? - yes, that's what X was

In [9]:
# Input load
input_df = pd.read_csv(
                inp_pths[0], # nc5 file
                sep = '\t',
                index_col = 0)
print('inpt len:', len(input_df))

inpt len: 827


In [None]:
# VAE load
corr_df = pd.read_csv(
        vae_pths[0], sep = '\t', index_col = 0)
print('vae latent len:', len(corr_df))

In [10]:
# Tybalt load
corr_df = pd.read_csv(
        tb_pths[0], sep = '\t', index_col = 0)
print('vae latent len:', len(corr_df))

vae latent len: 827


In [None]:
# Correction load, method to evaluate
corr_df = pd.read_csv(
        vae_pths[0], sep = '\t', index_col = 0)
print('vae latent len:', len(corr_df))

In [None]:
# MBatch


In [None]:
# Cellinger


### Clf

In [None]:
# Options - store in DF

In [20]:
# print('Begin clf on ', phase)
print('Begin sub-sample matched-cross val clssfctn evals')
print(' ')
print('Input file:\n', inp_pths[0])
print(' ')

# print('Corrected file:\n', vae_pths)
# print(' ')

clf_list = [RandomForestClassifier(max_depth=2, random_state=0),
            tree.DecisionTreeClassifier(),
            LogisticRegression(random_state=0, max_iter=5000),
            make_pipeline(StandardScaler(),
                LinearSVC(dual="auto", random_state=0,
                          max_iter=5000, tol=1e-5))]

inp_mdl_sys_train_df = input_df[input_df.System == 'cell-line']
X_input = inp_mdl_sys_train_df.iloc[:, 2:]

corr_mdl_sys_train_df = corr_df[corr_df.System == 'cell-line']
X_corr = corr_mdl_sys_train_df.iloc[:, 2:]

# Sub
input_dict = {}
corr_dict = {}
x_val_lst = 

for cross_val in list(range(0,5)):
    print(cross_val)
    break
    x_val_lst.append(cross_val)
    # Train set sub-sample, input - X and y
    X_input = X_input.sample(round(len(X_input)*.8))
    y_input = inp_mdl_sys_train_df[inp_mdl_sys_train_df.index.isin(
                                        X_input.index)].Cancer_type

    # Train set sub-sample, corr - X and y
    X_corr = X_corr[X_corr.index.isin(X_input.index)]
    y_corr = corr_mdl_sys_train_df[corr_mdl_sys_train_df.index.isin(
                                        X_corr.index)].Cancer_type

    # Test set extraction, input and corr
    CPTAC_input = input_df[input_df.System == 'cptac']
    CPTAC_corr = corr_df[corr_df.System == 'cptac']

    print(len(CPTAC_input))
    print(len(CPTAC_corr))
    
    
    # Store scores by clf
    
    clf_i_list = ['RF', 'DT', 'LR', 'SVM']

    # Classifier loop, input
    # inp_clf_score_dict = {}
    print('Start input loop')
    input_score_list = []
    for clf_i, clf in enumerate(clf_list):
        print(clf)
        print(clf_i_list[clf_i])

        clf.fit(X_input,
                y_input)
    
        score = f1_score(CPTAC_input.Cancer_type,
                         clf.predict(CPTAC_input.iloc[:, 2:]),
                         average = 'weighted')
        
        # inp_clf_score_list.append(score)
        input_dict[clf_i_list[clf_i]] = inp_clf_score_list    
    print(' ')

    # Classifier loop, correction method
    corr_clf_score_dict = {}
    print('Start correction eval loop')
    corr_clf_score_list = []
    for clf_i, clf in enumerate(clf_list):
        # clf = Scikit-learn.method ?
        print(clf)
        print(clf_i_list[clf_i])

        clf.fit(X_corr,
                y_corr)
    
        score = f1_score(CPTAC_corr.Cancer_type,
                         clf.predict(CPTAC_corr.iloc[:, 2:]),
                         average = 'weighted')
        
        corr_clf_score_list.append(score)
    corr_clf_score_dict[clf_i_list[clf_i]] = corr_clf_score_list    
    print(' ')

    print('end cross fold one')
    break
print('done')

SyntaxError: invalid syntax (4130707898.py, line 26)

In [19]:
pd.DataFrame(inp_clf_score_dict)

Unnamed: 0,SVM
0,0.058059
1,0.063249
2,0.058059
3,0.053324


In [18]:
pd.DataFrame(corr_clf_score_dict)

Unnamed: 0,SVM
0,0.449844
1,0.293701
2,0.434028
3,0.263939


In [None]:
pd.DataFrame(y_corr).value_counts()

In [None]:
pd.DataFrame(y_input).value_counts()

#### Devel

In [None]:
df.System.unique()

In [None]:
df.System.value_counts()

In [None]:
df.Cancer_type.unique()

In [None]:
df.Cancer_type.value_counts()

#### Run 1

In [None]:
train_list = ['cell-line', 'hcmi']

# toggle_val = 1

# for trn_i, pth in enumerate(inp[toggle_val]):
for pth in [phase_pths[toggle_val]]:
    # break
    df = pd.read_csv(
        pth, sep = '\t', index_col = 0)
    # break
    # Train on cell line, n = 261
    non_human_train_df = df[df.System == train_list[toggle_val]]
    X = non_human_train_df.iloc[:, 2:]

    CPTAC_df = df[df.System == 'cptac']

    # Store scores by clf
    clf_i_list = ['RF', 'DT', 'LR',]
    clf_score_dict = {}
    for clf_i, clf in enumerate(clf_list):

        for cv_split in list(range(0,1)):

            clf.fit(X,
                    non_human_train_df.Cancer_type)
    
            score = f1_score(CPTAC_df.Cancer_type,
                             clf.predict(CPTAC_df.iloc[:, 2:]),
                             average = 'weighted')
            clf_score_dict[clf_i_list[clf_i]] = score
            # break
        print(' ')
        # break
    # break # cell line, HCMI
print('done')

#### Notes

In [None]:
# clf = tree.DecisionTreeClassifier()
# clf = LogisticRegression(random_state=0).fit(X, y)