# R72

In [None]:
# R72:
    # cell line -> CPTAC, 5 cancer types
    # HCMI -> CPTAC, 2 cancer types

In [1]:
%whos

Interactive namespace is empty.


In [2]:
# Data handling & comparision setup
import pandas as pd
import glob
import itertools

# Feature selection, RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Import 3 SciKit Learn classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

# SVM
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Classification setup and eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import statistics

# Plotting
import matplotlib.pyplot as plt

### Set data paths

#### Input read

In [3]:
# phase = 'Input'
inp_pths = sorted(
    glob.glob('../data/r72/*.tsv'), reverse = True)
inp_pths

['../data/r72/nc5_transcriptomics_cell-line+CPTAC.tsv',
 '../data/r72/nc2_transcriptomics_HCMI+CPTAC.tsv']

##### Check label capitalization on input paths

In [None]:
for inp_pth in inp_pths:
    df = pd.read_csv(
        inp_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

In [None]:
# Conclusion - MBatch rebuild script converted to caps
    # archive caps versions
    # conver to lowercase and re-write

#### MBatch

In [None]:
# phase = 'MBatch'
mb_pths = sorted(
    glob.glob('../results/r72/mbatch/*.tsv'), reverse = True)
mb_pths

##### MBatch i-o de-capitilize

In [None]:
for mb_pth in mb_pths:
    df = pd.read_csv(
        mb_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

In [None]:
mb_pths = sorted(
    glob.glob('../results/r72/mbatch/cap_lbls_archive/*.tsv'), reverse = True)
for mb_pth in mb_pths:
    # print(mb_pth)
    f_nm = mb_pth.split('/')[-1]
    df = pd.read_csv(
        mb_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

    df['System'] = df['System'].str.lower()
    
    print(df.System.unique())

    df.to_csv(
        '../results/r72/mbatch/'+f_nm,
        sep = '\t')
    print(' ')
    # break

In [None]:
f_nm = mb_pth.split('/')[-1]

In [None]:
f_nm

#### VAE

In [None]:
ls ../results/r72/vae/

In [None]:
# phase = 'VAE'
vae_pths = sorted(
    glob.glob('../results/r72/vae/*.tsv'), reverse = True)
vae_pths

#### Tybalt

In [None]:
ls ../results/r72/tybalt/

In [8]:
tb_pths = sorted(
    glob.glob('../results/r72/tybalt/*.tsv'), reverse = True)
tb_pths

['../results/r72/tybalt/nc5_transcriptomics_cell-line+CPTAC.250-ltnt-dim_12-epchs.tsv',
 '../results/r72/tybalt/nc2_transcriptomics_HCMI+CPTAC.250-ltnt-dim_12-epchs.tsv']

In [None]:
# skip RFE for now, start with 3 clf on raw and mbatch

### RFE

In [None]:
for pth in inp:
    df = pd.read_csv(
        pth, sep = '\t', index_col = 0)
    X = df.iloc[:, 2:]
    break

In [None]:
df

In [None]:
estimator = SVR(kernel="linear")
rfe = RFE(estimator=estimator, n_features_to_select=10, step=0.5)
file_n = pd.read_csv(pth, sep = '\t', index_col = 0) # read validation split file
X = file_n.iloc[:, 1:]
y = file_n.iloc[:, 0]
rfe.fit(X, y)
mask = rfe.support_
vs_dict[vs] = json.dumps(list(X.columns[mask]))

### Subsample cross val devel

In [None]:
# Sub-sample input at 80%
# match row index on corr df

In [None]:
inp_pths

In [None]:
tb_pths

In [None]:
vae_pths

In [None]:
# Questions - sub-sample on only the input?
# Predict on entire CPTAC?

# split before subsample? - yes, that's what X was

### Load data

In [12]:
# Input load
input_df = pd.read_csv(
                inp_pths[0], # nc5 file
                sep = '\t',
                index_col = 0)
print('inpt len:', len(input_df))

inpt len: 827


In [None]:
# MBatch load
corr_df = pd.read_csv(
        mb_pths[0], sep = '\t', index_col = 0)
print('MB corr len:', len(corr_df))

In [None]:
# VAE load
corr_df = pd.read_csv(
        vae_pths[0], sep = '\t', index_col = 0)
print('vae latent len:', len(corr_df))

In [10]:
# Tybalt load
corr_df = pd.read_csv(
        tb_pths[0], sep = '\t', index_col = 0)
print('Tybalt latent len:', len(corr_df))

Tybalt latent len: 827


In [None]:
# Cellinger


### Clf

In [22]:
# print('Begin clf on ', phase)
print('Begin sub-sampled, matched-cross val\n classification evals')
print(' ')
print('Input file:\n', inp_pths[0])
print(' ')

clf_list = [RandomForestClassifier(max_depth=2, random_state=0),
            tree.DecisionTreeClassifier(),
            LogisticRegression(random_state=0, max_iter=5000),
            make_pipeline(StandardScaler(),
                LinearSVC(dual="auto", random_state=0,
                          max_iter=100000, tol=1e-5))]

# Subset training input
inp_mdl_sys_train_df = input_df[input_df.System == 'cell-line']

# Subset correction object, toggle with file read
corr_mdl_sys_train_df = corr_df[corr_df.System == 'cell-line']

# Hard-code for HCMI
# hold

# List of dicts
results = []

for cross_val in range(30): # chk
    print('cross_val', cross_val)

    # Train set sub-sample, input - X and y
    X_input = inp_mdl_sys_train_df.iloc[:, 2:]
    X_input = X_input.sample(round(len(X_input)*.85))
    y_input = inp_mdl_sys_train_df[inp_mdl_sys_train_df.index.isin(
                                        X_input.index)].Cancer_type

    # Train set sub-sample, corr - X and y
    X_corr = corr_mdl_sys_train_df.iloc[:, 2:]
    X_corr = X_corr[X_corr.index.isin(X_input.index)]
    y_corr = corr_mdl_sys_train_df[corr_mdl_sys_train_df.index.isin(
                                        X_corr.index)].Cancer_type

    # Test set extraction, input and corr
    CPTAC_input = input_df[input_df.System == 'cptac']
    CPTAC_corr = corr_df[corr_df.System == 'cptac']

    # print(len(CPTAC_input))
    # print(len(CPTAC_corr))
    
    clf_i_list = ['RF', 'DT', 'LR', 'SVM']

    # Classifier loop, input
    print('Start input loop')
    input_score_list = []
    for clf_i in range(len(clf_list)):
        clf = clf_list[clf_i]
        clf.fit(X_input,
                y_input)
    
        score = f1_score(CPTAC_input.Cancer_type,
                         clf.predict(CPTAC_input.iloc[:, 2:]),
                         average = 'weighted')
        
        results.append({
            'cross_val': cross_val,
            'classifier': clf_i_list[clf_i],
            'score': score,
            'type': 'input'
        })
        # break
    # Classifier loop, correction method
    print('Start correction eval loop')
    for clf_i in range(len(clf_list)):
        clf = clf_list[clf_i]
        clf.fit(X_corr,
                y_corr)
    
        score = f1_score(CPTAC_corr.Cancer_type,
                         clf.predict(CPTAC_corr.iloc[:, 2:]),
                         average = 'weighted')
        results.append({
            'cross_val': cross_val,
            'classifier': clf_i_list[clf_i],
            'score': score,
            'type': 'corrected'
        })

    # print('end cross fold')
    print(' ')
    # break
print('done')

Begin sub-sampled, matched-cross val
 classification evals
 
Input file:
 ../data/r72/nc5_transcriptomics_cell-line+CPTAC.tsv
 
cross_val 0
Start input loop
Start correction eval loop
 
cross_val 1
Start input loop
Start correction eval loop
 
cross_val 2
Start input loop
Start correction eval loop
 
cross_val 3
Start input loop
Start correction eval loop
 
cross_val 4
Start input loop
Start correction eval loop
 
cross_val 5
Start input loop
Start correction eval loop
 
cross_val 6
Start input loop
Start correction eval loop
 
cross_val 7
Start input loop
Start correction eval loop
 
cross_val 8
Start input loop
Start correction eval loop
 
cross_val 9
Start input loop
Start correction eval loop
 
cross_val 10
Start input loop
Start correction eval loop
 
cross_val 11
Start input loop
Start correction eval loop
 
cross_val 12
Start input loop
Start correction eval loop
 
cross_val 13
Start input loop
Start correction eval loop
 
cross_val 14
Start input loop
Start correction eval loop

In [23]:
results_df = pd.DataFrame(results)
# results_df

In [24]:
pd.options.display.max_rows = 300

In [25]:
results_df

Unnamed: 0,cross_val,classifier,score,type
0,0,RF,0.098238,input
1,0,DT,0.072313,input
2,0,LR,0.055562,input
3,0,SVM,0.028091,input
4,0,RF,0.371692,corrected
5,0,DT,0.444772,corrected
6,0,LR,0.433568,corrected
7,0,SVM,0.230526,corrected
8,1,RF,0.210937,input
9,1,DT,0.055709,input


In [26]:
results_df.to_csv('../results/r72/input-tybalt_transfer-lrn_x-val=30.tsv',
                 sep = '\t')