# R72

In [None]:
# R72:
    # HCMI -> CPTAC, 5 cancer types
    # cell line -> CPTAC, 5 cancer types

In [None]:
%whos

In [3]:
# Data handling & comparision setup
import pandas as pd
import glob
import itertools

# Feature selection
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Import 3 SciKit Learn classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression

# Classification setup and eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import statistics

# Plotting
import matplotlib.pyplot as plt

### Read data

#### Input read

In [4]:
# phase = 'Input'
inp_pths = sorted(
    glob.glob('../data/r72/*.tsv'), reverse = True)
inp_pths

['../data/r72/nc5_transcriptomics_cell-line+CPTAC.tsv',
 '../data/r72/nc2_transcriptomics_HCMI+CPTAC.tsv']

##### Check label capitalization on input paths

In [9]:
for inp_pth in inp_pths:
    df = pd.read_csv(
        inp_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

['cell-line' 'cptac']
['cptac' 'hcmi']


In [None]:
# Conclusion - MBatch rebuild script converted to caps
    # archive caps versions
    # conver to lowercase and re-write

#### MBatch

In [22]:
# phase = 'MBatch'
mb_pths = sorted(
    glob.glob('../results/r72/mbatch/*.tsv'), reverse = True)
mb_pths

['../results/r72/mbatch/nc5_transcriptomics_cell-line+CPTAC.tsv',
 '../results/r72/mbatch/nc2_transcriptomics_HCMI+CPTAC.tsv']

##### MBatch i-o de-capitilize

In [15]:
for mb_pth in mb_pths:
    df = pd.read_csv(
        mb_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

['cell-line' 'CPTAC']
['CPTAC' 'HCMI']


In [21]:
mb_pths = sorted(
    glob.glob('../results/r72/mbatch/cap_lbls_archive/*.tsv'), reverse = True)
for mb_pth in mb_pths:
    # print(mb_pth)
    f_nm = mb_pth.split('/')[-1]
    df = pd.read_csv(
        mb_pth, sep = '\t', index_col = 0)
    print(df.System.unique())

    df['System'] = df['System'].str.lower()
    
    print(df.System.unique())

    df.to_csv(
        '../results/r72/mbatch/'+f_nm,
        sep = '\t')
    print(' ')
    # break

['cell-line' 'CPTAC']
['cell-line' 'cptac']
 
['CPTAC' 'HCMI']
['cptac' 'hcmi']
 


In [19]:
f_nm = mb_pth.split('/')[-1]

In [20]:
f_nm

'nc5_transcriptomics_cell-line+CPTAC.tsv'

#### VAE

In [23]:
ls ../results/r72/vae/

2024-03-11-latent_space_1.tsv


In [5]:
# phase = 'VAE'
vae_pths = sorted(
    glob.glob('../results/r72/vae/*.tsv'))
vae_pths

['../results/r72/vae/2024-03-11-latent_space_1.tsv']

#### Tybalt

In [None]:
vae # run Tybalt for baseline

In [None]:
# skip RFE for now, start with 3 clf on raw and mbatch

### RFE

In [None]:
for pth in inp:
    df = pd.read_csv(
        pth, sep = '\t', index_col = 0)
    X = df.iloc[:, 2:]
    break

In [None]:
df

In [None]:
estimator = SVR(kernel="linear")
rfe = RFE(estimator=estimator, n_features_to_select=10, step=0.5)
file_n = pd.read_csv(pth, sep = '\t', index_col = 0) # read validation split file
X = file_n.iloc[:, 1:]
y = file_n.iloc[:, 0]
rfe.fit(X, y)
mask = rfe.support_
vs_dict[vs] = json.dumps(list(X.columns[mask]))

### Clf

In [18]:
corr_df.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       556, 557, 558, 559, 560, 561, 562, 563, 564, 565],
      dtype='int64', length=827)

In [22]:
# print('Begin 3 clf on ', phase)
print('Begin 3 clf on VAE n5')
print(' ')
print('Input file:\n', inp_pths[0])
print(' ')

print('Corrected file:\n', vae_pths)
print(' ')

clf_list = [RandomForestClassifier(max_depth=2, random_state=0),
            tree.DecisionTreeClassifier(),
            LogisticRegression(random_state=0, max_iter=5000)]

# Input
input_df = pd.read_csv(
        inp_pths[0], sep = '\t', index_col = 0)
print(len(input_df))

mdl_sys_train_df = input_df[input_df.System == 'cell-line']
X = mdl_sys_train_df.iloc[:, 2:]

CPTAC_df = input_df[input_df.System == 'cptac']

# # Corrected
# corr_df = pd.read_csv(
#         vae_pths[0], sep = '\t', index_col = 0)
# print(len(corr_df))

# mdl_sys_train_df = corr_df[corr_df.System == 'cell-line']
# X = mdl_sys_train_df.iloc[:, 2:]

# CPTAC_df = corr_df[corr_df.System == 'cptac']

# Store scores by clf
clf_score_dict = {}

clf_i_list = ['RF', 'DT', 'LR',]
for clf_i, clf in enumerate(clf_list):
    print(clf)
    print(clf_i_list[clf_i])

    clf_score_list = []
    for cv_split in list(range(0,5)):
        # X = X.sample(round(len(X)*.8))
        
        # clf.fit(X,
        #         mdl_sys_train_df[mdl_sys_train_df.index.isin(X.index)].Cancer_type)

        clf.fit(X,
                mdl_sys_train_df.Cancer_type)

        score = f1_score(CPTAC_df.Cancer_type,
                         clf.predict(CPTAC_df.iloc[:, 2:]),
                         average = 'weighted')
        clf_score_list.append(score)
    clf_score_dict[clf_i_list[clf_i]] = clf_score_list
    print(' ')

print('done')

Begin 3 clf on VAE n5
 
Input file:
 ../data/r72/nc5_transcriptomics_cell-line+CPTAC.tsv
 
Corrected file:
 ['../results/r72/vae/2024-03-11-latent_space_1.tsv']
 
827
RandomForestClassifier(max_depth=2, random_state=0)
RF
 
DecisionTreeClassifier()
DT
 
LogisticRegression(max_iter=5000, random_state=0)
LR
 
done


In [21]:
clf_score_dict # vae 2024-03-11

{'RF': [0.11401331405611255,
  0.11401331405611255,
  0.11401331405611255,
  0.11401331405611255,
  0.11401331405611255],
 'DT': [0.19670891991181316,
  0.20329576598258636,
  0.18786488096404189,
  0.1916039711854222,
  0.18150572541678622],
 'LR': [0.19306558128186152,
  0.19306558128186152,
  0.19306558128186152,
  0.19306558128186152,
  0.19306558128186152]}

In [23]:
clf_score_dict # input

{'RF': [0.23296980103409473,
  0.23296980103409473,
  0.23296980103409473,
  0.23296980103409473,
  0.23296980103409473],
 'DT': [0.05305659015906366,
  0.05305659015906366,
  0.05305659015906366,
  0.05305659015906366,
  0.05305659015906366],
 'LR': [0.4163660594458076,
  0.4163660594458076,
  0.4163660594458076,
  0.4163660594458076,
  0.4163660594458076]}

#### Devel

In [None]:
df.System.unique()

In [None]:
df.System.value_counts()

In [None]:
df.Cancer_type.unique()

In [None]:
df.Cancer_type.value_counts()

#### Run 1

In [None]:
train_list = ['cell-line', 'hcmi']

# toggle_val = 1

# for trn_i, pth in enumerate(inp[toggle_val]):
for pth in [phase_pths[toggle_val]]:
    # break
    df = pd.read_csv(
        pth, sep = '\t', index_col = 0)
    # break
    # Train on cell line, n = 261
    non_human_train_df = df[df.System == train_list[toggle_val]]
    X = non_human_train_df.iloc[:, 2:]

    CPTAC_df = df[df.System == 'cptac']

    # Store scores by clf
    clf_i_list = ['RF', 'DT', 'LR',]
    clf_score_dict = {}
    for clf_i, clf in enumerate(clf_list):

        for cv_split in list(range(0,1)):

            clf.fit(X,
                    non_human_train_df.Cancer_type)
    
            score = f1_score(CPTAC_df.Cancer_type,
                             clf.predict(CPTAC_df.iloc[:, 2:]),
                             average = 'weighted')
            clf_score_dict[clf_i_list[clf_i]] = score
            # break
        print(' ')
        # break
    # break # cell line, HCMI
print('done')

#### Notes

In [None]:
# clf = tree.DecisionTreeClassifier()
# clf = LogisticRegression(random_state=0).fit(X, y)