# Generating and evaluating validation<br>Adam Klie<br>12/08/2019<br>Script to predict generate and then evaluate prediction on validation

#### Import necessary packages

In [5]:
%matplotlib inline
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Read in key-value pairs and create pandas dataframe

In [6]:
# SRA BioSample key-value pairs
SRS_dir = "../data/allSRS_05_15_2018.pickle"
allSRS = pd.read_pickle(SRS_dir)

In [7]:
SRS_df = pd.DataFrame(allSRS).reset_index()
SRS_df.columns = ['srs', 'attribute', 'value']
SRS_df = SRS_df.set_index('srs')

#### Keep only samples with a TITLE longer than certain length and choose class to evaluate

In [8]:
title_len = 5 # min length of titles to predict on

In [9]:
SRS_df['word_count'] = (SRS_df['value'].str.count(' ') + 1)
validation_srs = SRS_df[(SRS_df['attribute'].isin(['TITLE'])) & (SRS_df['word_count'] >= title_len)].index
validation_all = SRS_df.loc[validation_srs].sample(validation_srs.shape[0])

In [10]:
# Filter out any samples with non-usable values
filterTextList = ['not collected','not applicable','missing','n[/]?a','unknown', '-', '--', 'none', 'no']
filterTextRegex = "|".join(map(lambda myStr:'(?:{})'.format(myStr), filterTextList))
filter_mask = validation_all['value'].str.contains(filterTextRegex, case=False)
validation_all = validation_all[~filter_mask]

#### Get all the annotations for those samples that have these TITLES

In [11]:
model_iter = 'simple'

In [59]:
srs_class = 'tissue'
predicted_class = 'Tissue'
nDupTextMax = 1  # number of duplicate values allowed
numSamples = 1000  # number of samples to evaluate for a give class

In [60]:
# Get a dataframe with the class values to try to predict for this specific class
tmp_df = validation_all[validation_all['attribute'] == srs_class]
tmp_df['value'].value_counts().head()

leaf                 275
Pancreatic islets    125
blood                123
liver                101
whole blood           92
Name: value, dtype: int64

In [61]:
total_samples = tmp_df.groupby(['value']).head(n = nDupTextMax).shape[0]
class_validation = tmp_df.groupby(['value']).head(n = nDupTextMax).sample(min(numSamples, total_samples))
class_validation.shape

(768, 3)

In [62]:
# Get the TITLES for this validation set
validation_sample_ids = class_validation.index
validation_samples = SRS_df.loc[validation_sample_ids]
validation_titles = validation_samples[
    validation_samples['attribute'].isin(['TITLE'])].reset_index().set_index(['srs', 'attribute'])
validation_set = validation_titles['value']  # get a series object compatible with prediction script

In [63]:
# See what these look like and print the number
validation_set.head()

srs         attribute
SRS2429072  TITLE                 RNAseq of Zea mays Mo17 zygotene anthers
SRS1414639  TITLE                     Xenopus borealis male, son of family
SRS679173   TITLE         Human CD4+ alpha chain TCR repertoire HV01_CD4+a
SRS583052   TITLE                      Patient AR prostate tumor tissue 18
SRS625786   TITLE        Model organism or animal sample from Orcinus orca
Name: value, dtype: object

In [64]:
display(pd.DataFrame(validation_set).head())

Unnamed: 0_level_0,Unnamed: 1_level_0,value
srs,attribute,Unnamed: 2_level_1
SRS2429072,TITLE,RNAseq of Zea mays Mo17 zygotene anthers
SRS1414639,TITLE,"Xenopus borealis male, son of family"
SRS679173,TITLE,Human CD4+ alpha chain TCR repertoire HV01_CD4+a
SRS583052,TITLE,Patient AR prostate tumor tissue 18
SRS625786,TITLE,Model organism or animal sample from Orcinus orca


In [65]:
class_validation.to_pickle(
'../results/{myclass}_validation_values.pickle'.format(model = model_iter, myclass = predicted_class))

In [66]:
validation_set.to_pickle(
    '../results/{myclass}_validation_set.pickle'.format(model = model_iter, myclass = predicted_class))

In [78]:
validations = ["Species", "Cell type", "Genotype", "Condition-Disease", "Tissue"]
test_SRSs = []
for valid in validations:
    if 
    curr_df = pd.read_pickle('../results/{myclass}_validation_set.pickle'.format(myclass = predicted_class))
    test_SRSs = test_SRSs + list(curr_df.index.get_level_values('srs'))

In [80]:
test_list = set(test_SRSs)

In [87]:
with open('../results/test_SRSs_{model}.txt'.format(model=model_iter), 'w') as f:
    f.writelines('\n'.join(test_list))