In [116]:
import pandas as pd
import numpy as np
import collections
from scipy import stats

In [120]:
def readSimilarityMatrix(filename):
    """
    read in sample by sample similarity matrix
    input: file location
    return: pandas data frame 
    """
    similarity = pd.read_csv(filename, delimiter = "\t",skiprows=0, index_col=0)
#     similarity = similarity.set_index('sample')
#     similarity = similarity.set_index([[0]])
    
    return similarity

similarity = readSimilarityMatrix("/Users/Alexis/Desktop/StuartRotation/data/GBM_simMatrix.pancan.atlas.imputed.tsv")

similarity_test = readSimilarityMatrix("/Users/Alexis/Desktop/samplemat.txt")
similarity_test2 = readSimilarityMatrix("/Users/Alexis/Desktop/pancansample.tsv")

print(similarity_test2)
# print(similarity)

#['TCGA-06-0125-01A-01R-1849-01'] occurs twice? 
# check if vals are equal and delete

                              TCGA-02-0047-01A-01R-1849-01  \
TCGA-02-0047-01A-01R-1849-01                        1.0000   
TCGA-02-0055-01A-01R-1849-01                        0.0300   
TCGA-02-2483-01A-01R-1849-01                        0.7234   
TCGA-02-2485-01A-01R-1849-01                        0.2200   

                              TCGA-02-0055-01A-01R-1849-01  \
TCGA-02-0047-01A-01R-1849-01                      0.359671   
TCGA-02-0055-01A-01R-1849-01                      1.000000   
TCGA-02-2483-01A-01R-1849-01                      0.359671   
TCGA-02-2485-01A-01R-1849-01                      0.359671   

                              TCGA-02-2483-01A-01R-1849-01  \
TCGA-02-0047-01A-01R-1849-01                      0.420936   
TCGA-02-0055-01A-01R-1849-01                      0.420936   
TCGA-02-2483-01A-01R-1849-01                      1.000000   
TCGA-02-2485-01A-01R-1849-01                      0.420936   

                              TCGA-02-2485-01A-01R-1849-01  
TCGA-0

In [125]:
def getSampleRankings(similarity_matrix):
    """
    calculate ranking of samples by similarity to each sample
    input: similarity matrix
    return: sample by rank matrix, containing sample names ordered by similarity to row name
    """
    samples = (list(similarity_matrix.index)) #row names
    sample_ranking_mat = pd.DataFrame(index = samples) #new matrix with same row names
    
    for sample in samples:
        #sort row, get list of ranked sample names
        ranked_samples = (similarity_matrix.loc[sample].sort_values(ascending=False).index.tolist())
        sample_ranking_mat[sample] = ranked_samples
    
    #transform
    sample_ranking_mat = sample_ranking_mat.T
    
    #redo header line
    sample_ranking_mat.columns = range(len(samples))
        
    #remove first column
    sample_ranking_mat = sample_ranking_mat.drop(sample_ranking_mat.columns[0], axis=1)
    
    return sample_ranking_mat

ranked_samples = getSampleRankings(similarity_test2)

print(ranked_samples)

                                                         1  \
TCGA-02-0047-01A-01R-1849-01  TCGA-02-2485-01A-01R-1849-01   
TCGA-02-0055-01A-01R-1849-01  TCGA-02-2485-01A-01R-1849-01   
TCGA-02-2483-01A-01R-1849-01  TCGA-02-2485-01A-01R-1849-01   
TCGA-02-2485-01A-01R-1849-01  TCGA-02-2483-01A-01R-1849-01   

                                                         2  \
TCGA-02-0047-01A-01R-1849-01  TCGA-02-2483-01A-01R-1849-01   
TCGA-02-0055-01A-01R-1849-01  TCGA-02-2483-01A-01R-1849-01   
TCGA-02-2483-01A-01R-1849-01  TCGA-02-0047-01A-01R-1849-01   
TCGA-02-2485-01A-01R-1849-01  TCGA-02-0055-01A-01R-1849-01   

                                                         3  
TCGA-02-0047-01A-01R-1849-01  TCGA-02-0055-01A-01R-1849-01  
TCGA-02-0055-01A-01R-1849-01  TCGA-02-0047-01A-01R-1849-01  
TCGA-02-2483-01A-01R-1849-01  TCGA-02-0055-01A-01R-1849-01  
TCGA-02-2485-01A-01R-1849-01  TCGA-02-0047-01A-01R-1849-01  


In [126]:
def KStoUniform(ranked_samples, pos_samples):
    """
    calculate KS test of ranks of positive samples to uniform distribution
    input: ranked samples, names of positive samples
    return: KS distance
    """
    pos_ranks = []
    
    #getting enumerated dictionary of ranked samples to reference
    rankedsampleD = {}
    for x,y in enumerate(ranked_samples.T.values.flatten()):
        rankedsampleD[x]=y
        
    #getting list of positive sample rankings
    for sample in pos_samples:
        pos_ranks.append([key for key, val in rankedsampleD.items() if val == sample])
    
    #flattening 
    pos_ranks = [val for sublist in pos_ranks for val in sublist]
        
    ks = scipy.stats.kstest(pos_ranks,'norm', alternative='greater',N=len(ranked_samples))
    
    return ks

    
pos_samples = ['TCGA-02-2485-01A-01R-1849-01']
ks = KStoUniform(ranked_samples, pos_samples)
print(ks)


KstestResult(statistic=0.02275013194817921, pvalue=0.9762029562927512)


In [127]:
def getKSDistribution(samples, sample_ranking_mat, pos_samples):
    """
    get KS distance for all samples in a set
    input: sample set, the sample ranking matrix, names of positive samples
    return: vector of KS distances
    """

    samples = (list(similarity_matrix.index)) #row names
    sample_ranking_mat = pd.DataFrame(index = samples) #new matrix with same row names
    
    for sample in samples:
        #sort row, get list of ranked sample names
        ranked_samples = (similarity_matrix.loc[sample].sort_values(ascending=False).index.tolist())
        sample_ranking_mat[sample] = ranked_samples
    
    #transform
    sample_ranking_mat = sample_ranking_mat.T
    
    #redo header line
    sample_ranking_mat.columns = range(len(samples))
    
    #remove first column
    sample_ranking_mat = sample_ranking_mat.drop(sample_ranking_mat.columns[0], axis=1)
    
    distances = []
    
    
    
    
    
    return distances
    
posKS = getKSDistribution(ranked_samples, ['TCGA-02-2485-01A-01R-1849-01'])

TypeError: getKSDistribution() takes exactly 3 arguments (2 given)

(['TCGA-02-0047-01A-01R-1849-01', 'TCGA-02-0055-01A-01R-1849-01', 'TCGA-02-2483-01A-01R-1849-01', 'TCGA-02-2485-01A-01R-1849-01'],                                                          1  \
TCGA-02-0047-01A-01R-1849-01  TCGA-02-2485-01A-01R-1849-01   
TCGA-02-0055-01A-01R-1849-01  TCGA-02-2485-01A-01R-1849-01   
TCGA-02-2483-01A-01R-1849-01  TCGA-02-2485-01A-01R-1849-01   
TCGA-02-2485-01A-01R-1849-01  TCGA-02-2483-01A-01R-1849-01   

                                                         2  \
TCGA-02-0047-01A-01R-1849-01  TCGA-02-2483-01A-01R-1849-01   
TCGA-02-0055-01A-01R-1849-01  TCGA-02-2483-01A-01R-1849-01   
TCGA-02-2483-01A-01R-1849-01  TCGA-02-0047-01A-01R-1849-01   
TCGA-02-2485-01A-01R-1849-01  TCGA-02-0055-01A-01R-1849-01   

                                                         3  
TCGA-02-0047-01A-01R-1849-01  TCGA-02-0055-01A-01R-1849-01  
TCGA-02-0055-01A-01R-1849-01  TCGA-02-0047-01A-01R-1849-01  
TCGA-02-2483-01A-01R-1849-01  TCGA-02-0055-01A-01R-1849-01  
TCG