In [1]:
import pandas as pd
import os
import shutil
import getFeatures as gf
import numpy as np
from sklearn import metrics
import scipy
from matplotlib import pyplot as plt
from xlwt import Workbook

In [2]:
def get_all_files(directories):
    filenames = []
    for directory in directories:
        for filename in os.listdir(directory):
            if filename.endswith('.xlsx') or filename.endswith('.xls'):
                filenames.append(directory+filename)
    return filenames

def get_stats(featureFile):
    df = pd.read_excel(featureFile)
    return df.mean(axis = 0), df.std(axis = 0)

def file_2_rank(excelName):
    file2rank = dict()
    try:
        df = pd.read_excel(excelName)
    except:
        df = pd.read_csv(excelName, sep='\t')
    
    for i in range(df.shape[0]):
        file2rank[df.loc[i, 'Filename']] = df.loc[i, 'Rank']
    
    return file2rank

def file_2_feature_train(filenames, file2rank):
    file2feature = dict()
    for filename in filenames:
        assert filename in file2rank.keys(), filename + ' not in ranking excel file'
        
        l = list(gf.classify(filename, ''))
        l.append(file2rank[filename])
        file2feature[filename] = l
        
    return file2feature

def file_2_feature_test(filenames):
    file2feature = dict()
    for filename in filenames:
        l = list(gf.classify(filename, ''))
        file2feature[filename] = l
        
    return file2feature

def distance(sample, cluster):
    dist = 0
    for i in range(len(sample)):
        dist += (cluster[i] - sample[i])**2
    return dist**(1/2)

def find_closest_cluster(df, clusters):
    file2cluster = dict()
    
    def find_min(vals):
        minIndex = None
        minVal = np.inf
        for i in range(len(vals)):
            #print(val[i])
            if vals[i] < minVal:
                minIndex = i
                minVal = vals[i]
        return minIndex+1
    
    for i in range(df.shape[0]):
        allDist = []
        for j in range(clusters.shape[0]):
            dist = distance(df.iloc[i, :], clusters[j, :])
            allDist.append(dist)
        
        file2cluster[list(df.index)[i]] = find_min(allDist)
    return file2cluster

def rank(directories, clusterFilename, cluster2group):
    allFilenames = get_all_files(directories)
    means, stds = get_stats('Train_07192020.xls')
    file2features = file_2_feature_test(allFilenames)
    #df = pd.read_excel(featureFile)
    clusters = np.loadtxt(clusterFilename)
    
    columns = ['Ratio of Peaks Found', 'Ratio of Peaks to Ideal', 'Ratio of Range', 'Inverse Standard Deviation', 'Area Under the Curve', 'Normed Area Under the Curve', 'Smoothing Error']

    df = pd.DataFrame(list(file2features.values()), columns = columns, index = file2features.keys())
    for column in columns:
        df.loc[:, column] = (df.loc[:, column] - means[column])/stds[column]
    #print(df)
    file2cluster = find_closest_cluster(df, clusters)
    file2group = dict()
    for filename in file2cluster.keys():
        for key in cluster2group.keys():
            if file2cluster[filename] in key:
                file2group[filename] = cluster2group[key]
                break
    
    return file2group

def get_wilcoxon(testRanks, realRanks):
    def remove_directory(filename):
        for i in range(len(filename)):
            if filename[i] == '/':
                return filename[i+1:]
            
    df = pd.read_excel('TrainRanked.xls')
    rankedDict = dict()
    for i in range(df.shape[0]):
        rankedDict[df.loc[i, 'name']] = df.loc[i, 'rank']
    
    for key in rankedDict.keys():
        for pair in realRanks.keys():
            if rankedDict[key] >= pair[0] and rankedDict[key] <= pair[1]:
                rankedDict[key] = realRanks[pair]
                break
    
    testList = []
    realList = []
    for f1 in testRanks.keys():
        testList.append(testRanks[f1])
        realList.append(rankedDict[remove_directory(f1)])
    
    print(testList)
    print(realList)
    return scipy.stats.wilcoxon(testList, realList)

In [3]:
cluster2group = dict()
cluster2group[(6, 7)] = 1
cluster2group[(4, 1)] = 4
cluster2group[(2, 8)] = 3
cluster2group[(3, 5)] = 2
test = rank(['train/', 'test/'], 'ClusterValues.txt', cluster2group)

train/AnGam_Mos55_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_HI-N31_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_eGFPIP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_CCL-125cells_CHIKV.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_VietFcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_SBV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_fGSOSS_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_Sg4_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Soma.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSCSiomi2015_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Zika.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Ovary_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_2dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_6dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_3r

train/AeAlbo_CHIKV_9dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Fcarc_GH_rep3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_BTV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Fcarc_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSSHann2016_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_Ovary_Ago2_414.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_Ovary_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Piwi4IP2A_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C6-36_Mock_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSScells_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Hsu_6dpi_WNV3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Carcass_BF72h.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_HI-N10_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_CHIKV_3dpi.24_35.trim.

train/AeAeg_Aag2SINV_dsPiwi5.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_ML-DmD20-c5_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_sRNA_w1XHar_21day.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_SINV_GFP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Testes_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Larvae.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AnGam_Ovaries_rep3_KM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_2dpi_WNV2.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Hsu_17dpi_WNV3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_U4.4_Mock_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_VietMcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Ago3IP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_sRNA_HarXw1_21day.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_CHIKV_AZT_3dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xl

In [4]:
counters = [0 for i in range(4)]
for i in list(test.values()):
    counters[i-1] += 1
print(counters)

newCounters = []
val = 0
for i in counters:
    val += i
    newCounters.append(val)
print(newCounters)

[169, 0, 0, 158]
[169, 169, 169, 327]


In [5]:
clusters = np.loadtxt('ClusterValues.txt')
print(clusters.shape)

(8, 7)


In [6]:
"""
realRanks = dict()
realRanks[(1, 45)] = 0
realRanks[(46, 138)] = 0
realRanks[(139, 216)] = 1
realRanks[(216, 350)] = 1

print(get_wilcoxon(test, realRanks))
"""

'\nrealRanks = dict()\nrealRanks[(1, 45)] = 0\nrealRanks[(46, 138)] = 0\nrealRanks[(139, 216)] = 1\nrealRanks[(216, 350)] = 1\n\nprint(get_wilcoxon(test, realRanks))\n'

In [7]:
#New Cluster Centers
"""
wb = Workbook()
sheet = wb.add_sheet('Sheet 1')
sheet.write(0, 0, 'Filename')
sheet.write(0, 1, 'Ratio of Peaks Found')
sheet.write(0, 2, 'Ratio of Peaks to Ideal')
sheet.write(0, 3, 'Ratio of Range')
sheet.write(0, 4, 'Inverse Standard Deviation')
sheet.write(0, 5, 'Smoothing Error')
sheet.write(0, 6, 'Area Under the Curve')
sheet.write(0, 7, 'Normed Area Under the Curve')


directory = 'train/'
rowCounter = 1
for filename in os.listdir(directory):
    vals = gf.classify(filename, directory)
    sheet.write(rowCounter, 0, filename)
    sheet.write(rowCounter, 1, vals[0])
    sheet.write(rowCounter, 2, vals[1])
    sheet.write(rowCounter, 3, vals[2])
    sheet.write(rowCounter, 4, vals[3])
    sheet.write(rowCounter, 5, vals[4])
    sheet.write(rowCounter, 6, vals[5])
    sheet.write(rowCounter, 7, vals[6])
    rowCounter += 1
    
wb.save('Train_07192020.xls')
"""

"\nwb = Workbook()\nsheet = wb.add_sheet('Sheet 1')\nsheet.write(0, 0, 'Filename')\nsheet.write(0, 1, 'Ratio of Peaks Found')\nsheet.write(0, 2, 'Ratio of Peaks to Ideal')\nsheet.write(0, 3, 'Ratio of Range')\nsheet.write(0, 4, 'Inverse Standard Deviation')\nsheet.write(0, 5, 'Smoothing Error')\nsheet.write(0, 6, 'Area Under the Curve')\nsheet.write(0, 7, 'Normed Area Under the Curve')\n\n\ndirectory = 'train/'\nrowCounter = 1\nfor filename in os.listdir(directory):\n    vals = gf.classify(filename, directory)\n    sheet.write(rowCounter, 0, filename)\n    sheet.write(rowCounter, 1, vals[0])\n    sheet.write(rowCounter, 2, vals[1])\n    sheet.write(rowCounter, 3, vals[2])\n    sheet.write(rowCounter, 4, vals[3])\n    sheet.write(rowCounter, 5, vals[4])\n    sheet.write(rowCounter, 6, vals[5])\n    sheet.write(rowCounter, 7, vals[6])\n    rowCounter += 1\n    \nwb.save('Train_07192020.xls')\n"

In [8]:
cluster2group = dict()
cluster2group[(2, 5)] = 1
cluster2group[(8, -1)] = 2
cluster2group[(4, 7)] = 3
cluster2group[(6, 3, 1)] = 4
test = rank(['train/'], 'NewClusterValues.txt', cluster2group)

train/AnGam_Mos55_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_HI-N31_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_eGFPIP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_CCL-125cells_CHIKV.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_VietFcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_SBV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_fGSOSS_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_Sg4_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Soma.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSCSiomi2015_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Zika.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Ovary_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_2dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_6dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_3r

train/AeAlbo_CHIKV_9dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Fcarc_GH_rep3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_BTV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Fcarc_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSSHann2016_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_Ovary_Ago2_414.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_Ovary_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Piwi4IP2A_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C6-36_Mock_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSScells_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Hsu_6dpi_WNV3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Carcass_BF72h.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_HI-N10_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_CHIKV_3dpi.24_35.trim.

train/AeAeg_Aag2SINV_dsPiwi5.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_ML-DmD20-c5_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_sRNA_w1XHar_21day.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_SINV_GFP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Testes_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Larvae.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AnGam_Ovaries_rep3_KM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_2dpi_WNV2.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Hsu_17dpi_WNV3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_U4.4_Mock_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_VietMcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Ago3IP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_sRNA_HarXw1_21day.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_CHIKV_AZT_3dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xl

In [9]:
print(test)

{'train/AnGam_Mos55_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 3, 'train/Dmel_HI-N31_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'train/AeAeg_Aag2_eGFPIP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 2, 'train/AeAeg_CCL-125cells_CHIKV.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'train/AeAlbo_VietFcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 3, 'train/AeAeg_Aag2_SBV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 3, 'train/Dmel_fGSOSS_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'train/Dmel_Sg4_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 4, 'train/AeAeg_Female_Soma.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'train/Dmel_OSCSiomi2015_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 3, 'train/AeAeg_Aag2_Zika.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 2, 'train/AeAeg_Female_Ovary_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'train/AeAlbo_C636_2dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'trai

In [10]:
realRanks = dict()
realRanks[(1, 150)] = 1
realRanks[(151, 175)] = 2
realRanks[(176, 199)] = 3
realRanks[(200, 350)] = 4
print(get_wilcoxon(test, realRanks))

[3, 1, 2, 1, 3, 3, 1, 4, 1, 3, 2, 1, 1, 1, 4, 2, 4, 2, 2, 2, 3, 1, 1, 1, 4, 3, 3, 1, 1, 4, 3, 1, 1, 2, 3, 2, 2, 3, 4, 2, 4, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 3, 3, 1, 1, 4, 2, 1, 2, 1, 4, 3, 3, 2, 1, 4, 1, 3, 2, 4, 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 3, 1, 1, 4, 3, 4, 3, 1, 1, 1, 2, 1, 2, 1, 2, 4, 3, 2, 1, 1, 2, 2, 3, 2, 1, 1, 1, 2, 2, 2, 2, 3, 1, 3, 2, 4, 1, 4, 1, 3, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 1, 1, 3, 3, 3, 3, 2, 1, 2, 1, 2, 2, 3, 1, 2, 1, 4, 2, 3, 2, 2, 2, 4, 4, 1, 3, 1, 4, 2, 2, 1, 2, 2, 2, 3, 2, 4, 1, 2, 3, 3, 2, 2, 3, 3, 1, 1, 2, 2, 1, 2, 3, 3, 1, 2, 1, 4, 2, 2, 4, 1, 1, 2, 2, 1, 1, 3, 3, 3, 1, 3, 1, 1, 3, 1, 1, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 3, 4, 3, 3, 1, 3, 1, 4, 1, 1, 3, 3, 3, 1, 3, 2, 3, 1, 2, 1, 1, 3, 4, 1, 2, 1, 3, 3, 2, 1, 1, 2, 1, 3, 3, 4, 2, 1, 1, 1, 2, 3, 1, 3]
[4, 1, 3, 1, 4, 2, 4, 4, 1, 4, 1, 1, 1, 1, 4, 2, 4, 4, 1, 2, 2, 1, 4, 1, 4, 1, 2, 1, 2, 4, 1, 1, 1, 2, 1, 1, 1, 2, 4, 4, 4, 4, 2, 1, 1, 4, 1, 4, 3, 1, 1, 1, 1, 2, 2, 1, 4, 2, 1, 1, 1, 4, 1, 1, 3, 4, 4, 4,

In [11]:
df = pd.read_excel('ranked_curves.xlsx')
rankedDict = dict()
for i in range(df.shape[0]):
    rankedDict[df.loc[i, 'name']] = df.loc[i, 'rank']
    
rankedFiles = [x for _, x in sorted(zip(list(rankedDict.values()), list(rankedDict.keys())))]
#print(rankedFiles)
#print(test)

newRankedDict = dict()
counter = 1
for i in range(len(rankedFiles)):
    filename = 'train/' + rankedFiles[i]
    if filename in test.keys():
        newRankedDict[rankedFiles[i]] = counter
        counter += 1
print(newRankedDict)

wb = Workbook()
sheet = wb.add_sheet('Sheet 1')
sheet.write(0, 0, 'name')
sheet.write(0, 1, 'rank')

rowCounter = 1
for key in newRankedDict.keys():
    sheet.write(rowCounter, 0, key)
    sheet.write(rowCounter, 1, newRankedDict[key])
    rowCounter += 1
wb.save('TrainRanked.xls')

{'AeAlbo_Ovary_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'AeAeg_Female_Ovary_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 2, 'AaAeg_whole_SINV_YFVcapsid.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 3, 'AeAeg_Female_Ovary.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 4, 'AeAeg_Embryo_0-1h.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 5, 'AeAlbo_FemaleWhole_Denv2_JG.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 6, 'AeAlbo_C636_cells_rep_3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 7, 'AeAeg_Whole_ZV_In_RNA_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 8, 'AeAlbo_CHIKV_AZT_9dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 9, 'AeAlbo_FemaleWhole_Sugar_CT.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 10, 'CuQuin_Testes_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 11, 'AeAeg_Testes.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 12, 'AeAlbo_VietOvaries_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 13, 'AeAlbo_C636HT_DENV2_Persist.24_35.trim.fastq.uq.polyn.5to5

In [12]:
def get_all_files(directories):
    filenames = []
    for directory in directories:
        for filename in os.listdir(directory):
            if filename.endswith('.xlsx') or filename.endswith('.xls'):
                filenames.append(directory+filename)
    return filenames

def get_stats(featureFile):
    df = pd.read_excel(featureFile)
    return df.mean(axis = 0), df.std(axis = 0)

def file_2_rank(excelName):
    file2rank = dict()
    try:
        df = pd.read_excel(excelName)
    except:
        df = pd.read_csv(excelName, sep='\t')
    
    for i in range(df.shape[0]):
        file2rank[df.loc[i, 'Filename']] = df.loc[i, 'Rank']
    
    return file2rank

def file_2_feature_train(filenames, file2rank):
    file2feature = dict()
    for filename in filenames:
        assert filename in file2rank.keys(), filename + ' not in ranking excel file'
        
        l = list(gf.classify(filename, ''))
        l.append(file2rank[filename])
        file2feature[filename] = l
        
    return file2feature

def file_2_feature_test(filenames):
    file2feature = dict()
    for filename in filenames:
        l = list(gf.classify(filename, ''))
        file2feature[filename] = l
        
    return file2feature

def distance(sample, cluster):
    dist = 0
    for i in range(len(sample)):
        dist += (cluster[i] - sample[i])**2
    return dist**(1/2)

def find_closest_cluster(df, clusters):
    file2cluster = dict()
    
    def find_min(vals):
        minIndex = None
        minVal = np.inf
        for i in range(len(vals)):
            #print(val[i])
            if vals[i] < minVal:
                minIndex = i
                minVal = vals[i]
        return minIndex+1
    
    for i in range(df.shape[0]):
        allDist = []
        for j in range(clusters.shape[0]):
            dist = distance(df.iloc[i, :], clusters[j, :])
            allDist.append(dist)
        
        file2cluster[list(df.index)[i]] = find_min(allDist)
    return file2cluster

def rank(directories, clusterFilename, cluster2group):
    allFilenames = get_all_files(directories)
    means, stds = get_stats('Train_07192020.xls')
    file2features = file_2_feature_test(allFilenames)
    #df = pd.read_excel(featureFile)
    clusters = np.loadtxt(clusterFilename)
    
    columns = ['Ratio of Peaks Found', 'Ratio of Peaks to Ideal', 'Ratio of Range', 'Inverse Standard Deviation', 'Area Under the Curve', 'Normed Area Under the Curve', 'Smoothing Error']

    df = pd.DataFrame(list(file2features.values()), columns = columns, index = file2features.keys())
    for column in columns:
        df.loc[:, column] = (df.loc[:, column] - means[column])/stds[column]
    #print(df)
    file2cluster = find_closest_cluster(df, clusters)
    file2group = dict()
    for filename in file2cluster.keys():
        for key in cluster2group.keys():
            if file2cluster[filename] in key:
                file2group[filename] = cluster2group[key]
                break
    
    return file2group

def get_wilcoxon(testRanks, realRanks):
    def remove_directory(filename):
        for i in range(len(filename)):
            if filename[i] == '/':
                return filename[i+1:]
            
    df = pd.read_excel('TestRanked.xls')
    rankedDict = dict()
    for i in range(df.shape[0]):
        rankedDict[df.loc[i, 'name']] = df.loc[i, 'rank']
    
    for key in rankedDict.keys():
        for pair in realRanks.keys():
            if rankedDict[key] >= pair[0] and rankedDict[key] <= pair[1]:
                rankedDict[key] = realRanks[pair]
                break
    
    testList = []
    realList = []
    for f1 in testRanks.keys():
        testList.append(testRanks[f1])
        realList.append(rankedDict[remove_directory(f1)])
    
    print(testList)
    print(realList)
    return scipy.stats.wilcoxon(testList, realList)

In [13]:
wb = Workbook()
sheet = wb.add_sheet('Sheet 1')
sheet.write(0, 0, 'Filename')
sheet.write(0, 1, 'Ratio of Peaks Found')
sheet.write(0, 2, 'Ratio of Peaks to Ideal')
sheet.write(0, 3, 'Ratio of Range')
sheet.write(0, 4, 'Inverse Standard Deviation')
sheet.write(0, 5, 'Smoothing Error')
sheet.write(0, 6, 'Area Under the Curve')
sheet.write(0, 7, 'Normed Area Under the Curve')


directory = 'test/'
rowCounter = 1
for filename in os.listdir(directory):
    vals = gf.classify(filename, directory)
    sheet.write(rowCounter, 0, filename)
    sheet.write(rowCounter, 1, vals[0])
    sheet.write(rowCounter, 2, vals[1])
    sheet.write(rowCounter, 3, vals[2])
    sheet.write(rowCounter, 4, vals[3])
    sheet.write(rowCounter, 5, vals[4])
    sheet.write(rowCounter, 6, vals[6])
    sheet.write(rowCounter, 7, vals[5])
    rowCounter += 1
    
wb.save('Test_07192021.xls')

AeAlbo_HeadThor_CHIKV-B2_NOV.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AnGam_Fem_Head_and_Thorax_rep2_KM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Ovary_BF72h_cat.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Aag2_WNV_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Aag2_DENV2NGC.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
CuQuin_Hsu_6dpi_mock1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Mid_DissToCar_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAlbo_C636_2dpi_mock2.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
CuQuin_Ovary_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAlbo_C636HT_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Mid_dsDV_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Ovary_TC.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
Dmel_OvaryBeta_RAL375_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Aag2_R2D2_r123_ZA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAeg_Aag2_Ago2IP1_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
AeAlbo_U4-4_

In [14]:
cluster2group = dict()
cluster2group[(2, 5)] = 1
cluster2group[(8, -1)] = 2
cluster2group[(4, 7)] = 3
cluster2group[(6, 3, 1)] = 4
test = rank(['test/'], 'NewClusterValues.txt', cluster2group)

test/AeAlbo_HeadThor_CHIKV-B2_NOV.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AnGam_Fem_Head_and_Thorax_rep2_KM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Ovary_BF72h_cat.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Aag2_WNV_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Aag2_DENV2NGC.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/CuQuin_Hsu_6dpi_mock1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Mid_DissToCar_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAlbo_C636_2dpi_mock2.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/CuQuin_Ovary_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAlbo_C636HT_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Mid_dsDV_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Ovary_TC.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/Dmel_OvaryBeta_RAL375_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/AeAeg_Aag2_R2D2_r123_ZA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
test/Ae

In [15]:
realRanks = dict()
realRanks[(1, 30)] = 1
realRanks[(31, 50)] = 2
realRanks[(51, 60)] = 3
realRanks[(61, 100)] = 4
print(get_wilcoxon(test, realRanks))

[1, 2, 2, 3, 3, 2, 1, 1, 1, 1, 1, 2, 1, 3, 3, 1, 3, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 3, 2, 1, 2, 4, 1, 1, 2, 1, 1, 3, 1, 1, 1, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1]
[3, 4, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 3, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 1, 3, 1, 1, 3, 4, 1, 2, 2, 1, 2, 3, 1, 1, 2, 2, 3, 3, 2, 1, 2, 2, 1, 3, 2, 1, 1]
WilcoxonResult(statistic=105.0, pvalue=0.2750996691485784)


In [16]:
df = pd.read_excel('ranked_curves.xlsx')
rankedDict = dict()
for i in range(df.shape[0]):
    rankedDict[df.loc[i, 'name']] = df.loc[i, 'rank']
    
rankedFiles = [x for _, x in sorted(zip(list(rankedDict.values()), list(rankedDict.keys())))]
#print(rankedFiles)
#print(test)

newRankedDict = dict()
counter = 1
for i in range(len(rankedFiles)):
    filename = 'test/' + rankedFiles[i]
    if filename in test.keys():
        newRankedDict[rankedFiles[i]] = counter
        counter += 1
print(newRankedDict)

wb = Workbook()
sheet = wb.add_sheet('Sheet 1')
sheet.write(0, 0, 'name')
sheet.write(0, 1, 'rank')

rowCounter = 1
for key in newRankedDict.keys():
    sheet.write(rowCounter, 0, key)
    sheet.write(rowCounter, 1, newRankedDict[key])
    rowCounter += 1
wb.save('TestRanked.xls')

{'AeAlbo_Testis_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 1, 'CuQuin_Ovary_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 2, 'AeAeg_Embryo_0-2h_cat.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 3, 'AeAlbo_Fcarc_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 4, 'AeAeg_Ovary_NonBF_cat.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 5, 'AeAlbo_C636_6dpi_SINV2.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 6, 'AeAlbo_Larv_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 7, 'AeAlbo_CHIKV_AZT_3dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 8, 'AeAeg_Female_Soma_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 9, 'AeAeg_Fcarc_TC.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 10, 'AeAeg_Ovary_GH.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 11, 'AeAeg_Mid_No_diss_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 12, 'AeAeg_Mid_dsDV_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 13, 'AeAeg_Mid_DissToCar_4dpf_JM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls': 14, 'AeAeg_Whole_ZIKV_BM2_GP