# Matrix editing
- remove no_host records
- select first n most abundant hosts
- feature selection with variance threshold on 10% of the least abundant species from selection

In [1]:
# imports
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [2]:
# parameters
matrix_file = '/data/projects/kimona/data_18-03-08/011_matrix.tsv'
hosts_file = '/data/projects/kimona/data_18-03-08/004_hosts'
hosts_counts_file = '/data/projects/kimona/data_18-03-08/004_hosts.counts'
out = '/data/projects/kimona/data_18-03-08/011_edited_matrix.tsv'

# number of most abundant hosts to count in
n = 8

# threshold level as in description
level = 0.1

In [3]:
# matrix loading
matrix = pd.read_csv(matrix_file, sep='\t', header=0, index_col=0)

In [12]:
matrix

Unnamed: 0,Cluster_0,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,...,Cluster_15007,Cluster_15008,Cluster_15009,Cluster_15010,Cluster_15011,Cluster_15012,Cluster_15013,Cluster_15014,Cluster_15015,Cluster_15016
phage0000009,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
phage0000012,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
phage0000022,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000023,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000024,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000025,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000028,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000029,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000030,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
phage0000031,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# removing no_host records
with open(hosts_file) as f:
    hosts = f.readlines()

phages_to_drop = []

for record in hosts:
    
    phage = record.split('\t')[0]
    hosts_string = record.split('\t')[1].strip()
    
    if hosts_string == 'no_host':
        phages_to_drop.append(phage)
        
dropped = matrix.drop(phages_to_drop)

print(dropped.shape)

ValueError: labels ['phage0000007' 'phage0000147' 'phage0000148' 'phage0000149' 'phage0000150'
 'phage0000151' 'phage0000152' 'phage0000219' 'phage0000220' 'phage0000221'
 'phage0000259' 'phage0000297' 'phage0000298' 'phage0000299' 'phage0000311'
 'phage0000312' 'phage0000313' 'phage0000314' 'phage0000318' 'phage0000326'
 'phage0000327' 'phage0000403' 'phage0000725' 'phage0000903' 'phage0000904'
 'phage0000905' 'phage0000906' 'phage0000907' 'phage0000910' 'phage0001288'
 'phage0001348' 'phage0001349' 'phage0001385' 'phage0001468' 'phage0001492'
 'phage0001493' 'phage0001522' 'phage0001523' 'phage0001524' 'phage0001740'
 'phage0001756' 'phage0001766' 'phage0001767' 'phage0001768' 'phage0001769'
 'phage0001770' 'phage0001800' 'phage0001801' 'phage0001802' 'phage0001803'
 'phage0001804' 'phage0001805' 'phage0001806' 'phage0001807' 'phage0001808'
 'phage0001809' 'phage0001810' 'phage0001811' 'phage0001812' 'phage0001813'
 'phage0001814' 'phage0001815' 'phage0001816' 'phage0001817' 'phage0001818'
 'phage0001819' 'phage0001820' 'phage0001821' 'phage0001822' 'phage0001823'
 'phage0001824' 'phage0001825' 'phage0001826' 'phage0001827' 'phage0001828'
 'phage0001829' 'phage0001830' 'phage0001831' 'phage0001832' 'phage0001833'
 'phage0001834' 'phage0001835' 'phage0001836' 'phage0001837' 'phage0001838'
 'phage0001839' 'phage0001840' 'phage0001841' 'phage0001842' 'phage0001843'
 'phage0001844' 'phage0001845' 'phage0001846' 'phage0001847' 'phage0001848'
 'phage0001849' 'phage0001850' 'phage0001851' 'phage0001852' 'phage0001853'
 'phage0001854' 'phage0001855' 'phage0001856' 'phage0001857' 'phage0001858'
 'phage0001859' 'phage0001860' 'phage0001861' 'phage0001862' 'phage0001863'
 'phage0001864' 'phage0001865' 'phage0001866' 'phage0001867' 'phage0001868'
 'phage0001869' 'phage0001870' 'phage0001871' 'phage0001872' 'phage0001873'
 'phage0001874' 'phage0001875' 'phage0001876' 'phage0001877' 'phage0001878'
 'phage0001879' 'phage0001880' 'phage0001881' 'phage0001882' 'phage0001883'
 'phage0001884' 'phage0001885' 'phage0001886' 'phage0001887' 'phage0001888'
 'phage0001889' 'phage0001890' 'phage0001891' 'phage0001892' 'phage0001893'
 'phage0001894' 'phage0001895' 'phage0001896' 'phage0001897' 'phage0001899'
 'phage0001900' 'phage0001901' 'phage0001902' 'phage0001903' 'phage0001904'
 'phage0001905' 'phage0001906' 'phage0001907' 'phage0001908' 'phage0001909'
 'phage0001910' 'phage0001911' 'phage0001912' 'phage0001913' 'phage0001914'
 'phage0001915' 'phage0001916' 'phage0001917' 'phage0001918' 'phage0001919'
 'phage0001920' 'phage0001921' 'phage0001922' 'phage0001923' 'phage0001924'
 'phage0001925' 'phage0001926' 'phage0001927' 'phage0001928' 'phage0001929'
 'phage0001930' 'phage0001931' 'phage0001932' 'phage0001933' 'phage0001934'
 'phage0001935' 'phage0001936' 'phage0001937' 'phage0001938' 'phage0001939'
 'phage0002041' 'phage0002042' 'phage0002043' 'phage0002044' 'phage0002058'
 'phage0002122' 'phage0002123' 'phage0002124' 'phage0002177' 'phage0002267'
 'phage0002268' 'phage0002283' 'phage0002371' 'phage0002493' 'phage0002494'
 'phage0002514' 'phage0002530' 'phage0002531' 'phage0002532' 'phage0002533'
 'phage0002534' 'phage0002535' 'phage0002536' 'phage0002537' 'phage0002538'
 'phage0002547' 'phage0002548' 'phage0002549' 'phage0002550' 'phage0002551'
 'phage0002552' 'phage0002553' 'phage0002554' 'phage0002555' 'phage0002556'
 'phage0002557' 'phage0002558' 'phage0002559' 'phage0002560' 'phage0002561'
 'phage0002562' 'phage0002563' 'phage0002564' 'phage0002565' 'phage0002566'
 'phage0002567' 'phage0002568' 'phage0002569' 'phage0002570' 'phage0002571'
 'phage0002572' 'phage0002573' 'phage0002574' 'phage0002681' 'phage0002741'
 'phage0002742' 'phage0002744' 'phage0002745' 'phage0002798' 'phage0002832'
 'phage0002845' 'phage0002865' 'phage0002866' 'phage0002867' 'phage0002951'
 'phage0003026' 'phage0003137' 'phage0003141' 'phage0003182' 'phage0003189'
 'phage0003210' 'phage0003219' 'phage0003260' 'phage0003321' 'phage0003337'
 'phage0003479' 'phage0003480' 'phage0003481' 'phage0003482' 'phage0003483'
 'phage0003484' 'phage0003485' 'phage0003486' 'phage0003487' 'phage0003488'
 'phage0003489' 'phage0003490' 'phage0003491' 'phage0003492' 'phage0003493'
 'phage0003494' 'phage0003495' 'phage0003496' 'phage0003497' 'phage0003498'
 'phage0003499' 'phage0003500' 'phage0003501' 'phage0003502' 'phage0003503'
 'phage0003504' 'phage0003505' 'phage0003506' 'phage0003507' 'phage0003508'
 'phage0003509' 'phage0003510' 'phage0003511' 'phage0003512' 'phage0003513'
 'phage0003514' 'phage0003515' 'phage0003516' 'phage0003517' 'phage0003518'
 'phage0003519' 'phage0003520' 'phage0003521' 'phage0003522' 'phage0003523'
 'phage0003524' 'phage0003525' 'phage0003526' 'phage0003527' 'phage0003528'
 'phage0003529' 'phage0003530' 'phage0003531' 'phage0003532' 'phage0003533'
 'phage0003534' 'phage0003535' 'phage0003536' 'phage0003537' 'phage0003538'
 'phage0003539' 'phage0003540' 'phage0003541' 'phage0003542' 'phage0003543'
 'phage0003544' 'phage0003545' 'phage0003546' 'phage0003547' 'phage0003548'
 'phage0003549' 'phage0003550' 'phage0003551' 'phage0003552' 'phage0003553'
 'phage0003554' 'phage0003555' 'phage0003556' 'phage0003557' 'phage0003558'
 'phage0003559' 'phage0003560' 'phage0003561' 'phage0003562' 'phage0003563'
 'phage0003564' 'phage0003565' 'phage0003566' 'phage0003567' 'phage0003568'
 'phage0003569' 'phage0003570' 'phage0003571' 'phage0003572' 'phage0003573'
 'phage0003574' 'phage0003575' 'phage0003576' 'phage0003577' 'phage0003578'
 'phage0003579' 'phage0003580' 'phage0003581' 'phage0003582' 'phage0003583'
 'phage0003584' 'phage0003585' 'phage0003586' 'phage0003587' 'phage0003588'
 'phage0003589' 'phage0003590' 'phage0003591' 'phage0003592' 'phage0003593'
 'phage0003594' 'phage0003595' 'phage0003596' 'phage0003597' 'phage0003598'
 'phage0003599' 'phage0003600' 'phage0003601' 'phage0003602' 'phage0003603'
 'phage0003604' 'phage0003605' 'phage0003606' 'phage0003607' 'phage0003608'
 'phage0003609' 'phage0003610' 'phage0003611' 'phage0003612' 'phage0003613'
 'phage0003614' 'phage0003615' 'phage0003616' 'phage0003617' 'phage0003618'
 'phage0003619' 'phage0003620' 'phage0003621' 'phage0003622' 'phage0003623'
 'phage0003624' 'phage0003625' 'phage0003626' 'phage0003627' 'phage0003628'
 'phage0003629' 'phage0003630' 'phage0003631' 'phage0003632' 'phage0003633'
 'phage0003634' 'phage0003635' 'phage0003636' 'phage0003637' 'phage0003638'
 'phage0003639' 'phage0003640' 'phage0003641' 'phage0003642' 'phage0003643'
 'phage0003644' 'phage0003645' 'phage0003646' 'phage0003647' 'phage0003648'
 'phage0003649' 'phage0003650' 'phage0003651' 'phage0003652' 'phage0003653'
 'phage0003654' 'phage0003655' 'phage0003656' 'phage0003657' 'phage0003658'
 'phage0003659' 'phage0003660' 'phage0003661' 'phage0003662' 'phage0003663'
 'phage0003664' 'phage0003665' 'phage0003666' 'phage0003667' 'phage0003668'
 'phage0003669' 'phage0003670' 'phage0003701' 'phage0003702' 'phage0003707'
 'phage0003711' 'phage0003715' 'phage0003842' 'phage0003947' 'phage0003948'
 'phage0003949' 'phage0003955' 'phage0003956' 'phage0004008' 'phage0004017'
 'phage0004018' 'phage0004028' 'phage0004030' 'phage0004085' 'phage0004179'
 'phage0004316' 'phage0004342' 'phage0004400' 'phage0004404' 'phage0004406'
 'phage0004409' 'phage0004416' 'phage0004436' 'phage0004437' 'phage0004440'
 'phage0004441' 'phage0004462' 'phage0004489' 'phage0004493' 'phage0004507'
 'phage0004557' 'phage0004558' 'phage0004559' 'phage0004560' 'phage0004694'
 'phage0004731' 'phage0004734' 'phage0004769' 'phage0004799' 'phage0004868'
 'phage0004894' 'phage0004966' 'phage0004986' 'phage0005023' 'phage0005030'
 'phage0005041' 'phage0005053' 'phage0005054' 'phage0005055' 'phage0005105'
 'phage0005143' 'phage0005159' 'phage0005160' 'phage0005161' 'phage0005162'
 'phage0005163' 'phage0005219' 'phage0005220' 'phage0005221' 'phage0005224'
 'phage0005225' 'phage0005226' 'phage0005229' 'phage0005260' 'phage0005287'
 'phage0005289' 'phage0005290' 'phage0005294' 'phage0005296' 'phage0005307'
 'phage0005327' 'phage0005335' 'phage0005338' 'phage0005343' 'phage0005344'
 'phage0005364' 'phage0005365' 'phage0005366' 'phage0005367' 'phage0005368'
 'phage0005385' 'phage0005386' 'phage0005387' 'phage0005388' 'phage0005389'
 'phage0005390' 'phage0005403' 'phage0005405' 'phage0005406' 'phage0005412'
 'phage0005480' 'phage0005482' 'phage0005530' 'phage0005544' 'phage0005575'
 'phage0005591' 'phage0005658' 'phage0005673' 'phage0005682' 'phage0005683'
 'phage0005685' 'phage0005686' 'phage0005713' 'phage0005721' 'phage0005728'
 'phage0005729' 'phage0005752' 'phage0005771' 'phage0005782' 'phage0005797'
 'phage0005864' 'phage0005866' 'phage0005902' 'phage0005919' 'phage0006068'
 'phage0006254' 'phage0006255' 'phage0006256' 'phage0006257' 'phage0006258'
 'phage0006259' 'phage0006266' 'phage0006294' 'phage0006377' 'phage0006379'
 'phage0006390' 'phage0006428' 'phage0006449' 'phage0006454' 'phage0006471'
 'phage0006543' 'phage0006573' 'phage0006574' 'phage0006575' 'phage0006576'
 'phage0006577' 'phage0006585' 'phage0006587' 'phage0006592' 'phage0006671'
 'phage0006691' 'phage0006692' 'phage0006693' 'phage0006694' 'phage0006702'
 'phage0006703' 'phage1000090' 'phage1000423' 'phage1000426' 'phage1000427'
 'phage1000428' 'phage1000429' 'phage1000431' 'phage1000488' 'phage1000515'
 'phage1000532' 'phage1000560' 'phage1000564' 'phage1000635' 'phage1000811'
 'phage1000812' 'phage1000916' 'phage1001532'] not contained in axis

In [7]:
# selecting first n most abundant hosts
with open(hosts_counts_file) as f:
    hosts_counts = f.readlines()

strings = []
counts = []

for line in hosts_counts:
    string = line.split()[1].strip()
    count = int(line.split()[0].strip())
    
    if string == 'no_host':
        n += 1
    else:
        counts.append(count)
    
    strings.append(string)
    if len(strings) == n:
        break
        
print(strings)
print(counts)

['mycobac', 'no_host', 'strepto', 'escheri', 'gordoni', 'arthrob', 'pseudom', 'lactoco', 'staphyl', 'bacillu']
[1619, 354, 323, 293, 240, 236, 219, 184, 169]


In [18]:
# removing other than strings records
phages_to_drop = []

for record in hosts:
    
    phage = record.split('\t')[0]
    hosts_string = record.split('\t')[1].strip()
    to_delete = True
    
    for string in strings:
        if string in hosts_string:
            to_delete = False
            
    if to_delete:
        phages_to_drop.append(phage)
        
pure_matrix = dropped.drop(phages_to_drop)

In [19]:
# feature selection
min_count = min(counts)

var_threshold = (min_count*level)/pure_matrix.shape[0]

sel = VarianceThreshold(threshold=var_threshold * (1 - var_threshold))
trained_sel = sel.fit(pure_matrix)
final_matrix = pure_matrix.iloc[:,trained_sel.get_support(indices=True)]

In [21]:
# saving new dataframe
final_matrix.to_csv(out, sep='\t')