# Matrix editing
- remove records (phages) whose host is `no_host`
- keep only the records belonging to the n most abundant hosts
- feature selection with a variance threshold derived from 10% of the count of the least abundant selected host

In [13]:
# imports
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [14]:
# parameters
# matrix_file: tab-separated matrix; first row is the header, first column the row labels (phages)
# hosts_file: one "<phage>\t<hosts string>" record per line; 'no_host' marks a phage without a host
# hosts_counts_file: one "<count> <host string>" record per line, read from the top
# out: path the edited matrix is written to (tab-separated)
matrix_file = '/data/projects/kimona/data_18-01-19/009_matrix.tsv'
hosts_file = '/data/projects/kimona/data_18-01-19/101_hosts'
hosts_counts_file = '/data/projects/kimona/data_18-01-19/101_hosts_counts'
out = '/data/projects/kimona/data_18-01-19/009_edited_matrix.tsv'

# number of most abundant hosts to keep ('no_host' does not count towards it)
n = 8

# fraction of the smallest selected host count used to derive the variance threshold
level = 0.1

In [15]:
# matrix loading
# tab-separated input; the first row becomes the column labels and the first
# column becomes the row index
matrix = pd.read_csv(matrix_file, sep='\t', header=0, index_col=0)

In [16]:
# removing no_host records
# hosts keeps the raw lines of hosts_file; each one is "<phage>\t<hosts string>"
with open(hosts_file) as f:
    hosts = list(f)

# row labels of every phage whose hosts string is exactly 'no_host'
phages_to_drop = [
    fields[0]
    for fields in (line.split('\t') for line in hosts)
    if fields[1].strip() == 'no_host'
]

# drop() returns a new frame; matrix itself is left untouched
dropped = matrix.drop(phages_to_drop)

In [17]:
# selecting first n most abundant hosts
# Each line of hosts_counts_file is "<count> <host string>". Lines are taken
# from the top until n real hosts are collected, so the file is presumably
# sorted by descending count (TODO confirm against how the file is produced).
with open(hosts_counts_file) as f:
    hosts_counts = f.readlines()

strings = []  # selected host strings, plus 'no_host' if met before the n-th host
counts = []   # counts of the selected real hosts only (never 'no_host')

for line in hosts_counts:
    fields = line.split()
    if not fields:
        # tolerate blank lines, e.g. an empty line at the end of the file
        continue

    count = int(fields[0])
    string = fields[1]

    # 'no_host' is recorded in strings but does not use up one of the n slots.
    # The limit is checked against counts instead of incrementing n, so the
    # parameter n is never modified and re-running this cell yields the same
    # selection every time.
    if string != 'no_host':
        counts.append(count)

    strings.append(string)
    if len(counts) == n:
        break

print(strings)

['mycobac', 'no_host', 'strepto', 'escheri', 'gordoni', 'pseudom', 'arthrob', 'lactoco', 'staphyl']
[1590, 340, 313, 284, 228, 220, 182, 180]


In [18]:
# removing other than strings records
# Collect every phage whose hosts string contains none of the selected host
# strings (substring match) and drop those rows from `dropped`.
phages_to_drop = []

for record in hosts:

    fields = record.split('\t')
    phage = fields[0]
    hosts_string = fields[1].strip()

    # 'no_host' phages were already removed when `dropped` was built; listing
    # them again would make DataFrame.drop raise KeyError whenever 'no_host'
    # is not among the selected strings, so they are skipped explicitly.
    if hosts_string == 'no_host':
        continue

    if not any(string in hosts_string for string in strings):
        phages_to_drop.append(phage)

pure_matrix = dropped.drop(phages_to_drop)

In [19]:
# feature selection
# smallest count among the selected hosts
min_count = min(counts)

# fraction p of the remaining rows that corresponds to `level` of that count
n_rows = pure_matrix.shape[0]
var_threshold = min_count * level / n_rows

# p * (1 - p) is the variance of a Bernoulli variable with probability p;
# presumably the matrix holds 0/1 values - TODO confirm
bernoulli_variance = var_threshold * (1 - var_threshold)
sel = VarianceThreshold(threshold=bernoulli_variance)
trained_sel = sel.fit(pure_matrix)

# positional indices of the columns the selector keeps
kept_columns = trained_sel.get_support(indices=True)
final_matrix = pure_matrix.iloc[:, kept_columns]

In [21]:
# saving new dataframe
# written tab-separated to `out`; row index and header are included (to_csv defaults)
final_matrix.to_csv(out, sep='\t')