# Finding distances to group centroids.

In the previous notebooks we've conducted the PCA for the CAZyme data.

Now we want to find a pseudo-probability of group membership using a technique called relative centroid distance.

In [1]:
from importlib import reload
import datetime
from collections import defaultdict
import os
from os import makedirs

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from catas.matrix import Matrix

The first thing to do is probably to find the centroids for each of our groups.
The centroid of a cluster in multidimensional space is simply the arithmetic mean of each cluster in each individual dimension.
For example the mean of a cluster in three dimensional space with axes x, y, and z would be a point at [$\bar{x}$, $\bar{y}$, $\bar{z}$], where $\bar{x}$ is the mean of the point in the $x$ dimension. In our case, we are working in a 16 dimensional space (from our 16 principle components) so our mean will be an array of 16 values.

In [2]:
VERSIONS = ["v4", "v5", "v6", "v7"]
DEFAULT_VERSION = "v5"
NOMENCLATURES = ["nomenclature1", "nomenclature2", "nomenclature3"]
DEFAULT_NOMENCLATURE = "nomenclature3"

TODAY = "20180324" #datetime.datetime.utcnow().strftime("%Y%m%d")
TODAY = "20190311"

In [3]:
dfs = dict()
labels = dict()

for version in VERSIONS:
    df = pd.read_csv("03-replicating_pca/{}-principle_components.csv".format(version), sep="\t")
    labels[version] = df[["Species", "nomenclature1", "nomenclature2", "nomenclature3"]]
    #df.drop(["nomenclature1", "nomenclature2"], axis=1, inplace=True)
    #df.rename(columns={"nomenclature3": "nomenclature"}, inplace=True)
    dfs[version] = df

dfs[DEFAULT_VERSION][:5]

Unnamed: 0,Species,nomenclature1,nomenclature2,nomenclature3,pc01,pc02,pc03,pc04,pc05,pc06,pc07,pc08,pc09,pc10,pc11,pc12,pc13,pc14,pc15,pc16
0,Agaricus bisporus,saprotroph,saprotroph,saprotroph,-5.596715,-7.175748,20.5612,-18.262674,-5.688208,-0.749536,2.126003,2.103183,5.382176,0.997175,-3.001956,-2.889436,-0.640579,1.840926,-4.67477,2.977663
1,Albugo laibachii,biotroph,biotroph,biotroph 2,-48.502155,3.433361,-1.463343,-3.008712,-10.110828,8.478079,11.54841,3.194017,-4.257336,-15.663615,12.64701,6.322598,1.510813,1.862938,15.264624,2.944969
2,Alternaria brassicicola,necrotroph,necrotroph,necrotroph - narrow host range,7.804736,-3.859394,2.972143,18.22682,1.650082,-3.620947,-0.199021,-2.674036,-0.495409,-0.077786,-5.934555,-0.121204,2.709126,2.499152,1.708681,-3.570464
3,Armillaria mellea,necrotroph,necrotroph,necrotroph - broad host range,21.985218,-4.530297,22.45542,-16.84976,-16.705063,-5.928867,-6.786644,4.643309,7.768957,5.854125,0.323093,-7.940342,-2.19685,5.947209,1.817424,0.575801
4,Aspergillus fumigatus,saprotroph,saprotroph,saprotroph,4.934787,-0.790617,-4.519309,3.308799,11.333103,2.445708,-4.878553,-0.852445,0.517279,7.712803,7.079127,0.724315,-2.419157,-0.792004,3.606708,5.958527


In [4]:
import json

columns = dict()
# Reorder columns to logical classes
columns["nomenclature1"] = [
    'saprotroph',
    'biotroph',
    'hemibiotroph',
    'necrotroph',
    'symbiont'
    ]
columns["nomenclature2"] = [
    'saprotroph',
    'biotroph',
    'mesotroph',
    'necrotroph',
    'wilt'
    ]
columns["nomenclature3"] = [
    'saprotroph',
    'biotroph 1',
    'biotroph 2',
    'biotroph 3',
    'mesotroph - internal',
    'mesotroph - external',
    'necrotroph - narrow host range',
    'necrotroph - broad host range',
    'wilt'
    ]

for nomenclature in NOMENCLATURES:
    with open("../catas/data/{}-trophic_classes.json".format(nomenclature), "w") as handle:
        json.dump(columns[nomenclature], handle)

In [5]:
df_means = defaultdict(dict)
for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        df = dfs[version]
        drop = [n for n in NOMENCLATURES if n != nomenclature]
        df_mean = df.drop(drop, axis=1).groupby(nomenclature).mean()
        df_mean = df_mean.loc[columns[nomenclature]]

        df_means[version][nomenclature] = df_mean

        mat_mean = Matrix.from_df(df_mean)
        mat_mean.write("../catas/data/{}-{}-{}-centroids.npz".format(version, TODAY, nomenclature))

df_means[DEFAULT_VERSION][DEFAULT_NOMENCLATURE]

Unnamed: 0_level_0,pc01,pc02,pc03,pc04,pc05,pc06,pc07,pc08,pc09,pc10,pc11,pc12,pc13,pc14,pc15,pc16
nomenclature3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
saprotroph,-12.057478,-7.822239,5.66617,-5.843308,-0.638999,4.466832,-0.91827,-1.999073,2.973201,-1.771914,3.098081,-0.663621,0.382331,0.869797,-0.003102,1.07052
biotroph 1,-21.342632,-9.643083,-0.723562,-4.682462,4.764076,3.07639,-1.08208,3.369592,2.562128,0.478576,-1.015344,-1.101551,2.187042,-0.607146,-1.828773,-1.219463
biotroph 2,-38.15378,19.409073,-3.128182,-0.868712,-2.260894,-1.605195,4.90146,-0.351714,-1.979149,-0.450807,1.84156,0.948329,-0.266074,-0.90906,0.220582,0.109511
biotroph 3,-18.707682,-9.749498,-7.245657,-0.0037,-1.097028,3.924517,-4.785148,5.22418,-5.160805,0.377295,-1.407624,1.14932,-0.901361,2.062088,1.649361,0.986254
mesotroph - internal,49.487354,-0.199436,7.326127,12.674136,0.374741,-2.851277,-1.661372,0.058039,-1.63984,-1.212379,6.469157,1.304574,-2.735288,3.916875,-1.206091,1.522617
mesotroph - external,-3.121336,-4.905912,-0.659381,0.365841,0.315615,-3.513459,-2.525976,0.372049,2.823042,3.096434,-2.810222,2.284872,-1.005175,2.839374,-0.54176,0.427166
necrotroph - narrow host range,27.21303,-7.865903,6.132308,15.308049,-7.175545,0.803661,4.37991,-3.213934,-0.673112,3.459422,-5.628296,-1.841606,1.917331,-0.326963,1.239278,-3.007841
necrotroph - broad host range,15.26347,-3.084845,8.664699,-2.236674,-2.090839,-2.087789,-6.17637,-1.846104,-1.28046,3.96116,-0.616649,-1.114624,-0.717143,-0.674946,0.996206,1.605543
wilt,68.658753,3.249681,-14.821007,-1.120193,7.546867,-3.759955,0.944871,0.412731,0.716115,-7.103314,-0.431012,1.350647,-1.96262,-1.225287,-0.126903,-0.22964


Now that we have our means, we can easily get the distances using numpy or scipy.
I've found that `numpy.linalg.norm` is slightly faster than `scipy.spatial.distance` so i'll use numpy.

At the same time we might as well calculate the relative centroid distance (RCD) using the following formula where $C$ means the array of distances between a single fungal isolate to each of the trophy classes.

$$
RCD_{i} = 1 - \left( \frac{C_i - \min C}{\max C - \min C} \right)
$$

Looking at the numerator in that fraction, if the class that we're looking at ($C_i$) is the minimum class, then the difference is 0 and the $RCD_i$ becomes 1. For all other classes $C_i$ the distance will be a value in the range [0, 1] (and I use square brackets in the formal sense meaning that it can't be 0 or 1 but is a value between).
If the difference between the min and max distances for each class is large, then the denominator becomes large and the curve of possible $RCD_i$'s will be more gradual (ie. less erratic with large values in the numerator).

Before I get RCD though, i need some distances.
There are two easy options that I can use, a scipy one and a numpy one.
I'll try them both and use the faster one.

In [6]:
from numpy.linalg import norm
from scipy.spatial import distance

In [7]:
%time distance.euclidean(dfs[DEFAULT_VERSION].iloc[0,4:], df_means[DEFAULT_VERSION][DEFAULT_NOMENCLATURE].iloc[0])

CPU times: user 796 µs, sys: 0 ns, total: 796 µs
Wall time: 796 µs


24.086301691229398

In [8]:
%time norm(dfs[DEFAULT_VERSION].iloc[0,4:] - df_means[DEFAULT_VERSION][DEFAULT_NOMENCLATURE].iloc[0])

CPU times: user 3.92 ms, sys: 0 ns, total: 3.92 ms
Wall time: 3.63 ms


24.086301691229398

So the numpy norm command is usually a bit faster (though not always).
So I'll implement my distance function as something like this...

```python
def distances(point, centroids):
    results = list()
    for idx in centroids.index:
        results.append(norm(point - centroids.loc[idx]))
    
    new_index = centroids.index
    new_index.name = None
    return pd.Series(results, index=new_index)
```

Except of cource I want to use the same functions as we will for the actual classifications so i'll import it from catas.
The catas version of distances is more-or-less the same as above.

In [9]:
from catas.predict import distances

I also need to implement the RCD in python.
It will look something like this though i'll probably add some bits to make input/output more convenient.

```python
def rcd(arr, names):
    """ Finds the relative centroid distance.
    
    Given an array of distances between two points,
    returns the RCD for each distance in the array.
    """

    min_ = arr.min()
    max_ = arr.max()
    ratio = (arr - min_) / (max_ - min_)
    
    new_index = names
    new_index.name = None
    return pd.Series(1 - ratio, index=names)
```

In [10]:
from catas.predict import rcd

So now we can use the functions!

In [11]:
dfs[version].iloc[0:1, 4:]

Unnamed: 0,pc01,pc02,pc03,pc04,pc05,pc06,pc07,pc08,pc09,pc10,pc11,pc12,pc13,pc14,pc15,pc16
0,-6.276969,-15.067001,19.048088,-18.855778,-5.012601,2.64151,1.74196,5.164346,-0.281394,-3.479802,-1.479,-1.122091,-1.579042,-0.333801,4.70993,0.444683


In [12]:
distance_results = defaultdict(dict)
for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        mat = Matrix.from_df(dfs[version].iloc[0:1, 4:])
        cen = Matrix.from_df(df_means[version][nomenclature])
        results = distances(mat, centroids=cen).as_df()
        distance_results[version][nomenclature] = results
        print(version, nomenclature)
        print(results)

v4 nomenclature1
   saprotroph   biotroph  hemibiotroph  necrotroph   symbiont
0   24.207446  48.464886     42.572164    52.08305  33.164673
v5 nomenclature1
   saprotroph   biotroph  hemibiotroph  necrotroph   symbiont
0   24.086302  48.114535     42.868275   52.510473  32.898895
v6 nomenclature1
   saprotroph   biotroph  hemibiotroph  necrotroph   symbiont
0   25.117509  49.031679     42.678921   53.360188  32.233298
v7 nomenclature1
   saprotroph   biotroph  hemibiotroph  necrotroph   symbiont
0   24.734967  49.072847     42.427871   53.114315  32.082526
v4 nomenclature2
   saprotroph   biotroph  mesotroph  necrotroph       wilt
0   24.207446  40.958685  42.495611   38.159585  86.119347
v5 nomenclature2
   saprotroph   biotroph  mesotroph  necrotroph       wilt
0   24.086302  40.618587   42.81509   38.395277  86.665503
v6 nomenclature2
   saprotroph   biotroph  mesotroph  necrotroph       wilt
0   25.117509  41.215745  42.625412   40.297372  85.879848
v7 nomenclature2
   saprotroph 

In [13]:
for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        rcd_results = rcd(Matrix.from_df(distance_results[version][nomenclature])).as_df()
        print(version, nomenclature)
        print(rcd_results)

v4 nomenclature1
   saprotroph  biotroph  hemibiotroph  necrotroph  symbiont
0         1.0  0.129797       0.34119         0.0  0.678671
v5 nomenclature1
   saprotroph  biotroph  hemibiotroph  necrotroph  symbiont
0         1.0  0.154655      0.339225         0.0  0.689961
v6 nomenclature1
   saprotroph  biotroph  hemibiotroph  necrotroph  symbiont
0         1.0  0.153261      0.378196         0.0  0.748048
v7 nomenclature1
   saprotroph  biotroph  hemibiotroph  necrotroph  symbiont
0         1.0  0.142409      0.376557         0.0  0.741095
v4 nomenclature2
   saprotroph  biotroph  mesotroph  necrotroph  wilt
0         1.0  0.729434    0.70461    0.774645   0.0
v5 nomenclature2
   saprotroph  biotroph  mesotroph  necrotroph  wilt
0         1.0  0.735818   0.700719    0.771346   0.0
v6 nomenclature2
   saprotroph  biotroph  mesotroph  necrotroph  wilt
0         1.0  0.735062   0.711863    0.750176   0.0
v7 nomenclature2
   saprotroph  biotroph  mesotroph  necrotroph  wilt
0         1.0

They seem to work, now i'll apply them to the whole dataframes.

In [14]:
dists = defaultdict(dict)
for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        df = dfs[version].iloc[:,4:]
        df.index = dfs[version]["Species"]
        mat = Matrix.from_df(df)
        cen = Matrix.from_df(df_means[version][nomenclature])
        dists[version][nomenclature] = distances(mat, cen).as_df()

dists[DEFAULT_VERSION][DEFAULT_NOMENCLATURE]

Unnamed: 0,saprotroph,biotroph 1,biotroph 2,biotroph 3,mesotroph - internal,mesotroph - external,necrotroph - narrow host range,necrotroph - broad host range,wilt
Agaricus bisporus,24.086302,32.898895,52.830176,39.497252,66.777311,30.649960,50.822123,32.432919,86.665503
Albugo laibachii,49.255314,46.764533,34.888050,46.414940,104.573291,59.332293,86.196911,74.627479,122.689805
Alternaria brassicicola,34.609063,39.700760,56.625118,37.420932,45.133312,23.254209,23.552379,25.601971,67.998454
Armillaria mellea,46.854128,57.863619,75.882758,59.109223,50.003179,44.228858,43.366773,30.042268,70.049199
Aspergillus fumigatus,29.178765,33.920709,52.266913,32.380902,50.227106,21.428521,39.153206,25.070482,67.971673
Aspergillus nidulans,41.847302,49.993322,67.955063,47.034346,36.604500,32.591408,29.243070,26.562403,54.801566
Botrytis cinerea B05,33.494953,41.765368,59.895219,40.171211,47.850618,28.602100,38.513606,17.140403,63.957050
Botrytis cinerea BcDW1,47.030213,55.882062,74.027153,54.520378,42.817857,41.134087,39.926574,25.005212,53.332301
Botrytis cinerea T4,38.335945,46.884131,65.907169,45.410034,45.149779,33.194333,37.200953,19.202358,59.734224
Blumeria graminis fsp tritici,43.881033,33.917538,38.317082,36.434019,104.954834,51.838373,83.910952,71.037669,123.010280


In [15]:
if not os.path.exists("04-distance_to_centroids"):
    makedirs("04-distance_to_centroids")

rcds = defaultdict(dict)
for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        mat = Matrix.from_df(dists[version][nomenclature])
        df = rcd(mat).as_df()
        rcds[version][nomenclature] = pd.merge(labels[version], df, left_on="Species", right_index=True)
        rcds[version][nomenclature].to_csv(
            "04-distance_to_centroids/{}-{}-rcds.csv".format(version, nomenclature),
            index=False
        )

rcds[DEFAULT_VERSION][DEFAULT_NOMENCLATURE].head()

Unnamed: 0,Species,nomenclature1,nomenclature2,nomenclature3,saprotroph,biotroph 1,biotroph 2,biotroph 3,mesotroph - internal,mesotroph - external,necrotroph - narrow host range,necrotroph - broad host range,wilt
0,Agaricus bisporus,saprotroph,saprotroph,saprotroph,1.0,0.859177,0.54068,0.753737,0.317808,0.895114,0.572768,0.866623,0.0
1,Albugo laibachii,biotroph,biotroph,biotroph 2,0.836367,0.864735,1.0,0.868717,0.206334,0.721597,0.415628,0.547396,0.0
2,Alternaria brassicicola,necrotroph,necrotroph,necrotroph - narrow host range,0.746228,0.632432,0.254185,0.683384,0.511019,1.0,0.993336,0.947529,0.0
3,Armillaria mellea,necrotroph,necrotroph,necrotroph - broad host range,0.633253,0.393083,0.0,0.365911,0.564557,0.690523,0.709329,1.0,0.127258
4,Aspergillus fumigatus,saprotroph,saprotroph,saprotroph,0.833483,0.7316,0.337424,0.764683,0.38125,1.0,0.619177,0.921751,0.0


Ok so we now have our centroids and we have the relative centroid distances but how good is it at classification?
We can pretty easily find the closest centroid for each of the isolates with argmax.

This assumes that the isolates can only belong to one class, but it's a convenient way of looking at this for the start.

In [16]:
# For each row get the column name of the largest value.
predicted_labels = defaultdict(dict)
#np.argmax
for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        predicted_labels[version][nomenclature] = rcds[version][nomenclature].iloc[:, 4:].apply(pd.Series.idxmax, 1)

predicted_labels[DEFAULT_VERSION][DEFAULT_NOMENCLATURE][:5]

0                       saprotroph
1                       biotroph 2
2             mesotroph - external
3    necrotroph - broad host range
4             mesotroph - external
dtype: object

Now we can use the scikit learn evaluation metrics to check this out.

In [17]:
from sklearn.metrics import classification_report

for version in VERSIONS:
    for nomenclature in NOMENCLATURES:
        print(version, nomenclature)
        print(classification_report(
            dfs[version][nomenclature],
            predicted_labels[version][nomenclature]
            ))

v4 nomenclature1
              precision    recall  f1-score   support

    biotroph       0.82      0.60      0.69        30
hemibiotroph       0.22      0.36      0.27        14
  necrotroph       0.64      0.40      0.49        35
  saprotroph       0.29      0.36      0.32        14
    symbiont       0.52      0.78      0.62        18

   micro avg       0.50      0.50      0.50       111
   macro avg       0.50      0.50      0.48       111
weighted avg       0.57      0.50      0.52       111

v4 nomenclature2
              precision    recall  f1-score   support

    biotroph       0.86      0.79      0.83        48
   mesotroph       0.30      0.23      0.26        13
  necrotroph       0.67      0.58      0.62        24
  saprotroph       0.24      0.36      0.29        14
        wilt       0.67      0.83      0.74        12

   micro avg       0.63      0.63      0.63       111
   macro avg       0.55      0.56      0.55       111
weighted avg       0.65      0.63      0.64

So the classification was similar in accuracy to the leave one out logistic regression (except in this case the data are included in the model).

The other thing that might be useful would be if we look at the isolates with multiple labels.
In the paper a threshold distance of 0.95 was used for finding multiclass isolates.

In [18]:
secondary_predicted_labels = defaultdict(dict)

for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        classes = list()
        for i, row in rcds[version][nomenclature].iloc[:,4:].iterrows():
            
            is_class = row > 0.95
            classes.append("; ".join(row.index[is_class]))
        secondary_predicted_labels[version][nomenclature] = classes

secondary_predicted_labels[DEFAULT_VERSION][DEFAULT_NOMENCLATURE][:5]

['saprotroph',
 'biotroph 2',
 'mesotroph - external; necrotroph - narrow host range',
 'necrotroph - broad host range',
 'mesotroph - external']

In [19]:
from copy import copy

real_classes = dict()

for version in VERSIONS:
    real_classes[version] = copy(dfs[version][["Species"] + NOMENCLATURES])
    for nomenclature in NOMENCLATURES:
        real_classes[version][nomenclature + "_predictions"] = secondary_predicted_labels[version][nomenclature]
    
    real_classes[version]["nomenclature3_primary"] = predicted_labels[version]["nomenclature3"].apply(lambda x: x.split(" ")[0])
    real_classes[version].to_csv(
        "04-distance_to_centroids/{}-evaluate_predictions.csv".format(version),
        index=False
        )

real_classes[DEFAULT_VERSION][:5]

Unnamed: 0,Species,nomenclature1,nomenclature2,nomenclature3,nomenclature1_predictions,nomenclature2_predictions,nomenclature3_predictions,nomenclature3_primary
0,Agaricus bisporus,saprotroph,saprotroph,saprotroph,saprotroph,saprotroph,saprotroph,saprotroph
1,Albugo laibachii,biotroph,biotroph,biotroph 2,biotroph,biotroph,biotroph 2,biotroph
2,Alternaria brassicicola,necrotroph,necrotroph,necrotroph - narrow host range,hemibiotroph,necrotroph,mesotroph - external; necrotroph - narrow host...,mesotroph
3,Armillaria mellea,necrotroph,necrotroph,necrotroph - broad host range,hemibiotroph,mesotroph; necrotroph,necrotroph - broad host range,necrotroph
4,Aspergillus fumigatus,saprotroph,saprotroph,saprotroph,saprotroph; hemibiotroph,saprotroph; mesotroph; necrotroph,mesotroph - external,mesotroph


## Classifying the unknown isolates

You might remember that in the PCA notebook we filtered out a number of isolates what we didn't have known classes for.
Here we predict their classes using the RCD method.

In [20]:
# Import the catas methods

import catas.data
from catas.data import Version
from catas.data import Nomenclature
from catas.predict import predict
from catas.predict import distances
from catas.predict import rcd

I'll load the original counts from an earlier notebook

In [21]:
dfs = dict()
labels = dict()

for version in VERSIONS:
    df = pd.read_csv("02-count_cazymes/{}-cazy_counts.csv".format(version), sep="\t")

    labels[version] = df[["Species", "nomenclature1", "nomenclature2", "nomenclature3"]]
    labels[version].reset_index(drop=True, inplace=True)
    df.set_index("Species", drop=True, inplace=True)
    df.drop(["nomenclature1", "nomenclature2", "nomenclature3"], axis=1, inplace=True)
    df = df.astype(np.float32)
    dfs[version] = df

dfs[DEFAULT_VERSION][:5]

Unnamed: 0_level_0,AA1,AA10,AA11,AA12,AA13,AA2,AA3,AA4,AA5,AA6,...,PL3,PL4,PL5,PL6,PL7,PL8,PL9,SLH,cohesin,dockerin
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agaricus bisporus,2.0,0.0,0.0,1.0,0.0,5.0,33.0,1.0,9.0,4.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Albugo laibachii,2.0,0.0,0.0,0.0,0.0,28.0,2.0,3.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alternaria brassicicola,2.0,0.0,4.0,3.0,1.0,7.0,14.0,2.0,3.0,2.0,...,11.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Armillaria mellea,4.0,0.0,1.0,0.0,0.0,12.0,48.0,2.0,4.0,3.0,...,6.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0
Aspergillus fumigatus,1.0,0.0,4.0,0.0,0.0,3.0,11.0,5.0,1.0,2.0,...,3.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Now i'll apply the prediction pipeline across the whole dataframe.
We have to loop this time because the functions we defined are designed to take one record at a time (which is how we intend people to use the tool) rather than a dataframe.

In [22]:
all_preds = defaultdict(dict)

for nomenclature in NOMENCLATURES:
    for version in VERSIONS:
        new_rows = list()

        mat = Matrix.from_df(dfs[version])
        preds = predict(mat, version=Version[version], nomenclature=Nomenclature[nomenclature]).as_df()
        df = pd.merge(labels[version], preds, left_on="Species", right_index=True)        
        df.to_csv("04-distance_to_centroids/{}-{}-all_predictions.csv".format(version, nomenclature), index=False)
        all_preds[version][nomenclature] = df

all_preds[DEFAULT_VERSION][DEFAULT_NOMENCLATURE]

Unnamed: 0,Species,nomenclature1,nomenclature2,nomenclature3,saprotroph,biotroph 1,biotroph 2,biotroph 3,mesotroph - internal,mesotroph - external,necrotroph - narrow host range,necrotroph - broad host range,wilt
0,Agaricus bisporus,saprotroph,saprotroph,saprotroph,1.000000,0.859177,0.540680,0.753737,0.317808,0.895114,0.572768,0.866623,0.000000
1,Albugo laibachii,biotroph,biotroph,biotroph 2,0.836367,0.864735,1.000000,0.868717,0.206334,0.721597,0.415628,0.547396,0.000000
2,Alternaria brassicicola,necrotroph,necrotroph,necrotroph - narrow host range,0.746228,0.632432,0.254185,0.683385,0.511019,1.000000,0.993336,0.947529,0.000000
3,Armillaria mellea,necrotroph,necrotroph,necrotroph - broad host range,0.633253,0.393083,0.000000,0.365911,0.564557,0.690523,0.709329,1.000000,0.127258
4,Aspergillus fumigatus,saprotroph,saprotroph,saprotroph,0.833483,0.731600,0.337424,0.764683,0.381250,1.000000,0.619177,0.921751,0.000000
5,Aspergillus nidulans,saprotroph,saprotroph,saprotroph,0.630734,0.433935,0.000000,0.505421,0.757395,0.854346,0.935238,1.000000,0.317774
6,Botrytis cinerea B05,necrotroph,necrotroph,necrotroph - broad host range,0.650668,0.474013,0.086760,0.508064,0.344032,0.755179,0.543470,1.000000,0.000000
7,Botrytis cinerea BcDW1,necrotroph,necrotroph,necrotroph - broad host range,0.550711,0.370142,0.000000,0.397919,0.636640,0.670987,0.695619,1.000000,0.422155
8,Botrytis cinerea T4,necrotroph,necrotroph,necrotroph - broad host range,0.590329,0.407304,0.000000,0.438866,0.444438,0.700417,0.614631,1.000000,0.132170
9,Blumeria graminis fsp tritici,biotroph,biotroph,biotroph 2,0.888167,1.000000,0.950618,0.971754,0.202659,0.798852,0.438861,0.583354,0.000000


All done!

## Test data

Again, i'm going to generate some quick test data.

I'll just use my functions again because I've checked them all manually and I just want to make sure that any edits or optimisations don't change the results (and to check if it works on other systems).

In [26]:
from catas.data import test_dbcan
from catas.data import models
from catas.data import centroids
from catas.data import cazy_list

from catas.count import cazy_counts_multi
from catas.parsers import FileType
from catas.parsers import parse
from catas.predict import transform

from catas.predict import distances
from catas.predict import rcd

In [32]:
with open(test_dbcan(Version["v5"]), "r") as handle:
    required_cols = cazy_list(Version["v5"])
    counts = parse(handle, FileType["dbcan"])
    cnts = cazy_counts_multi([counts], ["test"], required_cols)
cnts.arr[:5,]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [33]:
model = models(Version["v5"])
trans = transform(cnts, model)

In [34]:
dists = distances(trans, centroids=centroids(Version["v5"]))
dists

<catas.matrix.Matrix at 0x7fbceb7d58d0>

In [36]:
rcd(dists).as_df()

Unnamed: 0,saprotroph,biotroph 1,biotroph 2,biotroph 3,mesotroph - internal,mesotroph - external,necrotroph - narrow host range,necrotroph - broad host range,wilt
test,0.891908,0.973107,1.0,0.923049,0.22047,0.791967,0.447557,0.589808,0.0


In [None]:
dfs[DEFAULT_VERSION]