In [10]:
from sklearn.neighbors import BallTree
from skbio import OrdinationResults
import pandas as pd
import numpy as np
import qgrid
from knn import KNN
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

In [2]:
qgrid.enable(dataframe=True)
qgrid.set_grid_option('maxVisibleRows', 10)
qgrid.set_grid_option('forceFitColumns', False)
qgrid.set_grid_option('defaultColumnWidth', 120)
#x = pd.DataFrame(np.random.random((100, 100)))

In [3]:
pcoa = OrdinationResults.read('data/unweighted-unifrac-rare5000-pcoa/ordination.txt')
metadata = pd.read_table('healthy_metadata.txt', 
                         index_col=0,dtype=str,
                         na_values=['Unspecified', 'NaN'])

In [37]:
m = (pcoa.samples**2).sum(axis=0)
coords = pcoa.samples.loc[:, pcoa.proportion_explained.cumsum() < 0.99]
coords, metadata = coords.align(metadata, axis=0, join='inner')
metadata = metadata.sample(frac=1) 
#metadata['subset_healthy'] = np.logical_or( 
#    metadata.subset_healthy == 'Yes',
#    metadata.subset_healthy == 'True'
#)

coords = coords.reindex(index=metadata.index)

In [45]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder


_X = coords.iloc[:8000]
_y = metadata.iloc[:8000].age_corrected

_y = _y.dropna()
X, y = _X.align(_y, axis=0, join='inner')
y = y.astype(np.float)
X = X.values
y = y.values

#le = LabelEncoder()
#y = le.fit_transform(y)

#skf = StratifiedKFold(n_splits=2)
#gen = skf.split(X, y)
#train_index, test_index = next(gen)

model = KNeighborsRegressor()
model.fit(X[train_index], y[train_index])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [46]:
pred = model.predict(coords.iloc[8000:])

In [52]:
df = pd.DataFrame({'pred': pred, 
                   'obs': metadata.iloc[8000:].age_corrected})
df = df.dropna().astype(np.float)
np.abs(df.obs - df.pred).mean()

14.346666666666668

In [35]:
list(metadata.columns)

['cardiovascular_disease',
 'cat',
 'cdiff',
 'census_region',
 'chickenpox',
 'clinical_condition',
 'collection_date',
 'collection_month',
 'collection_season',
 'collection_time',
 'collection_timestamp',
 'consume_animal_products_abx',
 'contraceptive',
 'cosmetics_frequency',
 'country',
 'country_of_birth',
 'country_residence',
 'csection',
 'deodorant_use',
 'depression_bipolar_schizophrenia',
 'depth',
 'description',
 'diabetes',
 'diabetes_type',
 'diet_type',
 'dna_extracted',
 'dog',
 'dominant_hand',
 'drinking_water_source',
 'drinks_per_session',
 'economic_region',
 'elevation',
 'env_biome',
 'env_feature',
 'env_material',
 'env_package',
 'epilepsy_or_seizure_disorder',
 'exercise_frequency',
 'exercise_location',
 'fed_as_infant',
 'fermented_plant_frequency',
 'flossing_frequency',
 'flu_vaccine_date',
 'frozen_dessert_frequency',
 'fruit_frequency',
 'fungal_overgrowth',
 'geo_loc_name',
 'gluten',
 'has_physical_specimen',
 'height_cm',
 'height_units',
 'high_

In [198]:

pred = model.predict(coords.iloc[:1000], metadata.iloc[:5000])

ValueError: query data dimension must match training data dimension

In [199]:
pred = pd.Series(pred, index=coords.iloc[:1000].index, name='pred')

In [200]:
from sklearn.metrics import roc_auc_score
obs_md = metadata.iloc[:5000]
y_pred = pred==1
#y_obs = obs_md.autoimmune != 'I do not have this condition'
y_obs = obs_md.subset_healthy
roc_auc_score(y_pred, y_obs, average='weighted')

ValueError: Found input variables with inconsistent numbers of samples: [1000, 5000]

In [126]:
np.sum(np.logical_and(y_pred, y_obs)) / np.sum(y_obs)

0.53313981615868411

In [127]:
pd.DataFrame({'y_pred': y_pred, 'y_obs': y_obs}, index=obs_md.index)

A Jupyter Widget

In [4]:
#idx = np.arange(coords.shape[0])
#np.random.shuffle(idx)
#train_idx = idx[:9000]
#test_idx = idx[9000:]

In [175]:
(metadata.healthy=='False').sum()

5674

In [17]:
knn = KNN(k=20) 
model = knn.fit(X=coords.iloc[:9000], 
                y=metadata.iloc[:9000]) 

In [18]:
i = 9000
gen = model.query(coords.iloc[i:])

In [25]:
dist, md = next(gen)
i += 1

In [26]:
metadata.loc[coords.iloc[i].name].healthy

'True'

In [27]:
pd.DataFrame({"healthy": md.healthy=='True', "dist": dist})

A Jupyter Widget

In [428]:
metadata.loc[coords.loc['10317.000017664'].name]

A Jupyter Widget

In [349]:
pred_md = model.predict(X=coords.iloc[9400:])

KeyboardInterrupt: 

In [106]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

def compare(obs_md, pred_md):
    res = pd.DataFrame(columns=['type', 
                                'accuracy', 'f1_score',
                                'MAE', '5%', '25%', '50%', '75%', '95%', 'ratio'
                               ],
                       index=obs_md.columns)
    for c in obs_md.columns:
        obs_col = obs_md[c].copy()
        pred_col = pred_md[c].copy()
        try:
            # recognize that it is a floating point
            obs_col = obs_col.astype(np.float)
            res.loc[c, 'MAE'] = np.mean(np.abs(obs_col - pred_col).dropna())            
            lower = np.nanpercentile(obs_col, 5)
            upper = np.nanpercentile(obs_col, 95)
            med = np.nanpercentile(obs_col, 50)
            r = (med - lower) / (upper - lower)
            if np.isnan(r):
                r = 1
            res.loc[c, '5%'] = lower
            res.loc[c, '25%'] = np.nanpercentile(obs_col, 25)
            res.loc[c, '50%'] = med
            res.loc[c, '75%'] = np.nanpercentile(obs_col, 75)
            res.loc[c, '95%'] = upper
            res.loc[c, 'ratio'] = r
            res.loc[c, 'type'] = 'continuous'
        except:
            obs_col = obs_col.astype(np.str)
            pred_col = pred_col.astype(np.str)
            le = LabelEncoder()
            obs_col = obs_col.fillna('NaN')
            pred_col = pred_col.fillna('NaN')
            vals = set(obs_col.values) | set(pred_col.values)
            le = le.fit(list(vals))
            obs = le.transform(obs_col.values)
            pred = le.transform(pred_col.values)
            x = (obs == pred).astype(np.float)
            res.loc[c, 'accuracy'] = np.sum(x) / len(obs_col)
            res.loc[c, 'f1_score'] = f1_score(obs, pred, average='weighted')
            res.loc[c, 'type'] = 'categorical'
    return res

In [107]:
stats = compare(obs_md, pred_md)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [112]:
stats.loc[stats.type=='categorical'].dropna(axis=1)

A Jupyter Widget

In [113]:
stats.loc[stats.type=='continuous'].dropna(axis=1)

A Jupyter Widget

In [124]:
pd.DataFrame({'obs': obs_md['ibd'], 
              'pred': pred_md['ibd']})

A Jupyter Widget