In [171]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from skbio.diversity import beta_diversity, alpha_diversity
from skbio.stats.ordination import pcoa, pcoa_biplot
from skbio import DistanceMatrix
from scipy.stats import spearmanr, pearsonr
import statsmodels.api as sm 
import umap
from io import StringIO
from os.path import join
import pandas as pd
import xlsxwriter
import openpyxl
import os
import numpy as np
import itertools
import itertools as it
import kaleido
from pandas import Series, ExcelWriter
import scipy.io as sio
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import seaborn as sns
from IPython.display import display, HTML
from fpdf import FPDF
import scanpy as sc 
from anndata import AnnData
import csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D  # for legend handle

#### Variables

In [103]:
place = "home"

In [104]:
cros_path = "C:/Users/morts/Documents/TAU/BV_Project/Data/SPECIFIC_PAPERS/Serrano_et_al.2019/noDuplicatedID/DADA2_results/SILVA/PAGA_res/"
cros_file_name = "ps_results_IVA0_04072022"

temp_path = 'C:/Users/morts/Documents/TAU/BV_Project/Data/SPECIFIC_PAPERS/Ravel_et_al.2013/'
temp_file_name = 'Ravel2013_data'

sheet_abun = 'abundance'
sheet_meta = 'metadata'
sheet_umap = 'umap'

## Save path
temp_save_path = temp_path
temp_pca_file_path = 'temp_over_ps_05072022/'
temp_save_pca_file_name = "temp_over_ps_knn_opt05072022"

##
pca_cols = ['PCA' + str(i) for i in range(1, 51)] #['PCA1', 'PCA2', 'PCA3']

#### Load data

In [105]:
def get_data(all_file_path, file_name, sheet_name):
    file_full_name = all_file_path + file_name + '.xlsx'        
    df = pd.read_excel(file_full_name, sheet_name = sheet_name, index_col = 0)
    
    if "subjectID" in df.columns:
        df["subjectID"] = df["subjectID"].astype(object)
        
    return df

In [106]:
## Pseudotime DF
cros_df = get_data(cros_path, cros_file_name, sheet_abun)
cros_meta = get_data(cros_path, cros_file_name, sheet_meta)
cros_umap = get_data(cros_path, cros_file_name, sheet_umap)

## Temp DF
temp_df = get_data(temp_path, temp_file_name, sheet_abun)
temp_meta = get_data(temp_path, temp_file_name, 'meta')
temp_meta['DB_type'] = 'temp'
temp_meta.drop('age', inplace = True, axis = 1)


Unknown extension is not supported and will be removed



In [107]:
pattern = '|'.join(['g_', 'f_', 'p_', 'o_'])

In [108]:
cros_df.columns = cros_df.columns.str.replace(pattern, '')
print(cros_df.columns)

Index(['Acetanaerobacterium_elongatum', 'Acidaminococcus_intestini',
       'Acidothermus_sp', 'Acidovorax_temperans', 'Acinetobacter_baumannii',
       'Acinetobacter_haemolyticus', 'Acinetobacter_johnsonii',
       'Acinetobacter_lwoffii', 'Acinetobacter_nosocomialis',
       'Acinetobacter_radioresistens',
       ...
       'Rhodospirillales', 'Rickettsiales', 'SJA-28', 'Saccharimonadales',
       'Veillonellales-Selenomonadales', 'Actinobacteriota', 'Bacteroidota',
       'Firmicutes', 'Proteobacteria', 'WPS-2'],
      dtype='object', length=1126)



The default value of regex will change from True to False in a future version.



#### Intersect species

In [109]:
temp_df.columns = temp_df.columns.str.replace(' ', '_')

In [110]:
int_spec = cros_df.columns & temp_df.columns

print(cros_df.shape)
print(temp_df.shape)
print(len(int_spec))
# print(int_spec)

(3839, 1126)
(1657, 151)
95



Index.__and__ operating as a set operation is deprecated, in the future this will be a logical operation matching Series.__and__.  Use index.intersection(other) instead



In [111]:
int_cros_df = cros_df[int_spec]
int_temp_df = temp_df[int_spec]

both_df = pd.concat([cros_df, temp_df], join = 'inner')
both_df = both_df.fillna(0)

#### Nearest neighbor

In [127]:
def get_euc_nn(cros_df, temp_df):
    dist_arr = euclidean_distances(cros_df, temp_df)
    dist_df = pd.DataFrame(dist_arr, index = cros_df.index, columns = temp_df.index)
    
    return dist_df

In [128]:
def get_pcoa(df, metric):
    arr = df.to_numpy()
    bc_df = beta_diversity(counts = arr, ids = list(df.index), metric = metric, validate = False)

    return bc_df.to_data_frame()

def get_filt_distance_df(distance_df, cros_meta, temp_meta):
    cols_to_keep = temp_meta.index
    rows_to_keep = cros_meta.index
    
    filt_distance_df = distance_df[cols_to_keep]
    filt_distance_df = filt_distance_df[filt_distance_df.index.isin(rows_to_keep)]
    
    return filt_distance_df

In [129]:
def get_knn(filt_distance_df, meta_df):
    res_df = pd.DataFrame(columns = ['closest_sample', 'distance'], index = meta_df.index)
    
    for col in filt_distance_df.columns:
        min_distance = filt_distance_df[col].min()
        min_indx = filt_distance_df[col].idxmin()
        closest_sample = min_indx
                
        ## insert results to res_df
        res_df.loc[col, 'distance'] = min_distance
        res_df.loc[col, 'closest_sample'] = closest_sample
        
    return res_df

In [192]:
def merge_ps(dist_df, temp_meta, cros_meta):
#     cros_meta_c = cros_meta.reset_index(inplace = False)
#     res_df = pd.merge(dist_df, cros_meta_c[['index', 'mt_pseudotime', 'mt_subCST']], left_on = 'closest_sample', right_on = 'index').set_index(dist_df.index)
    
#     temp_meta_c = temp_meta.reset_index(inplace = False)
#     temp_meta_res = pd.concat([temp_meta_c, res_df[['index', 'mt_pseudotime', 'mt_subCST']]])
#     temp_meta_res.set_index('sampleID', inplace = True)

    ##
    dist_df = dist_df.reset_index(inplace = False)
    cros_meta_c = cros_meta.reset_index(inplace = False)
    cros_meta_c.rename(columns = {'index': 'closest_sample'}, inplace = True)

    res_df = pd.merge(dist_df, cros_meta_c[['closest_sample', 'mt_pseudotime', 'mt_subCST']], on = 'closest_sample')
    temp_meta_c = temp_meta.reset_index(inplace = False)
    temp_meta_res = pd.concat([temp_meta_c, res_df[['closest_sample', 'mt_pseudotime', 'mt_subCST']]])
    temp_meta_res.set_index('sampleID', inplace = True)
    
    return res_df

In [131]:
euc_dist_df = get_euc_nn(int_cros_df, int_temp_df)
euc_knn_res_df = get_knn(euc_dist_df, temp_meta)
print(len(euc_knn_res_df['closest_sample'].unique()))

euc_meta = merge_ps(euc_knn_res_df, temp_meta, cros_meta)

688


In [193]:
full_bc_dist_df = get_pcoa(both_df, 'braycurtis')
bc_dist_df = get_filt_distance_df(full_bc_dist_df, cros_meta, temp_meta)
bc_knn_res_df = get_knn(bc_dist_df, temp_meta)
bc_meta = merge_ps(bc_knn_res_df, temp_meta, cros_meta)

In [197]:
bc_meta
temp_meta_c = temp_meta.reset_index(inplace = False)
temp_meta_res = pd.concat([temp_meta_c, bc_meta[['closest_sample', 'mt_pseudotime', 'mt_subCST']]])
bc_meta

Unnamed: 0,sampleID,closest_sample,distance,mt_pseudotime,mt_subCST
0,s3.w1d2,SRR9145799,0.023811,0.749990,III-B
1,s3.w1d3,SRR9145799,0.084673,0.749990,III-B
2,s127.w9d7,SRR9145799,0.079,0.749990,III-B
3,s3.w1d5,SRR9145554,0.144019,0.859855,III-B
4,s3.w1d6,SRR9148331,0.086602,0.896848,IV-B
...,...,...,...,...,...
1651,s135.w9d2,SRR9145671,0.103692,0.693011,I-B
1652,s135.w9d4,SRR9147277,0.18525,0.769340,I-B
1653,s135.w9d6,SRR9148091,0.05345,0.708469,III-A
1654,s135.w10d3,SRR9146145,0.054873,0.680997,III-A


In [191]:
bc_meta.loc['s49.w1d1']

subjectID               49
time                   1.0
week                   1.0
dayInWeek              1.0
ethnicity            White
SBV                    0.0
ABV                    0.0
BV_medication          0.0
ph                     4.0
nugent                 0.0
VAG_ODOR               0.0
VAG_IRR                0.0
VAG_ITCH               0.0
VAG_BURN               0.0
VAG_DIS                0.0
MENSTRU1               NaN
MENSTRU2               NaN
MENSTRU3               NaN
I-A_sim_y         0.999018
I-B_sim_y          0.89852
II_sim_y          0.004234
III-A_sim_y       0.001647
III-B_sim_y       0.097598
IV-A_sim_y        0.001266
IV-B_sim_y        0.009732
IV-C0_sim_y       0.040271
IV-C1_sim_y       0.006205
IV-C2_sim_y        0.02624
IV-C3_sim_y       0.002479
IV-C4_sim_y       0.006519
V_sim_y           0.027199
subCST_y               I-A
score_y           0.999018
CST_y                    I
I-A_sim           0.999018
I-B_sim            0.89852
II_sim            0.004234
I

In [166]:
dist_df = bc_knn_res_df.reset_index(inplace = False)
cros_meta_c = cros_meta.reset_index(inplace = False)
cros_meta_c.rename(columns = {'index': 'closest_sample'}, inplace = True)

res_df = pd.merge(dist_df, cros_meta_c[['closest_sample', 'mt_pseudotime', 'mt_subCST']], on = 'closest_sample')
temp_meta_c = temp_meta.reset_index(inplace = False)
# temp_meta_res = pd.merge(temp_meta_c, res_df[['closest_sample', 'mt_pseudotime', 'mt_subCST']])#, on = 'sampleID')
temp_meta_res = pd.concat([temp_meta_c, res_df[['closest_sample', 'mt_pseudotime', 'mt_subCST']]])
print(temp_meta_c)
print(res_df)
display(temp_meta_res)

        sampleID subjectID  time  week  dayInWeek ethnicity  SBV  ABV  \
0        s3.w1d2         3     2     1          2     Black    0    0   
1        s3.w1d3         3     3     1          3     Black    0    0   
2        s3.w1d5         3     5     1          5     Black    0    0   
3        s3.w1d6         3     6     1          6     Black    0    0   
4        s3.w2d1         3     8     2          1     Black    1    0   
...          ...       ...   ...   ...        ...       ...  ...  ...   
1651  s135.w10d3       135    66    10          3     White    0    0   
1652  s135.w10d4       135    67    10          4     White    0    0   
1653  s135.w10d5       135    68    10          5     White    0    0   
1654  s135.w10d7       135    70    10          7     White    0    0   
1655  s135.w11d1       135    71    11          1     White    0    0   

      BV_medication   ph  ...  IV-C2_sim  IV-C3_sim  IV-C4_sim     V_sim  \
0                 0  4.0  ...   0.015918   0.00

Unnamed: 0,sampleID,subjectID,time,week,dayInWeek,ethnicity,SBV,ABV,BV_medication,ph,...,V_sim,subCST,score,CST,all_simp,all_menst,DB_type,closest_sample,mt_pseudotime,mt_subCST
0,s3.w1d2,3,2.0,1.0,2.0,Black,0.0,0.0,0.0,4.0,...,0.510269,III-B,0.888205,III,0.0,,temp,,,
1,s3.w1d3,3,3.0,1.0,3.0,Black,0.0,0.0,0.0,4.0,...,0.442101,III-B,0.933433,III,0.0,,temp,,,
2,s3.w1d5,3,5.0,1.0,5.0,Black,0.0,0.0,0.0,4.0,...,0.285891,III-B,0.841379,III,0.0,,temp,,,
3,s3.w1d6,3,6.0,1.0,6.0,Black,0.0,0.0,0.0,5.3,...,0.166855,IV-B,0.675053,IV-B,0.0,,temp,,,
4,s3.w2d1,3,8.0,2.0,1.0,Black,1.0,0.0,0.0,4.7,...,0.075886,IV-B,0.531362,IV-B,2.0,,temp,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651,,,,,,,,,,,...,,,,,,,,SRR9145671,0.693011,I-B
1652,,,,,,,,,,,...,,,,,,,,SRR9147277,0.769340,I-B
1653,,,,,,,,,,,...,,,,,,,,SRR9148091,0.708469,III-A
1654,,,,,,,,,,,...,,,,,,,,SRR9146145,0.680997,III-A


In [150]:
bc_knn_res_df.loc['s49.w1d1']

closest_sample    SRR9145038
distance            0.005598
Name: s49.w1d1, dtype: object

In [146]:
bc_dist_df
res_df.filter(regex='s49')

s3.w1d2
s3.w1d3
s3.w1d5
s3.w1d6
s3.w2d1
...
s135.w10d3
s135.w10d4
s135.w10d5
s135.w10d7
s135.w11d1


In [170]:
path = temp_path + 'Ravel2013_data_knn_01' + '.xlsx' 
writer = pd.ExcelWriter(path, engine = 'xlsxwriter')
temp_df.to_excel(writer, sheet_name = 'abundance')
bc_meta.to_excel(writer, sheet_name = 'euc_meta')
res_df.to_excel(writer, sheet_name = 'bc_meta')
writer.save()
writer.close()


Calling close() on already closed file.



#### PCA projection

#### Intersect bacterial species

In [79]:
def get_append(first_df, second_df, filler):
    both_df = pd.concat([first_df, second_df], join = 'inner')  
    both_df_nona = both_df.fillna(filler)

    return both_df_nona

In [81]:
both_df = pd.concat([cros_df, temp_df], join = 'inner')
both_df_nona = both_df.fillna(0)

both_meta_df = pd.concat([cros_meta, temp_meta])

In [83]:
both_df.shape

(5496, 95)

#### PCA for cros data

In [93]:
def get_pca_reg(df): 
    pca = PCA(n_components = 50)
    pca.fit(df)
    pca_array = pca.transform(df)
    pca_df = pd.DataFrame(data = pca_array, columns = pca_cols, index = df.index)
    
    loadings_df = pd.DataFrame(pca.components_.T, columns = pca_cols, index = df.columns)
   
    return pca_df, loadings_df

In [94]:
ps_pca_df, ps_loadings = get_pca_reg(cros_df)

#### Order loadings by intersect species

In [95]:
def get_loadings_int(both_df, loadings_df):
    int_spec_lst = both_df.columns
    both_loadings = loadings_df.loc[int_spec_lst, :]
    not_int_spec = pd.concat([both_loadings, loadings_df]).drop_duplicates(keep = False)
    
    zero_df = pd.DataFrame(columns = not_int_spec.columns, index = not_int_spec.index)
    zero_df.fillna(0, inplace = True)
    
    int_loadings = pd.concat([both_loadings, zero_df])
    int_loadings.reindex(loadings_df.index)
    
    return int_loadings, both_loadings

In [96]:
int_loadings, both_loadings = get_loadings_int(both_df, ps_loadings)
display(both_loadings)

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA41,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,PCA50
Actinomyces_urogenitalis,2.647422e-07,1.550894e-07,-3.035560e-07,1.744388e-07,-3.580321e-07,-9.325089e-07,-5.550238e-07,-5.269796e-07,4.425202e-07,-1.068554e-07,...,-2.720165e-06,-9.206919e-07,9.919444e-06,-3.889971e-06,-2.912627e-06,6.262525e-06,-6.579049e-08,-4.154521e-06,9.206789e-06,-5.662512e-07
Aerococcus_christensenii,4.510621e-04,3.316239e-03,-4.054041e-03,-4.255729e-04,7.171793e-04,-2.168032e-03,-7.152186e-04,4.449002e-03,5.230975e-03,9.557497e-04,...,-8.265750e-02,4.335981e-02,-1.855584e-01,-8.631705e-02,-4.791127e-01,6.681224e-01,-1.320430e-01,1.764212e-01,-2.176005e-01,9.730522e-02
Alloscardovia_omnicolens,-3.335084e-04,6.555467e-04,8.671785e-04,5.405420e-03,-1.690921e-03,-3.249529e-03,-1.367175e-03,6.500776e-03,2.196777e-03,2.834601e-03,...,3.243841e-02,-2.789459e-02,6.130229e-02,-3.544178e-03,2.178293e-02,-2.619463e-02,2.163595e-02,1.067625e-02,-1.272303e-02,-4.131937e-03
Anaerococcus_tetradius,-1.963232e-04,9.488501e-04,-7.195005e-04,1.132297e-03,-7.607525e-04,-2.195675e-03,-1.113307e-03,1.361471e-03,1.087489e-03,4.249494e-04,...,-1.359369e-03,-3.259324e-03,-2.938476e-03,-2.698103e-03,-2.262521e-02,-9.077439e-03,5.789541e-05,-2.902487e-03,1.171205e-02,1.180369e-02
Atopobium_minutum,-1.292108e-05,5.712976e-05,-1.713482e-05,1.559912e-04,-8.312561e-05,-2.052153e-04,-4.497975e-05,2.118712e-04,2.367155e-04,8.023004e-06,...,3.633116e-04,1.518263e-03,-7.635002e-03,-1.090870e-02,7.857808e-03,1.479157e-03,-3.743929e-04,3.356381e-04,-3.052789e-03,-4.501790e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bacteroidales,-9.737420e-06,4.039155e-05,4.821021e-06,4.209529e-05,-3.620391e-05,-1.265618e-04,-7.473837e-05,-7.336332e-05,4.651128e-05,6.440793e-06,...,5.661483e-05,-1.337230e-04,-3.735863e-04,-8.537164e-05,-1.241498e-04,-3.523738e-04,-7.020936e-06,-2.514018e-04,-5.243580e-06,1.537551e-04
Lactobacillales,2.098302e-06,2.133577e-06,-1.804485e-07,5.692212e-06,-2.644168e-06,-1.466849e-05,-7.581515e-06,2.698933e-05,9.716342e-06,-2.246659e-05,...,-1.515823e-05,-3.201079e-05,4.348105e-05,5.087808e-06,-1.600578e-06,1.746225e-05,1.293270e-05,9.536903e-06,8.363285e-06,7.248696e-06
Pseudomonadales,5.352340e-08,2.689481e-09,-3.094234e-08,5.995769e-08,1.971776e-07,3.340747e-08,3.586264e-09,2.331278e-08,-2.222472e-08,4.857340e-08,...,-3.650840e-08,-5.627261e-07,9.836994e-07,1.441526e-07,5.846164e-07,7.796971e-07,-7.287772e-07,5.344429e-07,-2.831426e-07,5.808145e-07
Firmicutes,-9.443926e-07,2.102728e-05,1.742744e-05,-8.881999e-06,-1.541538e-05,-3.921011e-05,-2.808455e-05,-9.846424e-06,3.025312e-05,1.297631e-05,...,3.194801e-05,7.313206e-06,-1.101976e-04,-1.212960e-04,-1.633136e-04,2.678787e-05,-1.451869e-05,-2.241668e-04,1.665929e-04,-1.815232e-04


#### Project PCA on temp data

In [97]:
def get_project(loadings, proj_df):
    new_pca_df = pd.DataFrame(index = proj_df.index)
    loadings_arr = np.array(loadings)
    
    ## Center df
    mean_features = proj_df.mean(axis = 0)
    df_mean_features = proj_df - mean_features
    
    ## Matmul
    for i in range(0, len(loadings_arr.T)):
        res_arr = np.matmul(loadings_arr[:, i], np.array(df_mean_features.T))
        new_pca_df[loadings.columns[i]] = res_arr
        
    return new_pca_df

In [90]:
display(int_temp_df.shape)

(1657, 95)

In [100]:
temp_pca_df = get_project(both_loadings, int_temp_df)
int_ps_pca_df = get_project(both_loadings, int_cros_df)
display(int_ps_pca_df)
display(temp_pca_df)

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA41,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,PCA50
SRR9143172,-67.908128,-53.329212,-4.934998,-4.751471,0.567934,-0.064062,0.827747,-2.301201,-0.974409,-0.402059,...,0.110848,-0.187677,0.442618,-0.074397,0.082332,0.188514,0.054575,0.057944,-0.063073,0.014288
SRR9143173,-69.181103,-54.702183,-5.253613,-5.902259,-0.848516,-0.401881,0.798138,-2.216819,-0.964061,-0.365028,...,0.107447,-0.188871,0.443847,-0.077075,0.081422,0.193326,0.055153,0.061064,-0.065542,0.015916
SRR9143174,35.604081,-9.702166,4.517310,20.865707,32.881901,5.759275,2.490113,-4.121445,-1.233396,-1.220354,...,0.136777,-0.184217,0.482518,-0.041441,0.126327,0.108913,0.070097,0.024199,-0.028739,-0.004935
SRR9143176,-13.146753,21.493088,6.655014,43.862348,-35.763668,65.936747,-32.656532,-6.115487,-2.452374,-1.862952,...,0.119486,-0.257676,0.462721,-0.104046,0.117773,0.206716,0.013628,0.065060,-0.075549,0.005209
SRR9143177,-10.462197,34.488856,25.846357,-12.588829,0.186524,0.845224,1.574847,-0.664729,-2.084765,1.103367,...,-0.262121,0.385854,-0.419247,0.180751,0.223296,-0.199494,-0.075321,0.270851,-0.210151,-0.122763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR9149699,-12.192810,16.865198,10.629862,36.111150,-10.061390,-23.003614,-6.855722,65.128597,-52.091067,-30.887379,...,0.329307,-0.023414,0.331439,1.055844,-0.645530,-0.182896,-0.088145,0.323487,-0.509812,0.177383
SRR9149700,67.491326,-27.148678,-0.071964,-3.379930,-1.640460,-0.530572,0.946243,-2.300272,-0.920935,-0.297906,...,0.119275,-0.168524,0.454264,-0.041127,0.151390,0.088532,0.063588,0.022466,-0.052420,-0.009802
SRR9149701,-11.784777,15.778569,9.706060,27.302087,-6.086137,-10.923875,-2.918275,3.937228,1.488366,3.301226,...,-1.277986,0.380872,0.634552,0.324502,0.759043,0.292134,0.182506,0.323293,0.001470,0.147628
SRR9149702,69.238010,-28.244622,-0.338494,-4.337956,-3.005973,-0.941678,0.935836,-2.216962,-0.892248,-0.263216,...,0.117627,-0.167120,0.453907,-0.042355,0.151016,0.091691,0.065146,0.023095,-0.048973,-0.008008


Unnamed: 0_level_0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA41,PCA42,PCA43,PCA44,PCA45,PCA46,PCA47,PCA48,PCA49,PCA50
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s3.w1d1,35.702536,-21.736640,7.329712,3.865769,16.224964,4.688633,2.010919,-3.332602,-3.563839,0.177269,...,0.178108,-0.507952,1.379724,-0.097139,1.213021,-0.362208,-1.465001,-0.184965,0.329191,-0.137694
s3.w1d2,33.982946,-25.846273,9.463847,5.249200,16.344714,4.340611,1.873695,-3.356470,-3.228236,0.051285,...,0.113817,-0.465396,1.238000,-0.153655,0.897469,0.060662,-1.547725,-0.083595,0.202974,-0.075575
s3.w1d3,31.500618,-29.400942,8.063462,1.183373,11.275906,3.493204,1.674983,-3.085200,-3.263343,0.191574,...,0.166058,-0.498569,1.362361,-0.102467,1.196379,-0.344191,-1.468210,-0.184155,0.326122,-0.134254
s3.w1d5,22.712943,-2.251380,-1.726721,-4.743417,-1.172093,0.498118,0.896775,0.537784,-5.182490,1.378410,...,-1.143410,-0.265909,-0.106008,-1.484941,-4.000966,7.574240,-2.909431,1.712302,-1.686165,1.003242
s3.w1d6,-1.101138,27.324947,-20.257073,-14.184626,1.415433,3.862887,2.029664,-0.056132,-6.029919,2.207090,...,-0.621679,0.010322,-0.711097,-0.923427,-3.393328,5.974247,-2.729384,1.421133,-1.636122,0.776404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s135.w10d3,45.832411,-31.395086,6.467680,-6.405186,-1.530561,0.759358,0.999787,-2.088731,-2.985244,0.623562,...,0.145404,-0.433958,1.186758,-0.113240,1.050926,-0.209181,-1.549968,-0.173382,0.282986,-0.108055
s135.w10d4,45.629926,-31.246604,6.271457,-6.558919,-1.529348,0.789604,1.010442,-2.056613,-3.014374,0.583136,...,0.124515,-0.424628,1.153722,-0.141353,0.934868,-0.032063,-1.556909,-0.135773,0.250099,-0.086024
s135.w10d5,45.228419,-21.633892,3.104380,-7.383212,-2.104396,0.831559,1.046924,-1.606182,-3.372871,0.854544,...,0.148503,-0.407386,1.097557,-0.096174,1.032417,-0.259651,-1.561283,-0.181886,0.271388,-0.103093
s135.w10d7,24.314169,-34.786920,5.290241,-6.471891,0.196115,1.309477,1.115983,-2.242682,-3.152652,0.555142,...,0.158909,-0.463263,1.265255,-0.094904,1.148405,-0.322022,-1.514156,-0.173539,0.291716,-0.124399


In [101]:
path = cros_path + 'pca_projection_results_05072022' + '.xlsx' 
writer = pd.ExcelWriter(path, engine = 'xlsxwriter')
int_ps_pca_df.to_excel(writer, sheet_name = 'cros_pca_df')
temp_pca_df.to_excel(writer, sheet_name = 'temp_pca_df')
writer.save()
writer.close()


Calling close() on already closed file.



#### Find euclidean nearest neighbor

In [None]:
def get_euc_nn(ps_df, temp_df):
    dist_arr = euclidean_distances(ps_df, temp_df)
    dist_df = pd.DataFrame(dist_arr, index = ps_df.index, columns = temp_df.index)
    
    return dist_df

In [None]:
euc_dist_df = get_euc_nn(int_ps_pca_df, temp_pca_df)

In [None]:
def get_knn(filt_distance_df, meta_df):
    res_df = pd.DataFrame(columns = ['closest_sample', 'distance'], index = meta_df.index)
    for col in filt_distance_df.columns:
        min_distance = filt_distance_df[col].min()
        min_indx = filt_distance_df[col].idxmin()
        closest_sample = min_indx
                
        ## insert results to res_df
        res_df.loc[col, 'distance'] = min_distance
        res_df.loc[col, 'closest_sample'] = closest_sample
        
    return res_df

In [None]:
euc_knn_res_df = get_knn(euc_dist_df, temp_meta_df)
print(len(euc_knn_res_df['closest_sample'].unique()))
# display(knn_res_df.sort_values('distance'))

In [None]:
both_pca_df = get_append(int_ps_pca_df[pca_cols[:3]], temp_pca_df[pca_cols[:3]], np.nan)

#### Find BC nearest neighbor

In [None]:
def get_pcoa(df, metric):
    arr = df.to_numpy()
    bc_df = beta_diversity(counts = arr, ids = list(df.index), metric = metric, validate = False)

    return bc_df.to_data_frame()

In [None]:
def get_filt_distance_df(distance_df, ps_meta_df, meta_df):
    cols_to_keep = meta_df.index
    rows_to_keep = ps_meta_df.index
    
    filt_distance_df = distance_df[cols_to_keep]
    filt_distance_df = filt_distance_df[filt_distance_df.index.isin(rows_to_keep)]
    
    return filt_distance_df

In [None]:
def get_knn(filt_distance_df, meta_df):
    res_df = pd.DataFrame(columns = ['closest_sample', 'distance'], index = meta_df.index)
    for col in filt_distance_df.columns:
        min_distance = filt_distance_df[col].min()
        min_indx = filt_distance_df[col].idxmin()
        closest_sample = min_indx
                
        ## insert results to res_df
        res_df.loc[col, 'distance'] = min_distance
        res_df.loc[col, 'closest_sample'] = closest_sample
        
    return res_df

In [None]:
bc_dist_df = get_pcoa(both_df, 'braycurtis')
filt_distance_df = get_filt_distance_df(bc_dist_df, ps_meta_df, temp_meta_df)
bc_knn_res_df = get_knn(filt_distance_df, temp_meta_df)

In [None]:
display(len(bc_knn_res_df['closest_sample'].unique()))