In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from skbio.diversity import beta_diversity, alpha_diversity
from skbio.stats.ordination import pcoa, pcoa_biplot
from skbio import DistanceMatrix
from scipy.stats import spearmanr, pearsonr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import statsmodels.api as sm 
import umap
from io import StringIO
from os.path import join
import pandas as pd
import xlsxwriter
import openpyxl
import os
import numpy as np
from pandas import Series, ExcelWriter
import scipy.io as sio
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import seaborn as sns
from IPython.display import display, HTML
from fpdf import FPDF
import scanpy as sc 
from anndata import AnnData
import csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.lines import Line2D  # for legend handle
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys  
sys.path.insert(0, '../my_packages')
from dimen_reduc import get_bc_dist, get_pcoa
import meta_ord
import paga_res
import sliding_window
import statistic
from nn_finding import get_closest_nn, get_ps_from_n
from roc_ps import all_func

In [6]:
# 0. Load data
# 1. Find the nearest neighbor of each out sample among the ps samples
# 2. Assign the nearest neighbor's pseudotime to the out sample
# 3. ROC
# 4. Save

#### Variables

In [3]:
# Root directory of the dataset, given relative to the working directory.
top_path = "../../Data/Serrano_Ravel13_Ravel11_Carter22/"

# Input workbooks; both are read with the two sheet names defined below.
ps_file = f"{top_path}ps_res/ps_res_16052023.xlsx"
out_file = f"{top_path}out_df_16052023.xlsx"

# Names of the sheets read from each workbook.
sheet_abun, sheet_meta = 'abundance', 'meta'

# Date tag embedded in the names of the files this notebook writes.
date = '30052023'

#### 0. Load data

In [4]:
# Every workbook is read twice, once per sheet (abundance first, then meta);
# the first column of each sheet becomes the row index.
ps_df, ps_meta = (
    pd.read_excel(ps_file, sheet_name=sheet, index_col=0)
    for sheet in (sheet_abun, sheet_meta)
)
out_df, out_meta = (
    pd.read_excel(out_file, sheet_name=sheet, index_col=0)
    for sheet in (sheet_abun, sheet_meta)
)

# Report (rows, columns) of each table, one per line: abundance tables first,
# then the metadata tables.
print(ps_df.shape, out_df.shape, ps_meta.shape, out_meta.shape, sep='\n')

(7842, 424)
(514, 424)
(7842, 31)
(514, 28)


#### 1. Find nearest neighbor

### Euclidean distance

In [None]:
def get_euc_nn(cros_umap, temp_umap):
    """Pairwise Euclidean distances between the rows of two indexed tables.

    Returns a DataFrame whose rows are labelled with ``cros_umap.index`` and
    whose columns are labelled with ``temp_umap.index``; each cell holds the
    Euclidean distance between the corresponding pair of rows.
    """
    return pd.DataFrame(
        euclidean_distances(cros_umap, temp_umap),
        index=cros_umap.index,
        columns=temp_umap.index,
    )

In [8]:
def get_knn(filt_distance_df, meta_df):
    '''Find, for every reference sample, its closest sample and the distance to it.

    Parameters
    ----------
    filt_distance_df : pandas.DataFrame
        Distance matrix with the candidate (closest) samples as rows and the
        reference samples (i.e. the wanted samples) as columns.
    meta_df : pandas.DataFrame
        Metadata of the reference samples; only its index is used.

    Returns
    -------
    pandas.DataFrame
        One row per label of ``meta_df.index`` with the columns
        ``closest_sample`` (row label holding the column minimum) and
        ``distance`` (that minimum). Reference samples that have no column in
        ``filt_distance_df`` are left as NaN. Columns of ``filt_distance_df``
        that are missing from ``meta_df.index`` are appended after the
        metadata rows, in column order.
    '''
    # Nothing to look up: return the empty result skeleton.
    if filt_distance_df.shape[1] == 0:
        return pd.DataFrame(columns=['closest_sample', 'distance'], index=meta_df.index)

    # Column-wise reductions replace the per-column Python loop and keep
    # 'distance' numeric instead of object-typed.
    nearest = pd.DataFrame({
        'closest_sample': filt_distance_df.idxmin(axis=0),
        'distance': filt_distance_df.min(axis=0),
    })

    # Rows follow the metadata order; distance columns unknown to the metadata
    # are kept at the end rather than dropped.
    row_labels = meta_df.index
    unlisted = nearest.index.difference(row_labels, sort=False)
    if len(unlisted) > 0:
        row_labels = row_labels.append(unlisted)

    return nearest.reindex(row_labels)

In [None]:
# euc_dist_df = get_euc_nn(cros_umap, temp_umap)
# euc_nn_df = get_knn(euc_dist_df, temp_meta)
# print(len(euc_nn_df['closest_sample'].unique()))

### BC distance

In [18]:
def get_filt_distance_df(distance_df, ps_meta, out_meta):
    """Cut a distance matrix down to a rows-by-columns rectangle.

    Columns are exactly the labels of ``out_meta.index``, in that order (a
    label missing from ``distance_df`` raises ``KeyError``). Rows are the rows
    of ``distance_df`` whose label occurs in ``ps_meta.index``, kept in their
    original order.
    """
    row_is_ps = distance_df.index.isin(ps_meta.index)
    return distance_df.loc[row_is_ps, out_meta.index]

In [19]:
# Stack the ps and out abundance tables row-wise and replace missing values
# with 0 before computing distances (the fill does not change the shape).
both_df = pd.concat([ps_df, out_df]).fillna(0)
print(both_df.shape)

# All-vs-all distance matrix over the stacked samples.
bc_dist = get_bc_dist(both_df)
print(bc_dist.shape)

# Keep only the ps rows and the out columns.
filt_bc_dist = get_filt_distance_df(bc_dist, ps_meta, out_meta)
print(filt_bc_dist.shape)

(8356, 424)
(8356, 8356)
(7842, 514)


In [24]:
# Nearest-neighbor lookup on the filtered distance matrix
# (rows: ps samples, columns: out samples).
# NOTE(review): get_closest_nn is imported from nn_finding; presumably it
# returns, per out sample, the closest ps sample and its distance - confirm.
knn_df = get_closest_nn(filt_bc_dist, ps_df, out_df)

#### 2. Assign pseudotime to temporal samples

In [14]:
# Attach the nearest-neighbor results to the out-sample metadata; the column
# listing in the next cell shows 'closest_sample', 'distance',
# 'mt_pseudotime' and 'closest_subCST' added to the frame.
# NOTE(review): out_meta is both input and output, so re-running this cell
# feeds the already-augmented frame back in - confirm get_ps_from_n tolerates
# that, or re-run from the load cell.
out_meta = get_ps_from_n(knn_df, ps_meta, out_meta)

In [26]:
# Inspect the metadata columns after the nearest-neighbor step.
out_meta.columns

Index(['sampleID', 'subjectID', 'day', 'week', 'time', 'ph', 'nugent', 'menst',
       'subCST', 'CST', 'score', 'db', 'nugnet', 'VAG_IRR', 'VAG_ITCH',
       'VAG_ODOR', 'age', 'ethnicity', 'symptoms', 'ABV', 'SBV',
       'BV_medication', 'shannon_index', 'whiff', 'clue', 'vag_fluid',
       'BV_status', 'BV_bin', 'closest_sample', 'distance', 'mt_pseudotime',
       'closest_subCST'],
      dtype='object')

#### 3. ROC

In [30]:
# Row-wise labels derived from a numeric column and a cutoff.
# NOTE(review): label_numeric_bin is defined in meta_ord; presumably it
# binarizes the column against the cutoff - confirm which side is inclusive.
out_meta['nugent_bin'] = out_meta.apply(meta_ord.label_numeric_bin, axis=1, args=('nugent', 7))
out_meta['ph_bin'] = out_meta.apply(meta_ord.label_numeric_bin, axis=1, args=('ph', 5.51))

# Element-wise sum of the two flags; a missing value in either column
# propagates to the result.
out_meta['BV_amsel_bin'] = out_meta['ABV'].add(out_meta['SBV'])

In [49]:
# Run the same ROC routine once per binary label column, in this order.
# NOTE(review): all_func is imported from roc_ps; the meaning of the second
# argument (99) and of the second returned value (*_shuff) is not visible
# here - presumably a shuffle count and the shuffled-label results; confirm.
(nugent_roc_df, nug_shuff), (ph_roc_df, ph_shuff), (amsel_roc_df, amsel_shuff) = (
    all_func(out_meta, 99, label_col)
    for label_col in ('nugent_bin', 'ph_bin', 'BV_amsel_bin')
)

#### 4. Save

In [50]:
# Out-sample abundance table plus its augmented metadata, keyed by name
# (presumably one sheet per key - confirm in paga_res.save_excel).
dict_nn = dict(abundance=out_df, meta=out_meta)
nn_path = f"{top_path}ps_res/nn_res_{date}.xlsx"
paga_res.save_excel(nn_path, dict_nn)

In [51]:
# ROC tables for the three labels, keyed by name
# (presumably one sheet per key - confirm in paga_res.save_excel).
dict_roc = dict(nug=nugent_roc_df, ph=ph_roc_df, bv=amsel_roc_df)
roc_path = f"{top_path}ps_res/nn_roc_{date}.xlsx"
paga_res.save_excel(roc_path, dict_roc)

#### Check

In [7]:
# Sanity check: show the rows whose 'menst' value is greater than 0.
# NOTE(review): the saved output ends at 'BV_bin' and carries execution count
# In [7], so it was produced before the nearest-neighbor columns were added;
# re-run after Restart & Run All to refresh it.
out_meta[out_meta['menst'] > 0]

Unnamed: 0,sampleID,subjectID,day,week,time,ph,nugent,menst,subCST,CST,...,symptoms,ABV,SBV,BV_medication,shannon_index,whiff,clue,vag_fluid,BV_status,BV_bin
SRR13537301,UAB004_2_2,UAB004,2.0,2.0,9.0,4.4,0.0,1.0,I-A,I,...,,,,,0.720075,,,,,0
SRR13537298,UAB004_3_1,UAB004,1.0,3.0,15.0,4.0,0.0,1.0,I-A,I,...,,,,,0.107107,,,,,0
SRR13537297,UAB004_3_3,UAB004,3.0,3.0,17.0,4.0,0.0,1.0,I-A,I,...,,,,,0.111409,,,,,0
SRR13537289,UAB004_5_5,UAB004,5.0,5.0,33.0,4.0,0.0,1.0,I-A,I,...,,,,,0.380722,,,,,0
SRR13537288,UAB004_5_7,UAB004,7.0,5.0,35.0,4.0,0.0,1.0,I-A,I,...,,,,,0.195226,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR906140,UAB116_9_5,UAB116,5.0,9.0,52.0,7.0,,3.0,IV-B,IV-B,...,0.0,0.0,0.0,0.0,2.507989,,,,,0
SRR906139,UAB116_9_6,UAB116,6.0,9.0,53.0,5.5,,2.0,IV-B,IV-B,...,0.0,0.0,0.0,0.0,2.760613,,,,,0
SRR905235,UAB052_9_6,UAB052,6.0,9.0,51.0,4.7,,1.0,V,V,...,0.0,0.0,0.0,0.0,2.575765,,,,,0
SRR906138,UAB116_9_7,UAB116,7.0,9.0,54.0,5.5,,1.0,IV-B,IV-B,...,0.0,0.0,0.0,0.0,2.608431,,,,,0
