### Objective: 

For a given gene, get the set of N nearest neighbors in UMAP space.

### Imports: 

In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

### Files: 

In [2]:
# CSV with the UMAP embedding; the load cell below reads the columns
# Rv_ID, gene_name, Annotation, func_tuberculist, u1, u2 and u3 from it.
# TODO: review and document how this file was generated.
file_umap = '../../data/tests/df_umap_lfc.csv'

### Functions: 

In [3]:
def get_neighbors(data, n_neighbors = 5, ix = None, **kwargs): 
    """
    Given an input dataframe with a given coordinate system, 
    the function returns the indices of the nearest neighbors
    of every data point, or of one specific data point. 

    Params 
    ------
    data (pd.DataFrame)
        Dataframe whose columns represent a given coordinate system where distances
        can be calculated.

    n_neighbors (int, default = 5)
        Number of nearest neighbors. 
        
    ix (int, default = None)
        Positional (row) index of the data point for which to find the
        nearest neighbors. Defaults to None, to return the indices of all
        data points. 

    **kwargs
        Extra keyword arguments forwarded to sklearn's NearestNeighbors
        constructor (for example metric or algorithm).

    Returns 
    -------
    neighbor_ixs (np.array)
        Array of the row positions of the nearest neighbors. Shape is
        (n_datapoints, n_neighbors) when ix is None, and (n_neighbors,)
        when ix is given.
    """

    # Initialize and fit the Nearest Neighbors object; kwargs used to be
    # accepted but silently ignored, they now reach the estimator.
    knn = NearestNeighbors(n_neighbors = n_neighbors, **kwargs).fit(data)

    # Without a query argument, kneighbors() returns the neighbors of every
    # fitted point and does not count a point as its own neighbor.
    # The first element of the returned pair (the distances) is not needed.
    _, neighbor_ixs = knn.kneighbors()

    # Return only the row of a specific point when one was requested.
    if ix is not None: 
        return neighbor_ixs[ix]

    return neighbor_ixs


def get_neighbor_subdataset(neighbors, data, col_name, id_): 

    """
    Using the precomputed neighbor indices, returns a subdataset 
    with the neighbors of the data point identified by id_. 

    Params 
    ------

    neighbors (np.array)
        Neighbor indices of size (n_datapoints, n_neighbors). Row i is used
        as the row positions (in data) of the neighbors of the i-th row of data.

    data (pd.DataFrame)
        Pandas dataframe to be filtered. Its rows are assumed to be in the
        same order as the points from which neighbors was computed.

    col_name (str)
        Name of the column of data that holds the identifiers. 

    id_ (str)
        Identifier of the query data point, looked up in data[col_name].
        If it occurs more than once, the first occurrence is used.
    
    Returns 
    -------
    sub_df (pd.DataFrame)
        Subset of data of the point's neighbors.

    Raises
    ------
    IndexError
        If id_ is not present in data[col_name].
    """

    # Locate the query point by row POSITION, not by index label: both
    # `neighbors` and `.iloc` below are positional, so a label is only
    # correct when data happens to carry a default 0..n-1 index.
    is_target = (data[col_name] == id_).to_numpy(dtype = bool, na_value = False)
    target_positions = is_target.nonzero()[0]

    if target_positions.size == 0:
        raise IndexError(
            "{!r} not found in column {!r}".format(id_, col_name)
        )

    target_pos = target_positions[0]

    # Get the neighbor row positions of the desired data point
    neighbor_ixs = neighbors[target_pos]
    
    # Get subdataset using iloc method (positional selection)
    sub_df = data.iloc[neighbor_ixs]
    
    return sub_df 

### Example: 

Load the UMAP dataframe:

In [4]:
# Read the UMAP table and keep only the identifier, annotation and
# embedding-coordinate columns, in this order.
umap_columns = ['Rv_ID', 'gene_name', 'Annotation', 'func_tuberculist', 'u1', 'u2', 'u3']
df_lfc_annot = pd.read_csv(file_umap)[umap_columns]
df_lfc_annot.head(3)

Unnamed: 0,Rv_ID,gene_name,Annotation,func_tuberculist,u1,u2,u3
0,Rv0001,dnaA,4.0,information pathways,20.734426,18.285404,6.110508
1,Rv0002,dnaN,4.0,information pathways,21.29206,18.294655,6.440992
2,Rv0003,recF,2.0,information pathways,19.92127,16.08922,4.47744


This call pre-computes the N nearest neighbors of every gene and stores them in an array of neighbor indices, with one row per gene.

In [54]:
# Neighbors are searched in the 3-D embedding (columns u1, u2, u3);
# 60 neighbors are requested per gene, for every gene (ix = None).
umap_coords = df_lfc_annot[['u1', 'u2', 'u3']]
neighbors = get_neighbors(umap_coords, n_neighbors = 60, ix = None)

In [55]:
# Peek at the precomputed neighbor indices (numpy abbreviates large arrays).
neighbors

array([[ 554, 1833, 3006, ..., 1420,  233, 1040],
       [3719, 2789, 3004, ..., 2435,  233, 3881],
       [3505, 1256,  834, ..., 3605, 3511, 1745],
       ...,
       [3393, 3785, 2980, ..., 2938, 2286, 1756],
       [2755, 1945,  688, ...,   65, 2430, 1979],
       [1521, 3090, 3767, ..., 2769,  657, 3703]])

And here we select the N nearest neighbors for a given query gene: 

In [56]:
# Query gene whose neighborhood we want to inspect.
rvid = 'Rv3502c'

# Rows of df_lfc_annot belonging to the precomputed neighbors of the query gene.
df_subset = get_neighbor_subdataset(
    neighbors,
    df_lfc_annot,
    col_name = 'Rv_ID',
    id_ = rvid,
)

If we only want to display the neighbors with a high UniProt annotation score, we could do this: 

In [37]:
# Keep only the neighbors whose Annotation value is at least 3.
is_well_annotated = df_subset['Annotation'].values >= 3
df_subset[is_well_annotated]

Unnamed: 0,Rv_ID,gene_name,Annotation,func_tuberculist,u1,u2,u3
3666,Rv3544c,fadE28,5.0,lipid metabolism,18.45998,16.051607,4.671244
3665,Rv3543c,fadE29,5.0,lipid metabolism,18.492285,16.031841,4.676585
3694,Rv3571,hmp,5.0,intermediary metabolism and respiration,18.380219,16.04446,4.769896
498,Rv0485,-,3.0,regulatory proteins,18.539738,16.065975,4.694098
3648,Rv3526,-,5.0,intermediary metabolism and respiration,18.441978,15.95345,4.800039
3692,Rv3569c,bphD,5.0,intermediary metabolism and respiration,18.531462,15.931683,4.76603
208,Rv0202c,mmpL11,3.0,cell wall and cell processes,18.348436,16.226667,4.718264
195,Rv0190,-,5.0,conserved hypotheticals,18.481443,16.256952,4.554314
4000,Rv3868,-,5.0,cell wall and cell processes,18.330515,16.150555,4.826209
460,Rv0450c,mmpL4,4.0,cell wall and cell processes,18.609772,16.179726,4.747845


#### Reformat data for Anisha's dash app:

In [67]:
# Build the nearest-neighbor table for the dash app: one row per gene with
# the Rv_IDs of its first annotated neighbors, in the order given by `neighbors`.
N_NN_DASH = 10         # number of neighbor columns (NN_1 ... NN_10)
MIN_ANNOTATION = 3     # keep only neighbors with Annotation >= this value
PAD_VALUE = 'NONE'     # filler when fewer than N_NN_DASH neighbors qualify

cols = ['Rv_ID'] + ['NN_' + str(i) for i in range(1, N_NN_DASH + 1)]
list_rvid = df_lfc_annot.Rv_ID.to_list()

# Collect the rows in a plain list and build the dataframe once at the end:
# growing a dataframe row by row with .loc copies data on every append.
rows = []
for counter, rvid in enumerate(list_rvid):
    if counter % 250 == 0:
        print(counter, 'out of', len(list_rvid))

    # get neighbors: 
    df_NN = get_neighbor_subdataset(neighbors, df_lfc_annot, col_name = 'Rv_ID', id_ = rvid)
    df_NN_annot = df_NN[df_NN.Annotation.values >= MIN_ANNOTATION]

    # Select the identifier column by name rather than by position 0.
    nn_ids = df_NN_annot['Rv_ID'].iloc[:N_NN_DASH].to_list()

    # Pad so that every row has exactly N_NN_DASH neighbor entries
    # (multiplying a list by 0 adds nothing when the row is already full).
    nn_ids = nn_ids + (N_NN_DASH - len(nn_ids)) * [PAD_VALUE]

    rows.append([rvid] + nn_ids)

df_NN_dash = pd.DataFrame(rows, columns = cols)

0 out of 4058
250 out of 4058
500 out of 4058
750 out of 4058
1000 out of 4058
1250 out of 4058
1500 out of 4058
1750 out of 4058
2000 out of 4058
2250 out of 4058
2500 out of 4058
2750 out of 4058
3000 out of 4058
3250 out of 4058
3500 out of 4058
3750 out of 4058
4000 out of 4058


In [72]:
# Save the neighbor table for the dash app; index = False leaves the
# dataframe's row index out of the CSV.
file_out = '../../data/NN_for_dash.csv'
df_NN_dash.to_csv(file_out, index = False)

In [71]:
# Spot-check the table: show the row of a single gene.
is_query_gene = df_NN_dash['Rv_ID'].values == 'Rv3400'
df_NN_dash[is_query_gene]

Unnamed: 0,Rv_ID,NN_1,NN_2,NN_3,NN_4,NN_5,NN_6,NN_7,NN_8,NN_9,NN_10
3522,Rv3400,Rv3876,Rv3869,Rv3877,Rv3870,Rv3883c,Rv2069,Rv3882c,Rv1236,Rv3871,Rv0450c
