# Part 2 - Patient Similarity Networks (PSNs)

## Table of Contents

Part 2.

10. **Patient Similarity Network Construction**

11. **DNA Methylation Network Analysis** 

In [None]:
# standard libraries
import os
import pickle

# scientific and data manipulation libraries
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.feature_selection import mutual_info_regression
from astropy.stats import median_absolute_deviation
import mygene
import astropy

# graph and network libraries
import networkx as nx

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import Image
from IPython.display import display

import warnings
# NOTE: an unqualified 'ignore' filter suppresses every warning category for
# the rest of the session, not just a specific noisy one.
warnings.filterwarnings('ignore')

# import custom functions from the previous notebook
import sys
# Prepend /tutorial/ to the module search path so it is searched first
# (presumably where functions.py lives - confirm for your environment).
sys.path.insert(0 , '/tutorial/')
# NOTE(review): wildcard import. The helpers called later in this notebook
# (create_graph_from_correlation, visualise_graph, clean_graph,
# knn_sparsification, print_graph_info) are presumably provided by this module.
from functions import *

# 10. Patient Similarity Network (PSN)

- Based on the same expression matrix, we can create a patient similarity network.
- Transposing the matrix switches the rows and columns, so that patients become the columns instead of genes.
- This lets you compute the correlation (or similarity) between patients based on their gene expression profiles,
  and then create a network where nodes represent patients and edges represent similarities.

In [None]:
# Project data locations, both rooted at /data.
data_root_dir = '/data'
raw_data_dir = f'{data_root_dir}/raw'                    # read below for the .pkl and .tsv inputs
intermediate_data_dir = f'{data_root_dir}/intermediate'  # read below for expression_data_filtered.csv

In [None]:
# Read the filtered expression table from the intermediate data directory,
# using the first CSV column as the row index.
expression_csv_path = os.path.join(intermediate_data_dir, "expression_data_filtered.csv")
df_renamed = pd.read_csv(expression_csv_path, index_col=0)

In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

We will now transpose the df_renamed df so that the rows represent the genes and the columns represent the patients.
Let's call the transposed df patient_gene_matrix.

'''

In [None]:
# Display the transposed matrix created in the exercise cell above.
patient_gene_matrix

In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

We will now calculate the correlation matrix for the patient_gene_matrix using the Pearson correlation method.
Store the correlation matrix in a dictionary called patient_correlation_matrices with the key 'pearson'.
We don't have to do it, however if you want to calculate the correlation matrix using other methods,
you can do so and store them in the dictionary as well.

'''

# Dictionary to store different correlation matrices


# Pearson correlation


In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

Create a graph from the correlation matrix using the create_graph_from_correlation function.
Set the threshold to 0.8.
Store the graph in a variable called patient_pearson_graph.

'''


In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

Visualise the graph using the visualise_graph function.
Use an appropriate title for the graph as the second argument.

'''

In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

Now use the clean_graph function to clean patient_pearson_graph and store the result in a variable called patient_pearson_graph_pruned.
Consider the following parameters:
- degree_threshold
- keep_largest_component

'''


In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

Visualise the pruned graph using the visualise_graph function.
Use an appropriate title for the graph as the second argument.

'''



In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

 Now do some sparsification of the graph using knn_sparsification function,
 call it patient_pearson_graph_pruned_knn.
 Set the k value to 10.               

'''



In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################

Let's see some information about the graph using the print_graph_info function.
First, print the information about the patient_pearson_graph_pruned graph.
Use print("------------------------------------") as a divider between the two graphs.
Then, print the information about the patient_pearson_graph_pruned_knn graph.            

'''



In [None]:
# Draw the k-NN sparsified patient network built in the exercise cell above.
visualise_graph(patient_pearson_graph_pruned_knn, title='K-Nearest Neighbors (k=10) Patient Correlation Network')

# 11. DNA methylation PSN

In the second task, we are preparing an additional network for the same patients, this time based on DNA methylation data.

In [None]:
# Open ISMB_TCGA_DNAm.pkl from the raw data directory and unpickle it.
dnam_pickle_path = os.path.join(raw_data_dir, "ISMB_TCGA_DNAm.pkl")
with open(dnam_pickle_path, 'rb') as file:
    data = pd.read_pickle(file)

# The loaded object is indexed by key; "datExpr" holds the methylation table.
meth_data = data["datExpr"]
meth_data


In [None]:
# Unpickle ISMB_TCGA_GE.pkl from the raw data directory into GE_data.
ge_pickle_path = os.path.join(raw_data_dir, "ISMB_TCGA_GE.pkl")
with open(ge_pickle_path, 'rb') as file:
    GE_data = pickle.load(file)

In [None]:
# Reminder of the GE_data structure: the "patient" field of GE_data["datMeta"]
# lists the patients; .to_list() returns it as a plain Python list.
GE_data["datMeta"]["patient"].to_list()

We don't want to include all CpG sites in our analysis, so we are using a dataset from the EWAS Catalog that contains smoking-related CpG sites.

In [None]:
# Read the tab-separated smoking.tsv table from the raw data directory.
smoking_tsv_path = os.path.join(raw_data_dir, "smoking.tsv")
smoking_df = pd.read_csv(smoking_tsv_path, sep='\t')
smoking_df

In [None]:
'''
#################################################
                YOUR CODE HERE
#################################################
1. Identify CpG sites that are commonly annotated in the smoking dataset
2. Filter the DNA methylation data to include only the common CpG sites identified in the previous step
3. Identify patients that are present in both the gene expression dataset and the methylation dataset
4. Filter the methylation data to include only the common patients and common CpG sites
5. Transpose the filtered methylation data matrix
'''

# Step 1: Count the occurrences of each unique value in the 'cpg' column using value_counts


# Step 2: Filter the counts to keep only those greater than 10


# Step 3: Get the index of the filtered counts and convert it to a list


# Step 4: Identify common CpG sites between the annotated list and the methylation dataset


# Step 5: Convert the cpgs set to a list


# Step 6: Identify common patients between the gene expression and methylation datasets
# remember how to get the list of patients from dataset and to convert it to a list


# Step 7: Filter the methylation data to include only the common patients and common CpG sites


# Step 8: Transpose the filtered methylation data matrix and call it patient_meth_matrix



In [None]:
# Inspect patient_meth_matrix, the transposed matrix produced in Step 8 of the exercise cell above.
patient_meth_matrix

We can finish our network following the previous steps using the functions we have created.

In [None]:
# Pairwise Pearson correlations between the columns of patient_meth_matrix,
# stored under the 'pearson' key so other methods can be added alongside.
p_meth_correlation_matrices = {
    'pearson': patient_meth_matrix.corr(method='pearson'),
}

# Turn the correlation matrix into a graph using a 0.8 threshold.
p_meth_pearson_graph = create_graph_from_correlation(
    p_meth_correlation_matrices['pearson'],
    threshold=0.8,
)

# Prune the graph (degree_threshold=1, keep_largest_component=True).
p_meth_pearson_graph_pruned = clean_graph(
    p_meth_pearson_graph,
    degree_threshold=1,
    keep_largest_component=True,
)

visualise_graph(
    p_meth_pearson_graph_pruned,
    title='Pearson Correlation Network (Threshold = 0.8)',
)

In [None]:
# Sparsify the graph with knn_sparsification (k=10); another sparsification method could be used instead.
p_meth_pearson_graph_pruned_knn = knn_sparsification(p_meth_pearson_graph_pruned, k=10)

In [None]:
# Visualise the k-NN sparsified methylation network. The title names the
# sparsification step (k=10) so this figure is not confused with the
# pruned-only network plotted earlier under the threshold title.
visualise_graph(p_meth_pearson_graph_pruned_knn,
                title='K-Nearest Neighbors (k=10) DNA Methylation Patient Correlation Network')