In [1]:
import numpy as np
import pandas as pd
import csv

In [2]:
# Load the train and test splits of the input and output tables, stack each
# pair row-wise, then place the output columns next to the input columns.
data = pd.concat(
    [
        pd.read_csv(path, index_col=0)
        for path in ('new_data/input_train.csv', 'new_data/input_test.csv')
    ]
)
out_data = pd.concat(
    [
        pd.read_csv(path, index_col=0, header=0)
        for path in ('new_data/output_train-1.csv', 'new_data/output_test-1.csv')
    ]
)
data = pd.concat([data, out_data], axis='columns')

In [3]:
# Read the gene labels: one label per line, surrounding whitespace removed.
with open('gene labels/input_genes.txt') as f:
    genes = list(map(str.strip, f))

with open('gene labels/output_genes-1.txt') as f:
    out_genes = list(map(str.strip, f))

In [6]:
# Calculate Pearson's correlation.
# One matrix is built per output gene; it covers every input gene plus that
# output gene, with the diagonal (self-correlation) forced to zero.
corr = dict()
for gene in out_genes:
    pairwise = data[genes + [gene]].corr()

    # Zero out self-correlations on a private, writable copy of the matrix.
    # Writing through `DataFrame.values` only takes effect when pandas hands
    # back a writable view; under Copy-on-Write that array is read-only and
    # `np.fill_diagonal` raises instead of updating the frame.
    matrix = pairwise.to_numpy(copy=True)
    np.fill_diagonal(matrix, 0)
    corr[gene] = pd.DataFrame(matrix, index=pairwise.index, columns=pairwise.columns)

In [7]:
# For each output gene, take every column of its correlation matrix and keep
# the 50 largest entries; keep='all' retains every value tied at the cutoff,
# so a series may hold more than 50 entries.
network = {
    out: {
        gene: corr[out][gene].nlargest(50, keep='all')
        for gene in genes + [out]
    }
    for out in out_genes
}

In [8]:
# Replace each top-correlation series with the list of partner labels whose
# correlation is strictly greater than 0.0.
# NOTE: this overwrites `network` in place (series -> lists), so the cell
# cannot be re-run on its own without rebuilding `network` first.
for out in out_genes:
    positive_partners = dict()
    for gene, top_corr in network[out].items():
        is_positive = top_corr > 0.0
        positive_partners[gene] = list(top_corr.index[is_positive])
    network[out] = positive_partners

In [10]:
# Keep an interaction only when it is reciprocal: `partner` stays in the list
# for `gene` only if `gene` also appears in the list for `partner`.
filtered = {
    out: {
        gene: [partner for partner in partners if gene in network[out][partner]]
        for gene, partners in network[out].items()
    }
    for out in out_genes
}

In [12]:
# Flatten each adjacency mapping into a list of (gene, partner) tuples,
# dropping any pair that links a gene to itself.
edges = {
    out: [
        (gene, partner)
        for gene, partners in filtered[out].items()
        for partner in partners
        if gene != partner
    ]
    for out in out_genes
}

In [16]:
# Write one edge-list CSV per output gene, named '<gene>_network.csv'.
for out in out_genes:
    with open('{}_network.csv'.format(out), 'w', newline='') as f:
        writer = csv.writer(f)
        # Write only this gene's (gene, partner) pairs. Passing the whole
        # `edges` dict would iterate its keys and emit each key string split
        # into single characters, identically in every file.
        writer.writerows(edges[out])