In [2]:
"""
This file is used to prepare transcriptomic data for running in DynGENIE3 to build a Gene Regulatory Network

KEGG terms are used to get protein info / transcription factors
Transcriptomic read counts are used from Daphnia magna

1. Put timepoints in rows and genes in columns
2. Make timepoints numeric
3. Use KEGG file to match KEGG orthologs (transcription factors) with genes present in the transcriptomic data file
4. Extract gene order to maintain the correct order for alphas in the next file

"""

In [None]:
import numpy as np
import pandas as pd

In [7]:
genes = pd.read_csv('../../omics/rna_vst_proc.csv')
tfs = pd.read_csv('../../ortholog/protein_info_kegg_terms.csv')

  genes = pd.read_csv('../../omics/rna_vst_proc.csv')


In [8]:
#delete 2nd row (concatted treatment + timepoint info)
genes_dropped = genes.drop(1)

#separate diff conditions into 3 files, control, low, high
genes_list = genes_dropped['treatment'].tolist()

#function to make timepoints rows and genes columns
def format_GRN(df,string_name):

    #find condition information (control, low, high dose)
    control_columns = [col for col in genes_dropped.columns if col.startswith(string_name)]
    
    #separate conditions
    df = genes_dropped[control_columns]

    
    #add gene names
    df['time_points'] = genes_list
    df = df[['time_points'] + control_columns]

    #get list of timepoints
    time_points = list(df.iloc[0])
    timepoints = time_points[1:]
    #delete timepoints row
    df = df.iloc[1:]

    # Rename the columns to timepoints and gene names
    df.columns = ['time_points'] + timepoints

    #change to genes as column names
    df_transposed = df.T.reset_index()  # Reset the index to make 'time_points' a column

    # Set the new column names
    df_transposed.columns = df_transposed.iloc[0]

    # Drop the first row
    df_transposed = df_transposed.drop(df_transposed.index[0])
    df_transposed['time_points'] = timepoints

    #get mean for each timepoint experiment
    grouped_df = df_transposed.groupby('time_points').mean()
    #reset index and add timepoints as a column
    grouped_times = grouped_df.index.tolist()
    grouped_df.reset_index(drop=True, inplace=True)
    grouped_df.insert(0,'time_points',grouped_times)

    
    return grouped_df

In [9]:
control_df = format_GRN('control_df', 'control')
low_df = format_GRN('low_df', 'low')
high_df = format_GRN('high_df', 'high')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_points'] = genes_list
  grouped_df = df_transposed.groupby('time_points').mean()
  grouped_df.insert(0,'time_points',grouped_times)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_points'] = genes_list
  grouped_df = df_transposed.groupby('time_points').mean()
  grouped_df.insert(0,'time_points',grouped_times)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

In [10]:
#make time_points numeric
def make_numeric(df):
    times = df['time_points'].tolist()
    new_times = []
    for point in times:
        if point == '12H':
            new_times.append(12)
        elif point == '1H':
            new_times.append(1)
        elif point == '24H':
            new_times.append(24)
        elif point == '2H':
            new_times.append(2)
        elif point == '4D':
            new_times.append(4*24)
        elif point == '5D':
            new_times.append(5*24)
        elif point == '6D':
            new_times.append(6*24)
        elif point == '6H':
            new_times.append(6)
        elif point == '7D':
            new_times.append(7*24)
        
    df['time_points'] = new_times
    
    return df

In [11]:
control_df = make_numeric(control_df)
low_df = make_numeric(low_df)
high_df = make_numeric(high_df)

In [16]:
#get a file of transcription factors which exist in our dataset
genes_list = control_df.columns.tolist()
tfs_genes = tfs.loc[tfs['regulatory'].isin(genes_list)]
tfs_genes.to_csv('regulatory_genes.csv')

In [17]:
#save to CSV file
control_df.to_csv('control_df.txt',sep='\t', index=False)
low_df.to_csv('low_df.txt',sep='\t', index=False)
high_df.to_csv('high_df.txt',sep='\t', index=False)

low_df.head()

Unnamed: 0,time_points,Dapma7bEVm004153,Dapma7bEVm004154,Dapma7bEVm004155,Dapma7bEVm004156,Dapma7bEVm004157,Dapma7bEVm004158,Dapma7bEVm004159,Dapma7bEVm004160,Dapma7bEVm004161,...,Dapma7bEVm030739,Dapma7bEVm030740,Dapma7bEVm030744,Dapma7bEVm030748,Dapma7bEVm030749,Dapma7bEVm030753,Dapma7bEVm030754,Dapma7bEVm030755,Dapma7bEVm629765,Dapma7bEVm629948
0,12,7.352828,6.394134,6.776569,5.351553,9.928181,7.184691,6.473805,5.924651,5.790883,...,6.120184,4.629555,5.699719,5.862255,3.295922,4.994942,5.771966,5.693342,1.352579,4.201193
1,1,7.642886,6.686304,6.997765,5.518121,9.323471,7.63235,7.082823,4.902429,5.643739,...,5.632045,4.970264,5.965698,5.408692,3.551104,5.315286,5.551543,6.029283,3.327923,4.118649
2,24,7.335762,6.658602,7.0019,6.62642,8.435655,7.160253,5.647166,4.818133,5.289874,...,5.851358,4.670966,5.671279,5.459678,4.014373,3.84116,4.700124,5.990468,1.844929,3.239676
3,2,7.595695,6.476415,6.90254,4.623494,9.349699,6.509663,6.765066,6.486855,5.540381,...,4.645297,4.333281,5.045723,5.360963,3.749174,5.310375,5.690172,6.123093,3.083042,3.390129
4,96,7.271276,6.710377,6.805541,4.317107,7.991139,4.674173,7.135154,7.128517,6.125496,...,5.832441,5.510325,6.079204,5.865779,3.14205,5.49082,5.715251,5.692676,1.804685,4.363281


In [30]:
#get the order of genes to add to alphas file
gene_order_control = control_df.columns.tolist()
gene_order = gene_order_control[1:]
gene_order_df = pd.DataFrame(gene_order,columns = ['Genes'])
gene_order_df.to_csv('gene_order.csv')