In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model

In [None]:
# Load the trained Keras model from its HDF5 (.h5) file. The path is relative
# to the notebook's working directory; `model` is used later for model.predict.
model = load_model('model_NN_final.h5')

In [None]:
# Create the local download directory; -p also creates missing parents and
# makes the command a no-op when the directory already exists.
! mkdir -p data_parsing/data/json

In [None]:
# Mirror each sample's m6Anet processed-data folder from the sg-nex-data S3
# bucket into data_parsing/data/json/<sample name>. --no-sign-request sends
# unsigned requests, so no AWS credentials are needed. `sync` only copies files
# that are missing or changed locally, so re-running this cell is cheap.
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_A549_directRNA_replicate5_run1/ data_parsing/data/json/SGNex_A549_directRNA_replicate5_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_A549_directRNA_replicate6_run1/ data_parsing/data/json/SGNex_A549_directRNA_replicate6_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_Hct116_directRNA_replicate3_run1/ data_parsing/data/json/SGNex_Hct116_directRNA_replicate3_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_Hct116_directRNA_replicate3_run4/ data_parsing/data/json/SGNex_Hct116_directRNA_replicate3_run4
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_Hct116_directRNA_replicate4_run3/ data_parsing/data/json/SGNex_Hct116_directRNA_replicate4_run3
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_HepG2_directRNA_replicate5_run2/ data_parsing/data/json/SGNex_HepG2_directRNA_replicate5_run2
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_HepG2_directRNA_replicate6_run1/ data_parsing/data/json/SGNex_HepG2_directRNA_replicate6_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_K562_directRNA_replicate4_run1/ data_parsing/data/json/SGNex_K562_directRNA_replicate4_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_K562_directRNA_replicate5_run1/ data_parsing/data/json/SGNex_K562_directRNA_replicate5_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_K562_directRNA_replicate6_run1/ data_parsing/data/json/SGNex_K562_directRNA_replicate6_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_MCF7_directRNA_replicate3_run1/ data_parsing/data/json/SGNex_MCF7_directRNA_replicate3_run1
! aws s3 sync --no-sign-request s3://sg-nex-data/data/processed_data/m6Anet/SGNex_MCF7_directRNA_replicate4_run1/ data_parsing/data/json/SGNex_MCF7_directRNA_replicate4_run1

In [None]:
from collections import Counter

def count_acgt_letters(input_string):
    """Tally how often each of the letters A, C, G and T occurs.

    Args:
        input_string: Iterable of hashable symbols, typically a string of
            nucleotide letters.

    Returns:
        A dict with exactly the keys 'A', 'C', 'G', 'T' (in that order), each
        mapped to its number of occurrences. Letters that do not occur map to
        0; any other symbols in the input are ignored.
    """
    tally = Counter(input_string)
    # A Counter returns 0 for keys it has never seen, so no default is needed.
    return {base: tally[base] for base in 'ACGT'}


def count_data(data_list):
    """Flatten nested per-transcript records into one row per reading.

    Args:
        data_list: Iterable of dicts nested as
            {transcript_id: {position: {sequence: [reading, ...]}}}.
            Each reading must be a list, because its slice [3:6] is
            concatenated onto a list.

    Returns:
        pandas.DataFrame with the columns 'transcript_id', 'position' (cast to
        int), 'sequence' (the slice sequence[1:6]), the counts of 'A', 'C',
        'G' and 'T' within that slice, and elements 3..5 of the reading stored
        as '0 length', '0 sd' and '0 mean'.
    """
    column_names = ['transcript_id', 'position', 'sequence', 'A', 'C', 'G', 'T', '0 length', '0 sd', '0 mean']

    rows = []
    for record in data_list:
        for transcript_id, by_position in record.items():
            for position, by_sequence in by_position.items():
                for sequence, readings in by_sequence.items():
                    kmer = sequence[1:6]
                    counts = count_acgt_letters(kmer)
                    # The k-mer and its letter counts are shared by every
                    # reading at this site; only the trailing values differ.
                    rows.extend(
                        [transcript_id, int(position), kmer,
                         counts['A'], counts['C'], counts['G'], counts['T']] + reading[3:6]
                        for reading in readings
                    )

    return pd.DataFrame(rows, columns=column_names)

In [None]:
# One data.json per downloaded sample directory, listed in processing order.
file_paths = [
    'data_parsing/data/json/' + sample_dir + '/data.json'
    for sample_dir in (
        'SGNex_A549_directRNA_replicate5_run1',
        'SGNex_A549_directRNA_replicate6_run1',
        'SGNex_Hct116_directRNA_replicate3_run1',
        'SGNex_Hct116_directRNA_replicate3_run4',
        'SGNex_Hct116_directRNA_replicate4_run3',
        'SGNex_HepG2_directRNA_replicate5_run2',
        'SGNex_HepG2_directRNA_replicate6_run1',
        'SGNex_K562_directRNA_replicate4_run1',
        'SGNex_K562_directRNA_replicate5_run1',
        'SGNex_K562_directRNA_replicate6_run1',
        'SGNex_MCF7_directRNA_replicate3_run1',
        'SGNex_MCF7_directRNA_replicate4_run1',
    )
]

In [None]:
import gzip
import json
import os
              
              
# Score every sample: parse its data.json, build the model's feature matrix,
# predict one score per reading, and write the mean score per
# (transcript_id, position) to a CSV named after the sample.
#
# NOTE(review): extract_specific_sequence, standardize_column_mean and
# standardize_column_sd are not defined in the visible part of this notebook.
# They must already exist in the kernel when this cell runs — confirm they are
# defined or imported in an earlier cell so Restart & Run All works.
for file_path in file_paths:
    # The sample name is the directory that contains data.json.
    folder_name = os.path.basename(os.path.dirname(file_path))
    print(folder_name)

    # data.json is read as plain-text JSON Lines (one object per line; nothing
    # is decompressed here). The parsed objects are streamed straight into
    # count_data so the whole file is never held as a list of Python objects,
    # and blank lines are skipped instead of crashing json.loads.
    with open(file_path, 'rt', encoding='utf-8') as file:
        df = count_data(json.loads(line) for line in file if line.strip())

    if df.empty:
        # Fail with a clear message rather than an obscure error further down.
        raise ValueError(f'No readings parsed from {file_path}')

    # Mean +/- 1.96 sd band. Both columns are dropped again before prediction.
    df['0 min'] = df['0 mean'] - df['0 sd']*1.96
    df['0 max'] = df['0 mean'] + df['0 sd']*1.96

    data = pd.get_dummies(df, columns=['sequence'], prefix='sequence')

    # NOTE(review): extract_specific_sequence presumably decodes the one-hot
    # 'sequence_*' columns back into the 5-letter sequence — confirm. Its
    # result is indexed 0..4 below to get one letter per position.
    data['specific_sequence'] = data.apply(extract_specific_sequence, axis=1)

    position_cols = ['1st_pos', '2nd_pos', '3rd_pos', '4th_pos', '5th_pos']
    for offset, pos_col in enumerate(position_cols):
        # Bind the offset as a default argument so each lambda keeps its own.
        data[pos_col] = data['specific_sequence'].apply(lambda kmer, i=offset: kmer[i])

    # The one-hot sequence columns and the decoded sequence are no longer needed.
    seq_col = [col for col in data.columns.tolist() if col.startswith('sequence_')]
    data = data.drop(columns=seq_col + ['specific_sequence'])
    data = data.astype({pos_col: 'category' for pos_col in position_cols})

    # Transform the signal columns within each transcript.
    # NOTE(review): '0 length' reuses standardize_column_mean — confirm intended.
    grouped_data = data.groupby("transcript_id")
    data["0 mean standardized"] = grouped_data["0 mean"].transform(standardize_column_mean)
    data["0 sd standardized"] = grouped_data["0 sd"].transform(standardize_column_sd)
    data["0 length standardized"] = grouped_data["0 length"].transform(standardize_column_mean)

    final_predict_dataset = data.drop(columns=['transcript_id','position','0 length', '0 sd', '0 mean', '0 min', '0 max'])

    # One-hot encode the per-position letters.
    # NOTE(review): the dummy columns produced depend on which letters occur in
    # this file; if a letter is absent, the feature count will differ from what
    # the model expects — confirm against the training-time column layout.
    final_predict_dataset = pd.get_dummies(final_predict_dataset, columns=position_cols)

    # pandas >= 2.0 returns bool dummy columns; a frame mixing bool and numeric
    # columns converts to an object array, which the model cannot ingest.
    # Cast everything to a single float dtype before predicting.
    final_predict_dataset = final_predict_dataset.astype('float32')

    data_pred_prob = model.predict(final_predict_dataset)

    # Use the first output column as the per-reading score. Rows of `data` and
    # the predictions are in the same order, so assign positionally.
    scores = pd.DataFrame(data_pred_prob)[0]
    result_data = data[['transcript_id', 'position']].assign(score=scores.to_numpy())

    # Average the per-reading scores for each site.
    result_data = result_data.groupby(['transcript_id', 'position']).agg({'score': 'mean'}).reset_index()

    # NOTE(review): there is no separator between 'prediction_generation' and
    # the sample name — confirm whether a directory or an underscore was meant.
    result_data.to_csv('prediction_generation'+folder_name+'_Result.csv', index=False)

