# Data Balancing
We balance the classes by oversampling the positive class: we repeatedly pick a random positive example, draw a random subset of its readings, and use the mean of that subset as a new synthetic positive example.


In [2]:
# Input locations, relative to the notebook's working directory.
JSON_DATA = "../data/dataset0.json"  # parsed line by line with json.loads in the loading cell
LABELS = "../data/data.info"  # loaded with pd.read_csv into data_info

In [3]:
import pandas as pd
# Read every column as text: later cells compare `label` against the strings
# '1'/'0' and merge on `transcript_position`, whose JSON-side values are dict keys (strings).
data_info = pd.read_csv(LABELS, dtype = 'str')

In [4]:
# Preview the first rows of the label table.
data_info.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0
3,ENSG00000004059,ENST00000000233,332,0
4,ENSG00000004059,ENST00000000233,368,0


In [5]:
import json

# The file stores one JSON document per line, so each line is parsed on its own
# and the parsed objects are collected in file order.
with open(JSON_DATA) as handle:
    jsonlist = [json.loads(line) for line in handle]

In [6]:
from tqdm import tqdm

In [7]:
# Column-oriented accumulator: one list per output column, filled in lockstep.
df_dict = {'transcript_id' : [], 'transcript_position' : [], '5-mers': [],'readings':[]}
for record in tqdm(jsonlist):
    # Each record is nested as {transcript_id: {position: {sequence: readings}}};
    # only the first entry at every level is read.
    transcript_id, by_position = list(record.items())[0]
    transcript_position, by_sequence = list(by_position.items())[0]
    five_mer, raw_readings = list(by_sequence.items())[0]
    readings = list(raw_readings)

    df_dict['transcript_id'].append(transcript_id)
    df_dict['transcript_position'].append(transcript_position)
    df_dict['5-mers'].append(five_mer)
    df_dict['readings'].append(readings)


100%|██████████| 121838/121838 [00:00<00:00, 238448.26it/s]


In [8]:
# Materialise the accumulated column lists as a DataFrame (one row per parsed JSON record).
df  = pd.DataFrame(df_dict)

In [9]:
# Display the parsed frame; the notebook renders a truncated view of it.
df

Unnamed: 0,transcript_id,transcript_position,5-mers,readings
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0..."
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0...."
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0..."
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0...."
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0..."
...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,"[[0.0112, 2.96, 116.0, 0.0093, 3.24, 115.0, 0...."
121834,ENST00000641834,1429,CTGACAC,"[[0.00697, 4.25, 112.0, 0.00481, 8.67, 119.0, ..."
121835,ENST00000641834,1531,TGGACAC,"[[0.00996, 3.12, 112.0, 0.00432, 4.5, 115.0, 0..."
121836,ENST00000641834,1537,CTGACCA,"[[0.00396, 3.14, 108.0, 0.00747, 5.79, 125.0, ..."


Find the shortest list of readings

In [10]:
# Smallest number of readings recorded for any site. This bounds the sample
# size `n` used further down, where readings are drawn without replacement.
#
# The result is stored under a descriptive name so the built-ins `min` and `id`
# are not rebound for the rest of the kernel session, and the column is
# addressed by label instead of by position (positional `row[3]` on a
# label-indexed Series is deprecated in recent pandas).
min_reads = df['readings'].map(len).min()
print(min_reads)


20


In [11]:
# Attach gene_id and label to every parsed site. The key columns are strings on
# both sides (JSON dict keys / dtype='str' CSV). This is an inner join, so rows
# without a partner in the other table are dropped.
full_data = pd.merge(
    df,
    data_info,
    how='inner',
    on=['transcript_position', 'transcript_id'],
)

In [12]:
# Preview the merged frame: parsed readings plus gene_id and label from the label table.
full_data.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0....",ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0...",ENSG00000004059,0


In [13]:
# Keep the merged frame's column names as a plain Python list and show them.
columns = full_data.columns.tolist()
print(columns)

['transcript_id', 'transcript_position', '5-mers', 'readings', 'gene_id', 'label']


In [14]:
# Labels were loaded as text, so the classes are selected by comparing against
# the strings '1' and '0' rather than integers.
label_col = full_data['label']
positive_data = full_data.loc[label_col == '1']
negative_data = full_data.loc[label_col == '0']

# Report the class sizes: positives first, then negatives.
for subset in (positive_data, negative_data):
    print(subset.shape)

(5475, 6)
(116363, 6)


## Generate new positive samples by randomly selecting a positive example and drawing a random n-sized sample of its readings (the sampled readings are averaged in a later step)

In [15]:
import random

# All randomness in this cell comes from the standard-library `random` module,
# so this single seed makes the synthetic set reproducible. (Using
# DataFrame.sample without random_state would draw from NumPy's global RNG,
# which random.seed() does not control.)
random.seed(4266)

## Control sample size using n.  n <= 20
n = 15

# Convert the positive rows to plain dicts once; picking from a list is far
# cheaper than sampling a one-row DataFrame on every iteration. Each dict keeps
# the frame's column order.
positive_records = positive_data.to_dict('records')
target_size = len(negative_data)
if target_size and not positive_records:
    raise ValueError("cannot oversample: there are no positive examples to draw from")

row_list = []
# create new positive data until we achieve a 1:1 ratio of positives to negatives
for _ in range(target_size):
    # pick a random positive site ...
    source = random.choice(positive_records)
    # ... and copy it, replacing its readings with n of them drawn without replacement
    row_dict = dict(source)
    row_dict['readings'] = random.sample(source['readings'], n)
    row_list.append(row_dict)

new_positive_data = pd.DataFrame(row_list)
print(new_positive_data.shape)
new_positive_data.head()




(116363, 6)


Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000542575,800,CTGACAG,"[[0.00402, 2.19, 104.0, 0.00465, 3.73, 113.0, ...",ENSG00000105281,1
1,ENST00000464015,1210,TGGACTC,"[[0.0169, 7.85, 122.0, 0.00716, 8.31, 124.0, 0...",ENSG00000161179,1
2,ENST00000005286,3178,AGGACCC,"[[0.00498, 3.46, 118.0, 0.0047, 6.49, 121.0, 0...",ENSG00000006118,1
3,ENST00000331483,722,CGGACTC,"[[0.00432, 1.99, 128.0, 0.00598, 8.34, 123.0, ...",ENSG00000185624,1
4,ENST00000453527,1476,CTGACTG,"[[0.0137, 3.89, 111.0, 0.00486, 4.4, 124.0, 0....",ENSG00000164733,1


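Combine the data and, for each row, average the readings feature by feature. This is an unweighted mean; the `sd_*` columns produced below are averages of the per-read sd values, not a newly computed standard deviation.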

In [45]:
# Stack the synthetic positives on top of the untouched negatives and renumber
# the rows from 0. `df` is rebound here: from this cell on it refers to the
# balanced frame, not the frame parsed from the JSON file.
df = pd.concat(
    objs=[new_positive_data, negative_data],
    axis=0,
    ignore_index=True,
)
print(df.shape)
df.head()

(232726, 6)


Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000542575,800,CTGACAG,"[[0.00402, 2.19, 104.0, 0.00465, 3.73, 113.0, ...",ENSG00000105281,1
1,ENST00000464015,1210,TGGACTC,"[[0.0169, 7.85, 122.0, 0.00716, 8.31, 124.0, 0...",ENSG00000161179,1
2,ENST00000005286,3178,AGGACCC,"[[0.00498, 3.46, 118.0, 0.0047, 6.49, 121.0, 0...",ENSG00000006118,1
3,ENST00000331483,722,CGGACTC,"[[0.00432, 1.99, 128.0, 0.00598, 8.34, 123.0, ...",ENSG00000185624,1
4,ENST00000453527,1476,CTGACTG,"[[0.0137, 3.89, 111.0, 0.00486, 4.4, 124.0, 0....",ENSG00000164733,1


In [46]:
import numpy as np

def calculate_mean(row):
    """Average a collection of reading vectors position by position.

    Parameters
    ----------
    row : sequence of equally long numeric sequences
        The readings of one site, one inner sequence per read.

    Returns
    -------
    list
        The unweighted mean of each position across all reads, as plain
        Python floats.
    """
    stacked = np.array(row)
    # axis=0 collapses the reads and keeps one mean per feature position
    per_feature_mean = stacked.mean(axis=0)
    return per_feature_mean.tolist()


In [47]:
# Collapse each site's list of reads into one vector of per-feature means.
# The column is overwritten, so this cell must not be run twice on the same
# frame; rerun from the concat cell instead.
df['readings'] = df['readings'].map(calculate_mean)

In [48]:
# Preview: each 'readings' cell now holds a flat list of per-feature means.
df.head()


Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000542575,800,CTGACAG,"[0.007847333333333333, 3.5353333333333334, 107...",ENSG00000105281,1
1,ENST00000464015,1210,TGGACTC,"[0.008038666666666668, 3.2640000000000007, 119...",ENSG00000161179,1
2,ENST00000005286,3178,AGGACCC,"[0.010369999999999999, 5.838000000000002, 114....",ENSG00000006118,1
3,ENST00000331483,722,CGGACTC,"[0.0055966666666666665, 2.6166666666666667, 12...",ENSG00000185624,1
4,ENST00000453527,1476,CTGACTG,"[0.005635333333333332, 3.1926666666666668, 111...",ENSG00000164733,1


In [50]:
# Split the "readings" column into separate columns.
# Building one DataFrame from the list of vectors is much faster than
# `apply(lambda x: pd.Series(x))`, which constructs a Series object per row.
# The original index is passed through so the concat below aligns row for row.
split_readings = pd.DataFrame(df['readings'].tolist(), index=df.index)
# Nine values per site: three features for each of three positions (see the
# column_mapping cell). Assigning exactly nine names makes pandas raise if a
# vector has a different width, instead of mislabelling columns silently.
split_readings.columns = [f'value_{i}' for i in range(9)]

# Concatenate the split columns with the original DataFrame, dropping the
# now-redundant list column. No in-place mutation: `df` is rebound to a new frame.
df = pd.concat([df.drop(columns='readings'), split_readings], axis=1)
df.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,gene_id,label,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8
0,ENST00000542575,800,CTGACAG,ENSG00000105281,1,0.007847,3.535333,107.4,0.00609,6.302667,119.133333,0.00583,2.271333,84.333333
1,ENST00000464015,1210,TGGACTC,ENSG00000161179,1,0.008039,3.264,119.0,0.007065,5.5,120.2,0.006717,2.082667,89.813333
2,ENST00000005286,3178,AGGACCC,ENSG00000006118,1,0.01037,5.838,114.666667,0.012367,7.316667,116.466667,0.010341,3.740667,78.626667
3,ENST00000331483,722,CGGACTC,ENSG00000185624,1,0.005597,2.616667,121.466667,0.008681,6.741333,122.866667,0.00579,1.986867,88.086667
4,ENST00000453527,1476,CTGACTG,ENSG00000164733,1,0.005635,3.192667,111.333333,0.006803,7.152667,121.4,0.011558,3.702667,88.406667


In [49]:
# Map the generic value_<k> columns to descriptive names. The nine values come
# in three consecutive groups (flanking position -1, centre 0, flanking +1),
# and each group is ordered dwell time, sd, mean.
column_mapping = {
    f'value_{3 * group + slot}': f'{feature}_{offset}'
    for group, offset in enumerate((-1, 0, 1))
    for slot, feature in enumerate(('dwell_time', 'sd', 'mean'))
}

In [52]:
# Apply the descriptive feature names; names absent from the mapping are left as they are.
df = df.rename(column_mapping, axis='columns')
df.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,gene_id,label,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1
0,ENST00000542575,800,CTGACAG,ENSG00000105281,1,0.007847,3.535333,107.4,0.00609,6.302667,119.133333,0.00583,2.271333,84.333333
1,ENST00000464015,1210,TGGACTC,ENSG00000161179,1,0.008039,3.264,119.0,0.007065,5.5,120.2,0.006717,2.082667,89.813333
2,ENST00000005286,3178,AGGACCC,ENSG00000006118,1,0.01037,5.838,114.666667,0.012367,7.316667,116.466667,0.010341,3.740667,78.626667
3,ENST00000331483,722,CGGACTC,ENSG00000185624,1,0.005597,2.616667,121.466667,0.008681,6.741333,122.866667,0.00579,1.986867,88.086667
4,ENST00000453527,1476,CTGACTG,ENSG00000164733,1,0.005635,3.192667,111.333333,0.006803,7.152667,121.4,0.011558,3.702667,88.406667


In [54]:
# Sanity check: expect 14 columns (five identifier/label columns plus nine features).
df.shape

(232726, 14)

# Train-test split and export csv.


In [55]:
from sklearn.model_selection import train_test_split

# shuffle data
df = df.sample(frac=1, random_state=4266)

# Perform a train-test split BY GENE rather than by row.
# The positive class consists of synthetic rows resampled from a small set of
# original sites, so a row-level split would place near-duplicates of the same
# site in both files and inflate the test score. Assigning whole genes to one
# side keeps every site, and all synthetic rows derived from it, together.
train_ratio = 0.8  # fraction of genes used for training; adjust as needed
gene_ids = df['gene_id'].drop_duplicates()
train_genes, test_genes = train_test_split(
    gene_ids, train_size=train_ratio, random_state=4266
)

# Boolean masks keep the shuffled row order within each part. The row ratio is
# only approximately train_ratio, because genes contribute different row counts.
is_train = df['gene_id'].isin(train_genes)
train_df = df[is_train]
test_df = df[~is_train]

# Guard against leakage and against losing rows in the partition.
assert set(train_df['gene_id']).isdisjoint(test_df['gene_id'])
assert len(train_df) + len(test_df) == len(df)

# Export the training and testing DataFrames to CSV files
train_df.to_csv('train_synth_data.csv', index=False)
test_df.to_csv('test_synth_data.csv', index=False)
