# Data Balancing
We balance the classes by oversampling the positive class: we repeatedly pick a random positive example, draw a random subset of its readings, and use the mean of that subset as a new synthetic positive example.


In [None]:
# Mount Google Drive (Colab only) and switch into the project folder so that
# the relative data paths used below resolve against it.
from google.colab import drive
drive.mount("/content/drive")
%cd /content/drive/MyDrive/p2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/p2


In [None]:
# Readings file; it is parsed further down as one JSON object per line.
JSON_DATA = "./dataset0.json"
# Label table, loaded with pandas.read_csv (gene_id, transcript_id, transcript_position, label).
LABELS = "./data.info"

In [None]:
import pandas as pd
# Read every column as a string so that transcript_position and label compare
# equal to the string values they are matched against later (JSON keys, '0'/'1').
data_info = pd.read_csv(LABELS, dtype = 'str')

In [None]:
# Peek at the first rows of the label table.
data_info.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0
3,ENSG00000004059,ENST00000000233,332,0
4,ENSG00000004059,ENST00000000233,368,0


In [None]:
import json
# Parse the readings file: every line is decoded as an independent JSON document.
with open(JSON_DATA) as handle:
    jsonlist = [json.loads(line) for line in handle]

In [None]:
from tqdm import tqdm

In [None]:
# Flatten the nested JSON records into four parallel columns.
# Each record is read as {transcript_id: {position: {k-mer: readings}}}, and only
# the first key found at each nesting level is used.
df_dict = {'transcript_id': [], 'transcript_position': [], '5-mers': [], 'readings': []}
for record in tqdm(jsonlist):
    t_id = list(record)[0]
    by_position = record[t_id]
    t_pos = list(by_position)[0]
    by_kmer = by_position[t_pos]
    kmer = list(by_kmer)[0]
    # list(...) makes a shallow copy of the reads container.
    reads = list(by_kmer[kmer])

    df_dict['transcript_id'].append(t_id)
    df_dict['transcript_position'].append(t_pos)
    df_dict['5-mers'].append(kmer)
    df_dict['readings'].append(reads)


100%|██████████| 121838/121838 [00:00<00:00, 138126.71it/s]


In [None]:
# One row per parsed JSON line, with the four columns collected above.
df  = pd.DataFrame(df_dict)

In [None]:
# Display the assembled frame.
df

Unnamed: 0,transcript_id,transcript_position,5-mers,readings
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0..."
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0...."
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0..."
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0...."
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0..."
...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,"[[0.0112, 2.96, 116.0, 0.0093, 3.24, 115.0, 0...."
121834,ENST00000641834,1429,CTGACAC,"[[0.00697, 4.25, 112.0, 0.00481, 8.67, 119.0, ..."
121835,ENST00000641834,1531,TGGACAC,"[[0.00996, 3.12, 112.0, 0.00432, 4.5, 115.0, 0..."
121836,ENST00000641834,1537,CTGACCA,"[[0.00396, 3.14, 108.0, 0.00747, 5.79, 125.0, ..."


Find the shortest list of readings

In [None]:
# Length of the shortest per-site list of reads. The per-site sample size chosen
# later for the synthetic positives must not exceed this value.
# The result is stored in `min_reads` so that the built-ins `min` and `id` stay
# usable in every cell that follows, and the column is addressed by name rather
# than by position.
min_reads = df['readings'].map(len).min()
print(min_reads)


20


In [None]:
# Attach gene_id and label to every site by joining on the (position, transcript) key pair.
# NOTE(review): merge() defaults to an inner join, so sites missing from the label
# table would be dropped silently — confirm the row count is unchanged.
full_data = df.merge(data_info, on = ['transcript_position','transcript_id'])

In [None]:
# Check that the readings now carry gene_id and label.
full_data.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000000233,244,AAGACCA,"[[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...",ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,"[[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....",ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,"[[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...",ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,"[[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0....",ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,"[[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0...",ENSG00000004059,0


In [None]:
# Column names of the merged frame, as a plain Python list.
columns = full_data.columns.tolist()
print(columns)

['transcript_id', 'transcript_position', '5-mers', 'readings', 'gene_id', 'label']


In [None]:
from sklearn.model_selection import train_test_split

# Shuffle all rows once with a fixed seed so the split is reproducible.
shuffled = full_data.sample(frac=1, random_state=4266)

# Perform a train-test split: the first `train_ratio` share of the shuffled
# rows becomes the training set, the remainder the test set.
train_ratio = 0.8  # Adjust this ratio as needed
train_size = int(len(shuffled) * train_ratio)

# Slice by position explicitly and copy both parts, so that later column
# assignments on `test_df` / `df` operate on independent frames rather than
# on slices of the shuffled frame.
# NOTE(review): the split is per row; rows of the same gene/transcript can land
# on both sides — confirm that is acceptable for the evaluation.
test_df = shuffled.iloc[train_size:].copy()
df = shuffled.iloc[:train_size].copy()

In [None]:
# Partition the training rows by class; labels are compared as the strings '1' and '0'.
is_positive = df['label'].eq('1')
is_negative = df['label'].eq('0')
positive_data = df.loc[is_positive]
negative_data = df.loc[is_negative]

# Report (rows, columns) of each class to show the imbalance.
for subset in (positive_data, negative_data):
    print(subset.shape)

(4415, 6)
(93055, 6)


## Generate new positive samples by randomly selecting a positive example and taking a random n-sized sample of its readings, which are later averaged into a single feature vector

In [None]:
import random

# Seed Python's RNG. It is the only source of randomness used in this cell, so
# re-running the cell reproduces exactly the same synthetic rows.
random.seed(4266)
## Control sample size using n.  n <= 20
# (20 is the shortest list of readings found above; random.sample raises
# ValueError if a site has fewer than n readings.)
n = 15

# Convert the positive rows to plain dicts once, so that every draw is a cheap
# list lookup instead of a one-row DataFrame.sample() call.
positive_records = positive_data.to_dict('records')

row_list = []
# create new positive data until we achieve a 1:1 ratio of positives to negatives
while len(row_list) < len(negative_data):
    # pick a random positive example
    source = random.choice(positive_records)
    # keep its identifying fields, but only a random n-sized subset of its readings
    row_list.append({
        'transcript_id': source['transcript_id'],
        'transcript_position': source['transcript_position'],
        '5-mers': source['5-mers'],
        'readings': random.sample(source['readings'], n),
        'gene_id': source['gene_id'],
        'label': source['label'],
    })

new_positive_data = pd.DataFrame(row_list)
print(new_positive_data.shape)
new_positive_data.head()




(93055, 6)


Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000393812,619,TGAACCG,"[[0.00823, 7.21, 117.0, 0.00523, 3.37, 103.0, ...",ENSG00000157593,1
1,ENST00000341116,1210,AGGACAC,"[[0.0111, 2.87, 115.0, 0.00863, 3.49, 116.0, 0...",ENSG00000128309,1
2,ENST00000322157,3101,CAAACCA,"[[0.00465, 2.0, 103.0, 0.00996, 4.25, 104.0, 0...",ENSG00000141279,1
3,ENST00000371696,924,ATGACCT,"[[0.00974, 4.73, 99.6, 0.00606, 6.22, 124.0, 0...",ENSG00000169692,1
4,ENST00000343820,1438,AGGACAC,"[[0.00266, 7.56, 120.0, 0.0219, 5.73, 109.0, 0...",ENSG00000177426,1


Combine the data, and calculate the mean of every feature across the readings of each row.

In [None]:
# Build the balanced training frame from the synthetic positives plus the real negatives.
# The original positive rows are not concatenated themselves; they are
# represented only through the resampled rows.
df = pd.concat([new_positive_data, negative_data], ignore_index=True)
print(df.shape)
df.head()

(186110, 6)


Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000393812,619,TGAACCG,"[[0.00823, 7.21, 117.0, 0.00523, 3.37, 103.0, ...",ENSG00000157593,1
1,ENST00000341116,1210,AGGACAC,"[[0.0111, 2.87, 115.0, 0.00863, 3.49, 116.0, 0...",ENSG00000128309,1
2,ENST00000322157,3101,CAAACCA,"[[0.00465, 2.0, 103.0, 0.00996, 4.25, 104.0, 0...",ENSG00000141279,1
3,ENST00000371696,924,ATGACCT,"[[0.00974, 4.73, 99.6, 0.00606, 6.22, 124.0, 0...",ENSG00000169692,1
4,ENST00000343820,1438,AGGACAC,"[[0.00266, 7.56, 120.0, 0.0219, 5.73, 109.0, 0...",ENSG00000177426,1


In [None]:
import numpy as np

def calculate_mean(row):
    """Average a collection of reads feature by feature.

    Parameters
    ----------
    row : sequence of numeric sequences
        The reads to combine, one inner sequence per read. The input is
        converted with ``np.array``, so it is expected to be rectangular.

    Returns
    -------
    list
        The mean taken along the first axis, converted to plain Python values.
    """
    stacked_reads = np.array(row)
    per_feature_mean = stacked_reads.mean(axis=0)
    return per_feature_mean.tolist()


In [None]:
# Replace each row's list of reads with a single per-feature mean vector.
# The column is overwritten, so re-running this cell would average the already
# averaged vectors; run it once per kernel session.
df['readings'] = df['readings'].apply(calculate_mean)

In [None]:
# 'readings' should now hold one flat vector per row.
df.head()


Unnamed: 0,transcript_id,transcript_position,5-mers,readings,gene_id,label
0,ENST00000393812,619,TGAACCG,"[0.006208666666666668, 7.566666666666666, 118....",ENSG00000157593,1
1,ENST00000341116,1210,AGGACAC,"[0.013931333333333334, 5.706666666666668, 115....",ENSG00000128309,1
2,ENST00000322157,3101,CAAACCA,"[0.009080666666666664, 2.058, 101.92, 0.008111...",ENSG00000141279,1
3,ENST00000371696,924,ATGACCT,"[0.008301333333333334, 3.338, 96.9066666666666...",ENSG00000169692,1
4,ENST00000343820,1438,AGGACAC,"[0.007909999999999999, 6.236000000000001, 115....",ENSG00000177426,1


In [None]:
# Split the "readings" column into separate columns.
# The frame is built from the list of vectors in a single call (keeping the
# original row index) instead of constructing one Series per row.
split_readings = pd.DataFrame(df['readings'].tolist(), index=df.index)
split_readings.columns = [f'value_{i}' for i in range(9)]

# Swap the packed "readings" column for the expanded value_* columns.
df = pd.concat([df.drop(columns='readings'), split_readings], axis=1)
df.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,gene_id,label,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8
0,ENST00000393812,619,TGAACCG,ENSG00000157593,1,0.006209,7.566667,118.466667,0.006304,3.51,97.106667,0.009476,1.914,84.44
1,ENST00000341116,1210,AGGACAC,ENSG00000128309,1,0.013931,5.706667,115.533333,0.009088,4.492,116.466667,0.006839,3.361333,82.406667
2,ENST00000322157,3101,CAAACCA,ENSG00000141279,1,0.009081,2.058,101.92,0.008111,3.222,100.02,0.00611,2.014,83.366667
3,ENST00000371696,924,ATGACCT,ENSG00000169692,1,0.008301,3.338,96.906667,0.007269,5.936667,119.4,0.008025,3.619333,81.493333
4,ENST00000343820,1438,AGGACAC,ENSG00000177426,1,0.00791,6.236,115.266667,0.010421,5.457333,113.266667,0.005696,2.811333,82.393333


In [None]:
# Map the positional value_0..value_8 columns to descriptive names.
# The nine values come in three groups of (dwell_time, sd, mean), one group for
# each of the position offsets -1, 0 and 1.
column_mapping = {
    f'value_{3 * group + slot}': f'{stat}_{offset}'
    for group, offset in enumerate((-1, 0, 1))
    for slot, stat in enumerate(('dwell_time', 'sd', 'mean'))
}

In [None]:
# Give the nine value_* columns their descriptive names (dwell_time / sd / mean per offset).
df = df.rename(columns=column_mapping)
df.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,gene_id,label,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1
0,ENST00000393812,619,TGAACCG,ENSG00000157593,1,0.006209,7.566667,118.466667,0.006304,3.51,97.106667,0.009476,1.914,84.44
1,ENST00000341116,1210,AGGACAC,ENSG00000128309,1,0.013931,5.706667,115.533333,0.009088,4.492,116.466667,0.006839,3.361333,82.406667
2,ENST00000322157,3101,CAAACCA,ENSG00000141279,1,0.009081,2.058,101.92,0.008111,3.222,100.02,0.00611,2.014,83.366667
3,ENST00000371696,924,ATGACCT,ENSG00000169692,1,0.008301,3.338,96.906667,0.007269,5.936667,119.4,0.008025,3.619333,81.493333
4,ENST00000343820,1438,AGGACAC,ENSG00000177426,1,0.00791,6.236,115.266667,0.010421,5.457333,113.266667,0.005696,2.811333,82.393333


In [None]:
# (rows, columns) after expanding the readings into named feature columns.
df.shape

(186110, 14)

In [None]:
# Nucleotide alphabet; one indicator column is created per (position, base) pair.
gene_types = ['A', 'C', 'T', 'G']

# The '5-mers' strings are indexed at positions 0-6, giving 7 x 4 = 28 columns
# named '5-mer-<position>_<base>' that hold 1 where the base matches, else 0.
kmer_strings = df['5-mers']
for position in range(7):
    base_at_position = kmer_strings.str[position]
    for base in gene_types:
        df[f'5-mer-{position}_{base}'] = base_at_position.eq(base).astype(int)

# The raw k-mer string is no longer needed once it has been encoded.
df = df.drop(columns='5-mers')

# Inspect the encoded frame.
df.head(10)

Unnamed: 0,transcript_id,transcript_position,gene_id,label,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,...,5-mer-4_T,5-mer-4_G,5-mer-5_A,5-mer-5_C,5-mer-5_T,5-mer-5_G,5-mer-6_A,5-mer-6_C,5-mer-6_T,5-mer-6_G
0,ENST00000393812,619,ENSG00000157593,1,0.006209,7.566667,118.466667,0.006304,3.51,97.106667,...,0,0,0,1,0,0,0,0,0,1
1,ENST00000341116,1210,ENSG00000128309,1,0.013931,5.706667,115.533333,0.009088,4.492,116.466667,...,0,0,1,0,0,0,0,1,0,0
2,ENST00000322157,3101,ENSG00000141279,1,0.009081,2.058,101.92,0.008111,3.222,100.02,...,0,0,0,1,0,0,1,0,0,0
3,ENST00000371696,924,ENSG00000169692,1,0.008301,3.338,96.906667,0.007269,5.936667,119.4,...,0,0,0,1,0,0,0,0,1,0
4,ENST00000343820,1438,ENSG00000177426,1,0.00791,6.236,115.266667,0.010421,5.457333,113.266667,...,0,0,1,0,0,0,0,1,0,0
5,ENST00000267750,786,ENSG00000128463,1,0.008975,3.305333,121.8,0.006742,3.712667,127.533333,...,0,0,0,0,1,0,0,1,0,0
6,ENST00000434494,1846,ENSG00000167004,1,0.011108,6.885333,117.2,0.007666,5.776667,122.066667,...,0,0,0,0,1,0,1,0,0,0
7,ENST00000337195,2217,ENSG00000175029,1,0.008491,2.64,109.133333,0.006451,3.496667,102.98,...,0,0,0,0,1,0,0,0,0,1
8,ENST00000253237,1326,ENSG00000105447,1,0.014075,6.168667,115.333333,0.010188,7.364,114.866667,...,0,0,1,0,0,0,0,0,0,1
9,ENST00000380172,1200,ENSG00000099810,1,0.008245,3.901333,122.6,0.010977,6.782667,123.933333,...,0,0,1,0,0,0,0,0,1,0


# Final column selection, test-set processing, and CSV export.


In [None]:
# One-hot columns to remove: every base at positions 3 and 4, plus four
# individual position/base combinations.
# NOTE(review): presumably these columns carry no information (every k-mer shown
# in this notebook's outputs has 'AC' at positions 3-4) — confirm on the full data.
columns_to_drop = [f'5-mer-{position}_{base}' for position in (3, 4) for base in 'ACTG']
columns_to_drop += ['5-mer-1_C', '5-mer-2_C', '5-mer-2_T', '5-mer-5_G']

# Drop the specified columns from the training frame (drop returns a new frame, which replaces df).
df = df.drop(columns=columns_to_drop)

In [None]:
# (rows, columns) of the final training frame.
df.shape

(186110, 29)

In [None]:
# Collapse each test row's reads into one per-feature mean vector, using the
# same calculate_mean transform that was applied to the training frame.
test_df['readings'] = test_df['readings'].apply(calculate_mean)

In [None]:
# Split the "readings" column into separate columns.
# The frame is built from the list of vectors in a single call (keeping the
# shuffled row index of test_df) instead of constructing one Series per row.
split_readings = pd.DataFrame(test_df['readings'].tolist(), index=test_df.index)
split_readings.columns = [f'value_{i}' for i in range(9)]

# Swap the packed "readings" column for the expanded value_* columns.
test_df = pd.concat([test_df.drop(columns='readings'), split_readings], axis=1)
test_df.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,gene_id,label,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8
94043,ENST00000429711,148,AGAACCA,ENSG00000144713,0,0.010216,6.895889,132.951417,0.006676,4.23749,95.638057,0.006001,1.907982,85.187247
105374,ENST00000523944,2085,TGGACCC,ENSG00000104738,0,0.0072,3.947843,118.058824,0.007439,6.113137,120.607843,0.004663,3.310196,80.356863
113340,ENST00000582730,1946,GTGACTG,ENSG00000136450,0,0.008813,4.434474,102.891447,0.00755,6.304868,124.184211,0.006557,3.109474,92.607895
38268,ENST00000306125,1111,AGAACTA,ENSG00000172053,0,0.008166,7.344167,131.2,0.010954,4.223333,104.3,0.00647,2.463667,92.85
76233,ENST00000376759,109,TTAACAC,ENSG00000102317,0,0.009987,1.540525,90.176271,0.009452,1.982814,93.570339,0.00792,2.734407,89.137288


In [None]:
# Apply the same value_* -> descriptive-name mapping that was used for the training frame.
test_df = test_df.rename(columns=column_mapping)
test_df.head()

Unnamed: 0,transcript_id,transcript_position,5-mers,gene_id,label,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_1,sd_1,mean_1
94043,ENST00000429711,148,AGAACCA,ENSG00000144713,0,0.010216,6.895889,132.951417,0.006676,4.23749,95.638057,0.006001,1.907982,85.187247
105374,ENST00000523944,2085,TGGACCC,ENSG00000104738,0,0.0072,3.947843,118.058824,0.007439,6.113137,120.607843,0.004663,3.310196,80.356863
113340,ENST00000582730,1946,GTGACTG,ENSG00000136450,0,0.008813,4.434474,102.891447,0.00755,6.304868,124.184211,0.006557,3.109474,92.607895
38268,ENST00000306125,1111,AGAACTA,ENSG00000172053,0,0.008166,7.344167,131.2,0.010954,4.223333,104.3,0.00647,2.463667,92.85
76233,ENST00000376759,109,TTAACAC,ENSG00000102317,0,0.009987,1.540525,90.176271,0.009452,1.982814,93.570339,0.00792,2.734407,89.137288


In [None]:
# Nucleotide alphabet; one indicator column is created per (position, base) pair.
gene_types = ['A', 'C', 'T', 'G']

# The '5-mers' strings are indexed at positions 0-6, giving 7 x 4 = 28 columns
# named '5-mer-<position>_<base>' that hold 1 where the base matches, else 0.
kmer_strings = test_df['5-mers']
for position in range(7):
    base_at_position = kmer_strings.str[position]
    for base in gene_types:
        test_df[f'5-mer-{position}_{base}'] = base_at_position.eq(base).astype(int)

# The raw k-mer string is no longer needed once it has been encoded.
test_df = test_df.drop(columns='5-mers')

# Inspect the encoded test frame.
test_df.head(10)

Unnamed: 0,transcript_id,transcript_position,gene_id,label,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,...,5-mer-4_T,5-mer-4_G,5-mer-5_A,5-mer-5_C,5-mer-5_T,5-mer-5_G,5-mer-6_A,5-mer-6_C,5-mer-6_T,5-mer-6_G
94043,ENST00000429711,148,ENSG00000144713,0,0.010216,6.895889,132.951417,0.006676,4.23749,95.638057,...,0,0,0,1,0,0,1,0,0,0
105374,ENST00000523944,2085,ENSG00000104738,0,0.0072,3.947843,118.058824,0.007439,6.113137,120.607843,...,0,0,0,1,0,0,0,1,0,0
113340,ENST00000582730,1946,ENSG00000136450,0,0.008813,4.434474,102.891447,0.00755,6.304868,124.184211,...,0,0,0,0,1,0,0,0,0,1
38268,ENST00000306125,1111,ENSG00000172053,0,0.008166,7.344167,131.2,0.010954,4.223333,104.3,...,0,0,0,0,1,0,1,0,0,0
76233,ENST00000376759,109,ENSG00000102317,0,0.009987,1.540525,90.176271,0.009452,1.982814,93.570339,...,0,0,1,0,0,0,0,1,0,0
103089,ENST00000504520,990,ENSG00000251022,0,0.010423,7.455455,129.181818,0.012257,3.543182,96.418182,...,0,0,1,0,0,0,1,0,0,0
7260,ENST00000228825,644,ENSG00000111229,0,0.006367,6.637937,120.405263,0.008812,6.085263,125.984211,...,0,0,1,0,0,0,0,0,0,1
65410,ENST00000367549,2261,ENSG00000135829,0,0.008103,3.1276,105.839,0.00676,3.5907,99.836,...,0,0,1,0,0,0,1,0,0,0
9990,ENST00000244496,936,ENSG00000124541,1,0.008502,3.1074,116.42,0.007052,5.4832,116.3,...,0,0,0,0,1,0,1,0,0,0
76480,ENST00000377191,429,ENSG00000088930,0,0.00595,9.950811,115.486486,0.005534,4.017838,94.216216,...,0,0,0,1,0,0,1,0,0,0


In [None]:
# Remove the same one-hot columns as for the training frame so both files share one schema.
test_df = test_df.drop(columns=columns_to_drop)

In [None]:
# (rows, columns) of the final test frame.
test_df.shape

(24368, 29)

In [None]:
# Export the training and testing DataFrames to CSV files
# (relative paths, so both files are written to the current working directory).
# The training file holds the synthetic positives plus the real negatives;
# the test file holds the held-out rows of the original data.
df.to_csv('train_synth_data.csv', index=False)
test_df.to_csv('test_synth_data.csv', index=False)
