Importing modules

In [99]:
import csv 
import intervaltree
from intervaltree import Interval, IntervalTree 
import pandas as pd
import copy
import numpy as np
import matplotlib.pyplot as plt

In [100]:
#!pip install intervaltree

The below code takes a genome then splits the genome into intervals called segments. The duplication features of each segment is pooled together to form the features of a segment, this would be called a segmental duplication. We first create intervals of length 1MB, find out which duplicated regiosn overlap with it and then pool all the features to form a dataset.

In [101]:
def find_smallest_region(genome):
    smallest=1000000000000000000
    for i in genome:
        for j in [-1,0,1]:
            x=np.abs(list(genome[i])[j].end-list(genome[i])[j+1].begin)
            if x<smallest:
                smallest=x
                print(i)
    return smallest

In [102]:
file_name="SD_network/NMF_Analysis/files/telocentro_hg38.bed"
file=open(file_name,"r")
data=file.readlines()

load the genome 

In [103]:
genome={}

for i in data:
    x=i.split("\t")
    if x[0] not in genome.keys():
        genome[x[0]]=IntervalTree()
        genome[x[0]][int(x[1]):int(x[2])]=(int(x[1]),int(x[2]))
    else:
        genome[x[0]][int(x[1]):int(x[2])]=(int(x[1]),int(x[2]))
        
for key in genome.keys():
    genome[key].merge_overlaps(strict=False)

del genome["chrX"]
del genome["chrY"]

In [104]:
find_smallest_region(genome)

chr1
chr2
chr3
chr4
chr5
chr8
chr10
chr12
chr13
chr14
chr18
chr21


10890000

In [105]:
genome

{'chr1': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(121700000, 125100000), Interval(248946422, 248956422, (248946422, 248956422))]),
 'chr2': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(91800000, 96000000), Interval(242183529, 242193529, (242183529, 242193529))]),
 'chr3': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(87800000, 94000000), Interval(198285559, 198295559, (198285559, 198295559))]),
 'chr4': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(48200000, 51800000), Interval(190204555, 190214555, (190204555, 190214555))]),
 'chr5': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(46100000, 51400000), Interval(181528259, 181538259, (181528259, 181538259))]),
 'chr6': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(58500000, 62600000), Interval(170795979, 170805979, (170795979, 170805979))]),
 'chr7': IntervalTree([Interval(0, 10000, (0, 10000)), Interval(58100000, 62100000), Interval(159335973, 159345973, (159335973, 159345973))]

In [106]:
def get_splits(chromosome_name):
    splits = IntervalTree()
    x = sorted(genome[chromosome_name])
    split_length = 10**6 # 1Mb, depends on the segmental duplication length

    for i in range(len(x) - 1):
        s = x[i].end
        e = x[i + 1].begin

        for i in range(s + 1, (s + e) // 2, split_length + 1):
            splits[i:i + split_length] = (i, i + split_length)

        for i in range(e - 1, (s + e) // 2 + (e - s) % (split_length + 1), -split_length - 1):
            splits[i - split_length:i] = (i - split_length, i)

    return splits

In [107]:
splits=get_splits("chr21")

In [108]:
len(splits)

45

In [109]:
len(splits)

45

duplicated regions dataset is preprocessed below

In [110]:
duplicated_regions=pd.read_csv("SD_network/NMF_Analysis/files/out_df_ws_jumps.csv")

In [111]:
duplicated_regions.columns

Index(['chr', 'coor_s', 'coor_e', 'ids', 'length', 'centro', 'telo', 'gaps',
       'genes', 'intra_frac', 'cpgisl_in', 'cpgisl_bor', 'ctcf', 'repli_in',
       'repli_bor', 'repli_bor_deriv', 'repli_deriv', 'repli_vari',
       'recomb_in', 'recomb_bor', 'dnase_in', 'dnase_bor', 'DNA_l', 'LINE_l',
       'LTR_l', 'SINE_l', 'Low_complexity_l', 'Retroposon_l', 'Satellite_l',
       'Simple_repeat_l', 'rRNA_l', 'snRNA_l', 'scRNA_l', 'srpRNA_l', 'tRNA_l',
       'RC_l', 'DNA_r', 'LINE_r', 'LTR_r', 'SINE_r', 'Low_complexity_r',
       'Retroposon_r', 'Satellite_r', 'Simple_repeat_r', 'rRNA_r', 'snRNA_r',
       'scRNA_r', 'srpRNA_r', 'tRNA_r', 'RC_r', 'L1_s_l', 'L2_s_l', 'MIR_s_l',
       'Alu_s_l', 'Satellite_s_l', 'L1_s_r', 'L2_s_r', 'MIR_s_r', 'Alu_s_r',
       'Satellite_s_r', 'used_coor_l_s', 'used_coor_l_e', 'used_coor_r_s',
       'used_coor_r_e', 'CG_frac_l', 'CG_frac_r', 'CG_frac_in', 'jumps'],
      dtype='object')

In [112]:
def get_duplicated_regions():
    add_columns = ['component_size', 'intra_degree', 'iner_degree', 'self_loops', 'edges_double', 'edges_tandem', 'edges_ident_mean']
    old_data = pd.read_csv("SD_network/NMF_Analysis/files/Duplicated_Regions_old_data.csv")
    duplicated_regions = pd.read_csv("SD_network/NMF_Analysis/files/out_df_ws_jumps.csv")

    duplicated_regions["CG_frac_l"][duplicated_regions["CG_frac_l"] == -1.000] = 0.0
    duplicated_regions["CG_frac_l"][duplicated_regions["CG_frac_l"] == 0.0] = duplicated_regions["CG_frac_r"][duplicated_regions["CG_frac_l"] == 0.0]
    duplicated_regions["CG_frac_r"][duplicated_regions["CG_frac_r"] == -1.000] = 0.0
    duplicated_regions["CG_frac_r"][duplicated_regions["CG_frac_r"] == 0.0] = duplicated_regions["CG_frac_l"][duplicated_regions["CG_frac_r"] == 0.0]
    duplicated_regions["CG_frac_in"][duplicated_regions["CG_frac_in"] == -1.000] = 0.0
    duplicated_regions["CG_frac_in"][duplicated_regions["CG_frac_in"] == 0.0] = np.mean(duplicated_regions["CG_frac_in"])

    duplicated_regions["CG_frac"] = (duplicated_regions["CG_frac_l"] + duplicated_regions["CG_frac_r"]) / 2
    duplicated_regions["CG_frac"][duplicated_regions["CG_frac"] == 0.0] = np.mean(duplicated_regions["CG_frac_in"])

    duplicated_regions.drop(["CG_frac_l", "CG_frac_r"], axis=1, inplace=True)

    for i in add_columns:
        duplicated_regions[i] = old_data[i]

    left_right_columns = ["DNA", "LINE", "LTR", "SINE", "Low_complexity", "Retroposon", "Satellite", "Simple_repeat", "rRNA", "snRNA", "scRNA", "srpRNA", "tRNA", "RC", 'L1_s', 'L2_s', 'MIR_s', 'Alu_s', 'Satellite_s']

    for i in left_right_columns:
        duplicated_regions[i] = duplicated_regions[i + "_r"] + duplicated_regions[i + "_l"]
        duplicated_regions.drop([i + "_r", i + "_l"], axis=1, inplace=True)

    return duplicated_regions


In [113]:
def find_duplicated_regions(splits, duplicated_regions, chromosome_name):
    splits2 = copy.deepcopy(splits)
    df = duplicated_regions[duplicated_regions["chr"] == chromosome_name]

    for i, row in df.iterrows():
        splits2[row["coor_s"]:row["coor_e"]] = row["ids"]

    interval_with_duplicated_regions = {}

    for i in splits:
        overlaps = splits2.overlap(i.begin, i.end)
        overlaps.remove(Interval(i.begin, i.end, (i.begin, i.end)))
        interval_with_duplicated_regions[(i.begin, i.end)] = overlaps

    return interval_with_duplicated_regions


In [114]:
def length_of_overlap(reg1,reg2):
    if reg1[0]<=reg2[0]:
        return(reg1[1]-reg2[0])
    else:
        return(reg2[1]-reg1[0])

In [115]:
def binning(x):
    if x>0.933:
        return 1
    else:
        return 0

In [116]:
def binning1(x): #(1.745, 0.196)
    if x>0.40174999999999994:
        return 1
    else:
        return 0

In [117]:
def binning2(x): #0.749, 0.065
    if x>0.424:
        return 1
    else:
        return 0

In [118]:
def binning_length(x):
    if x<2574.5:
        return 0 
    else:
        return 1

In [119]:
columns_needed=["chromosome_name","start", "end",'length_0','length_1', 'jumps',  'gaps', 'genes',
       'cpgisl_in', 'cpgisl_bor', 'repli_in', 'repli_bor', 'repli_bor_deriv',
       'repli_deriv', 'recomb_in', 'recomb_bor', 'dnase_in', 'dnase_bor','DNA', 'LINE',
       'LTR', 'SINE', 'Low_complexity', 'Retroposon', 'Satellite',
       'Simple_repeat', 'rRNA', 'snRNA', 'scRNA', 'srpRNA', 'tRNA', 'RC',
       'L1_s', 'L2_s', 'MIR_s', 'Alu_s', 'Satellite_s','component_size', 'intra_degree', 'iner_degree', 'self_loops',
       'edges_double', 'edges_tandem','edges_ident_mean_0','edges_ident_mean_1',"CG_frac_0","CG_frac_1","CG_frac_in_0","CG_frac_in_1"]

In [120]:
len(columns_needed)

49

In [121]:
def create_training_data(duplicated_regions):
    columns_needed = ["chromosome_name", "start", "end", 'length_0', 'length_1', 'jumps', 'gaps', 'genes',
                      'cpgisl_in', 'cpgisl_bor', 'repli_in', 'repli_bor', 'repli_bor_deriv',
                      'repli_deriv', 'recomb_in', 'recomb_bor', 'dnase_in', 'dnase_bor', 'DNA', 'LINE',
                      'LTR', 'SINE', 'Low_complexity', 'Retroposon', 'Satellite',
                      'Simple_repeat', 'rRNA', 'snRNA', 'scRNA', 'srpRNA', 'tRNA', 'RC',
                      'L1_s', 'L2_s', 'MIR_s', 'Alu_s', 'Satellite_s', 'component_size', 'intra_degree',
                      'iner_degree', 'self_loops', 'edges_double', 'edges_tandem', 'edges_ident_mean_0',
                      'edges_ident_mean_1', "CG_frac_0", "CG_frac_1", "CG_frac_in_0", "CG_frac_in_1"]

    df = pd.DataFrame(columns=columns_needed)

    for i in range(1, 23):
        print("Processing Chromosome:", i)
        chr_name = "chr" + str(i)
        splits = get_splits(chr_name)
        overlap_data = find_duplicated_regions(splits, duplicated_regions, i)

        for split in overlap_data:
            start = split[0]
            end = split[1]
            cumm_features = np.zeros(len(columns_needed))
            cumm_features[0] = i
            cumm_features[1] = start
            cumm_features[2] = end

            for dups in overlap_data[split]:
                dups_id = dups[2]
                dup_data = features[features["ids"] == dups_id]

                if len(dup_data) != 0:
                    length_of_dup = np.array(dup_data["length"])[0]
                    cumm_features[3 + binning_length(length_of_dup)] += 1
                    index_repli = 5
                    for count in np.array(
                            dup_data[['jumps', 'gaps', 'genes', 'cpgisl_in', 'cpgisl_bor', 'repli_in',
                                       'repli_bor', 'repli_bor_deriv', 'repli_deriv', 'recomb_in',
                                       'recomb_bor', 'dnase_in', 'dnase_bor', 'DNA', 'LINE', 'LTR',
                                       'SINE', 'Low_complexity', 'Retroposon', 'Satellite', 'Simple_repeat',
                                       'rRNA', 'snRNA', 'scRNA', 'srpRNA', 'tRNA', 'RC', 'L1_s', 'L2_s',
                                       'MIR_s', 'Alu_s', 'Satellite_s', 'component_size', 'intra_degree',
                                       'iner_degree', 'self_loops', 'edges_double', 'edges_tandem',
                                       "CG_frac", "CG_frac_in"]])[0]:
                        cumm_features[index_repli] += int(count)
                        index_repli += 1
                    index_edges_ident_mean = 43
                    cumm_features[
                        index_edges_ident_mean + int(binning(np.array(dup_data['edges_ident_mean'])[0]))] += 1
                    index_cg_frac = 45
                    cumm_features[index_cg_frac + int(binning1(np.array(dup_data['CG_frac'])[0]))] += 1
                    index_cg_frac_in = 47
                    cumm_features[index_cg_frac_in + int(binning2(np.array(dup_data['CG_frac_in'])[0]))] += 1

            df2 = pd.DataFrame([cumm_features], columns=columns_needed)
            df = pd.concat([df, df2])

    return df


In [122]:
features=get_duplicated_regions()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  duplicated_regions["CG_frac_l"][duplicated_regions["CG_frac_l"] == -1.000] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

In [123]:
features.to_csv("SD_network/NMF_Analysis/outputs/1MB/Duplicated_Regions_1MB_processed.csv",index=False)

In [124]:
df_ML_output=create_training_data(features)

Processing Chromosome: 1


  df = pd.concat([df, df2])


Processing Chromosome: 2
Processing Chromosome: 3
Processing Chromosome: 4
Processing Chromosome: 5
Processing Chromosome: 6
Processing Chromosome: 7
Processing Chromosome: 8
Processing Chromosome: 9
Processing Chromosome: 10
Processing Chromosome: 11
Processing Chromosome: 12
Processing Chromosome: 13
Processing Chromosome: 14
Processing Chromosome: 15
Processing Chromosome: 16
Processing Chromosome: 17
Processing Chromosome: 18
Processing Chromosome: 19
Processing Chromosome: 20
Processing Chromosome: 21
Processing Chromosome: 22


In [125]:
df_ML_output

Unnamed: 0,chromosome_name,start,end,length_0,length_1,jumps,gaps,genes,cpgisl_in,cpgisl_bor,...,iner_degree,self_loops,edges_double,edges_tandem,edges_ident_mean_0,edges_ident_mean_1,CG_frac_0,CG_frac_1,CG_frac_in_0,CG_frac_in_1
0,1.0,187946361.0,188946361.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1.0,186946360.0,187946360.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0
0,1.0,27010028.0,28010028.0,3.0,1.0,6.0,0.0,4.0,0.0,0.0,...,3.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,1.0,3.0
0,1.0,32010033.0,33010033.0,3.0,1.0,4.0,0.0,4.0,0.0,0.0,...,4.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0
0,1.0,247946421.0,248946421.0,4.0,9.0,17.0,1.0,14.0,0.0,0.0,...,13.0,0.0,4.0,14.0,9.0,4.0,11.0,2.0,11.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,22.0,11699998.0,12699998.0,1.0,6.0,15.0,9.0,1.0,6.0,1.0,...,23.0,3.0,140.0,10.0,1.0,6.0,1.0,6.0,5.0,2.0
0,22.0,9699996.0,10699996.0,0.0,1.0,2.0,1.0,0.0,3.0,1.0,...,8.0,2.0,97.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
0,22.0,12699999.0,13699999.0,0.0,3.0,6.0,5.0,0.0,1.0,1.0,...,0.0,0.0,5.0,2.0,0.0,3.0,0.0,3.0,2.0,1.0
0,22.0,10699997.0,11699997.0,0.0,10.0,17.0,13.0,1.0,13.0,0.0,...,33.0,0.0,40.0,5.0,2.0,8.0,2.0,8.0,6.0,4.0


In [126]:
df_ML_output.columns

Index(['chromosome_name', 'start', 'end', 'length_0', 'length_1', 'jumps',
       'gaps', 'genes', 'cpgisl_in', 'cpgisl_bor', 'repli_in', 'repli_bor',
       'repli_bor_deriv', 'repli_deriv', 'recomb_in', 'recomb_bor', 'dnase_in',
       'dnase_bor', 'DNA', 'LINE', 'LTR', 'SINE', 'Low_complexity',
       'Retroposon', 'Satellite', 'Simple_repeat', 'rRNA', 'snRNA', 'scRNA',
       'srpRNA', 'tRNA', 'RC', 'L1_s', 'L2_s', 'MIR_s', 'Alu_s', 'Satellite_s',
       'component_size', 'intra_degree', 'iner_degree', 'self_loops',
       'edges_double', 'edges_tandem', 'edges_ident_mean_0',
       'edges_ident_mean_1', 'CG_frac_0', 'CG_frac_1', 'CG_frac_in_0',
       'CG_frac_in_1'],
      dtype='object')

In [127]:
df_ML_output.DNA

0    0.0
0    0.0
0    1.0
0    0.0
0    0.0
    ... 
0    0.0
0    0.0
0    0.0
0    0.0
0    1.0
Name: DNA, Length: 2810, dtype: float64

In [128]:
df_ML_output.to_csv("SD_network/NMF_Analysis/outputs/1MB/Duplicated_Regions_Final_1MB.csv",index=False)