In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pyBigWig

__Browser visualization:__

1) run this notebook: read in the correct file you want to visualize and format the pandas dataframe to have 4 columns (chr, start_pos, end_pos, value)
2) format the pandas dataframe to not have any overlapping regions
3) create 4 lists from the 4 pandas dataframe columns
4) initialize and open for writing the bigwig file
5) add header to the bigwig file
6) write data into the bigwig file
7) close file
8) read file back in and check that everything is correct
9) go to the neural_net_new/browser_visualisation/ folder from the terminal write: 
    * git add mybigwigfilethatIjustcreated.bw
    * git commit -m "pushing my new bigwig file because I am cool"
    * git push origin main
10) go to the USCS genome browser, go to my tracks and add a new track
    * example of the track line to input: track type=bigWig name="Example One" description="A bigWig file" bigDataUrl=http://genome.ucsc.edu/goldenPath/help/examples/bigWigExample.bw
    * replace the name, description with relevant information and get the raw bigwig file link from github and replace the bigDataUrl with this raw link
    * submit the track and check that everything looks correct
    * if you wish to create a collection, do a right click on the visualized track, create a new collection, add tracks there and then use merge option transparent to show transparent bar charts that overlap

# Observed

In [2]:
test_data_tsv_file_with_positions = pd.read_csv("../../../Anika/preprocessing/Overlapping_preprocessing_Carmen/preprocessing_results_genome/Train_val_test_files/Seq_masked_wo_target_test.tsv", sep='\t')


In [4]:
test_data_tsv_file_with_positions.head()

Unnamed: 0,#chr,start_pos,end_pos,1A,1C,1G,1T,2A,2C,2G,...,199G,199T,200A,200C,200G,200T,201A,201C,201G,201T
0,chr12,10025,10226,0,0,0,1,1,0,0,...,0,1,1,0,0,0,1,0,0,0
1,chr12,10050,10251,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,chr12,10075,10276,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
3,chr12,10100,10301,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
4,chr12,10125,10326,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0


In [5]:
test_data_tsv_file_with_positions = test_data_tsv_file_with_positions.iloc[:, 0:3]

In [6]:
test_data_tsv_file_with_positions.head()

Unnamed: 0,#chr,start_pos,end_pos
0,chr12,10025,10226
1,chr12,10050,10251
2,chr12,10075,10276
3,chr12,10100,10301
4,chr12,10125,10326


In [7]:
test_data_observed = pd.read_csv("../../../Anika/preprocessing/Overlapping_preprocessing_Carmen/preprocessing_results_genome/Train_val_test_files/Target_C02M02_masked_arcsinh_test.tsv", sep='\t')

In [8]:
test_data_observed.head()

Unnamed: 0,target_C02M02
0,0.00168
1,0.00168
2,0.00168
3,0.002527
4,0.003704


In [10]:
observed_with_pos = pd.concat([test_data_tsv_file_with_positions.reset_index(drop=True), test_data_observed], axis=1)

In [11]:
observed_with_pos.head()

Unnamed: 0,#chr,start_pos,end_pos,target_C02M02
0,chr12,10025,10226,0.00168
1,chr12,10050,10251,0.00168
2,chr12,10075,10276,0.00168
3,chr12,10100,10301,0.002527
4,chr12,10125,10326,0.003704


In [13]:
chr12 = observed_with_pos.loc[observed_with_pos['#chr'] == "chr12"]
chr22 = observed_with_pos.loc[observed_with_pos['#chr'] == "chr22"]

## Read in data

In [9]:
data = pd.read_csv('../../../Anika/preprocessing/Overlapping_preprocessing_Carmen/preprocessing_results_genome/chr22_target_C02M02_masked_arcsinh.tsv', sep='\t')
#data = pd.read_csv('../predictions/final_model_predictions_overlapping.tsv', sep='\t')
data

Unnamed: 0,target_C02M02
0,0.001680
1,0.001680
2,0.001680
3,0.001680
4,0.001680
...,...
1372221,0.017570
1372222,0.015614
1372223,0.019402
1372224,0.022897


In [53]:
chr12 = data.loc[data['#chr'] == "chr12"]
chr22 = data.loc[data['#chr'] == "chr22"]

In [14]:
chr22.head()

Unnamed: 0,#chr,start_pos,end_pos,target_C02M02
5172326,chr22,10510350,10510551,0.00168
5172327,chr22,10510375,10510576,0.00168
5172328,chr22,10510400,10510601,0.00168
5172329,chr22,10511375,10511576,0.00168
5172330,chr22,10511400,10511601,0.00168


In [15]:
chr22.tail()

Unnamed: 0,#chr,start_pos,end_pos,target_C02M02
6544547,chr22,50807950,50808151,0.01757
6544548,chr22,50807975,50808176,0.015614
6544549,chr22,50808000,50808201,0.019402
6544550,chr22,50808025,50808226,0.022897
6544551,chr22,50808050,50808251,0.024074


## Select relevant columns

In [63]:
chr12_rm = chr12.iloc[:,-1:]

In [64]:
chr12_rm.head()

Unnamed: 0,target_C02M02
0,0.00168
1,0.00168
2,0.00168
3,0.002527
4,0.003704


In [66]:
new_df = pd.concat([chr12.iloc[:,0:3],chr12.iloc[:,-1:]],axis = 1)

In [67]:
new_df.reset_index(drop=True, inplace=True) 

In [68]:
new_df.head()

Unnamed: 0,#chr,start_pos,end_pos,target_C02M02
0,chr12,10025,10226,0.00168
1,chr12,10050,10251,0.00168
2,chr12,10075,10276,0.00168
3,chr12,10100,10301,0.002527
4,chr12,10125,10326,0.003704


## Rename columns if needed

In [69]:
new_df = new_df.rename(columns = {"#chr": "chr"})

In [70]:
new_df.tail()

Unnamed: 0,chr,start_pos,end_pos,target_C02M02
5172321,chr12,133264625,133264826,0.150208
5172322,chr12,133264650,133264851,0.205605
5172323,chr12,133264675,133264876,0.245385
5172324,chr12,133264700,133264901,0.271474
5172325,chr12,133264725,133264926,0.274787


## Make windows non overlapping (the next one will begin where the last one ended)

In [71]:
start_pos_list_replace = new_df["start_pos"].tolist()
start_pos_list_replace = start_pos_list_replace[1:] + [new_df["end_pos"].tolist()[-1]]

In [72]:
new_df_non_overlap = new_df
new_df_non_overlap["end_pos"] = start_pos_list_replace
new_df_non_overlap.tail()

Unnamed: 0,chr,start_pos,end_pos,target_C02M02
5172321,chr12,133264625,133264650,0.150208
5172322,chr12,133264650,133264675,0.205605
5172323,chr12,133264675,133264700,0.245385
5172324,chr12,133264700,133264725,0.271474
5172325,chr12,133264725,133264926,0.274787


## Initialize the bigwig file and add header

In [73]:
bw = pyBigWig.open("bigwig_chr_12_observed.bw", "w")
bw.addHeader([("chr12", 133275309)])

# chromosome lengths can be for hg38 can be found from here: https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39

## Get lists from pandas dataframe columns

In [75]:
chrom_list = new_df_non_overlap["chr"].tolist()
chrom_list = np.array(["chr12"] * len(chrom_list)) 
start_pos_list = new_df_non_overlap["start_pos"].tolist()
end_pos_list = new_df_non_overlap["end_pos"].tolist()
target_list = new_df_non_overlap["target_C02M02"].tolist()
target_list[0:6]

[0.0016799991963924,
 0.0016799991963924,
 0.0016799991963924,
 0.002527161483715,
 0.0037037726320879,
 0.0048803786529556]

## Write data to bigwig file

In [76]:
for i in range(0, len(chrom_list)):
    bw.addEntries([chrom_list[i]], [start_pos_list[i]], ends=[end_pos_list[i]], values=[target_list[i]])

## Close the created bigwig file

In [77]:
bw.close()

## Read file in again and check that everything is correct

In [89]:
C02M02 = pyBigWig.open("bigwig_chr_22_observed.bw")

In [90]:
C02M02.isBigWig()

True

In [91]:
C02M02.chroms()

{'chr22': 50818468}

In [92]:
C02M02.values("chr22", 50807950, 50808176)

[0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.017569543793797493,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.015614042989909649,
 0.01561404