## This notebook extracts TE counts from the telescope quantification
- note that I customized the GTF by making the transcript ID repName+thw swScore column
- then in all later quantification, I used the transcript ID as identifier
- limitation: I have not taken into account the biases induced by multiple genome loci mapping into different transcript IDs

In [3]:
import sys
import os
import numpy as np
import pandas as pd

## load data

### load annotation table

In [3]:
rmsk = pd.read_csv("./mm39_rmsk_full.tsv", delimiter='\t', header = 0, index_col = False )
rmsk.head()

Unnamed: 0,#bin,swScore,milliDiv,milliDel,milliIns,genoName,genoStart,genoEnd,genoLeft,strand,repName,repClass,repFamily,repStart,repEnd,repLeft,id
0,1,3777,194,105,11,chr1,8387806,8388657,-186765622,+,Lx2B2,LINE,L1,5997,7041,-8,7
1,1,595,269,47,47,chr1,41942994,41943142,-153211137,+,B3,SINE,B2,4,151,-65,6
2,1,1796,281,48,69,chr1,50331618,50332377,-144821902,-,Lx7,LINE,L1,-1163,6533,5801,7
3,1,5180,80,29,11,chr1,58720077,58721182,-136433097,+,L1MdV_III,LINE,L1,681,1770,-2002,8
4,2,1316,273,85,19,chr1,100663164,100663479,-94490800,+,MLTR14,LTR,ERV1,161,505,-38,1


In [4]:
transcript_id_col = [ rmsk.loc[i, 'repName'] + '_' + str( rmsk.loc[i, 'swScore'] ) for i in rmsk.index ] 

In [5]:
transcript_id_col[0]

'Lx2B2_3777'

In [6]:
rmsk.shape

(5320771, 17)

### load KO results

In [7]:
KO_fnmaes = [ f'KO_{i+1}-telescope_report.tsv' for i in range(0,4) ] 
KO_fnmaes

['KO_1-telescope_report.tsv',
 'KO_2-telescope_report.tsv',
 'KO_3-telescope_report.tsv',
 'KO_4-telescope_report.tsv']

#### test data format

In [12]:
test1 = pd.read_csv(KO_fnmaes[0], header = 1, index_col = 0, delimiter='\t' )
test1.head()

Unnamed: 0_level_0,transcript_length,final_count,final_conf,final_prop,init_aligned,unique_count,init_best,init_best_random,init_best_avg,init_prop
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RLTR4_MM-int_19866,2582,66221,2825.0,0.0218,66242,2822,2894,28342,28284.92,0.0122
SSU-rRNA_Hsa_15330,1849,65772,65765.0,0.0187,65774,62815,63694,64462,64440.5,0.0186
RLTR4_MM-int_11275,1839,64587,21.0,0.0189,64608,18,21,27985,27988.57,0.0133
RLTR4_MM-int_12911,1863,21171,21104.0,0.00913,21173,19845,19866,20484,20473.83,0.00887
RLTR4_MM-int_17371,2370,17955,17191.0,0.00752,17986,14556,14686,15998,15985.99,0.00686


transcript
RLTR4_MM-int_19866     66221
SSU-rRNA_Hsa_15330     65772
RLTR4_MM-int_11275     64587
RLTR4_MM-int_12911     21171
RLTR4_MM-int_17371     17955
                       ...  
L1MdTf_I_34972             0
L1MdA_VI_6625              0
ERVB7_1-LTR_MM_2674        0
MMERGLN_LTR_3779           0
MMERGLN_LTR_3790           0
Name: final_count, Length: 93672, dtype: int64

In [32]:
test2 = pd.read_csv(KO_fnmaes[1], header = 1, index_col = 0, delimiter='\t' )
test2.head()

Unnamed: 0_level_0,transcript_length,final_count,final_conf,final_prop,init_aligned,unique_count,init_best,init_best_random,init_best_avg,init_prop
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
RLTR4_MM-int_19866,2582,63911,2389.0,0.021,63938,2388,2460,27034,27031.58,0.012
RLTR4_MM-int_11275,1839,62434,24.0,0.019,62446,22,24,27477,27229.47,0.0132
SSU-rRNA_Hsa_15330,1849,52553,52547.0,0.0156,52554,50280,50969,51503,51516.0,0.0155
RLTR4_MM-int_12911,1863,21387,21344.0,0.00975,21389,20003,20029,20688,20677.82,0.00947
RLTR4_MM-int_17371,2370,17466,16721.0,0.00771,17497,14158,14276,15545,15549.01,0.00701


In [33]:
KO_df = pd.DataFrame()
KO_df.loc[:, 'test1' ] = test1['final_count'] 

KO_df.merge( test2['final_count'], left_index= True,  right_index= True, how = 'outer' , suffixes=['', '_' + 'test2'] ) 

Unnamed: 0_level_0,test1,final_count
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1
(A)n_15,22.0,18.0
(A)n_16,44.0,54.0
(A)n_17,75.0,51.0
(A)n_18,112.0,112.0
(A)n_19,210.0,208.0
...,...,...
tRNA-Trp-TGG_617,3.0,1.0
tRNA-Trp-TGG_646,2.0,
tRNA-Trp-TGG_662,1.0,
tRNA-Tyr-TAC_480,,3.0


#### a function to get all counts from KO samples

In [8]:
KO_ids = [ f'KO_{i+1}' for i in range(0,4) ] 

In [34]:
KO_df = pd.DataFrame()

In [35]:
for i,kf in enumerate( KO_fnmaes):
    print( kf) 
    dat = pd.read_csv(kf, header = 1, index_col = 0, delimiter='\t' )
    if( KO_df.shape[1] == 0 ):
        KO_df.loc[:, KO_ids[i] ] = dat['final_count'] 
        continue
    else:
        #print( 'at merge' )
        KO_df = KO_df.merge( dat['final_count'], left_index= True,  right_index= True, how = 'outer' , suffixes=['', '_' + KO_ids[i]] ) 

KO_1-telescope_report.tsv
KO_2-telescope_report.tsv
KO_3-telescope_report.tsv
KO_4-telescope_report.tsv


In [36]:
KO_df.head()

Unnamed: 0_level_0,KO_1,final_count,final_count_KO_3,final_count_KO_4
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(A)n_15,22.0,18.0,20.0,18.0
(A)n_16,44.0,54.0,53.0,50.0
(A)n_17,75.0,51.0,76.0,64.0
(A)n_18,112.0,112.0,113.0,121.0
(A)n_19,210.0,208.0,170.0,185.0


In [44]:
KO_df.columns = KO_ids

### load WT resutls

In [37]:
WT_fnmaes = [ f'WT_{i+1}-telescope_report.tsv' for i in range(0,4) ] 
WT_fnmaes

['WT_1-telescope_report.tsv',
 'WT_2-telescope_report.tsv',
 'WT_3-telescope_report.tsv',
 'WT_4-telescope_report.tsv']

In [38]:
WT_ids = [ f'WT_{i+1}' for i in range(0,4) ] 

In [39]:
WT_df = pd.DataFrame()

In [40]:
for i,kf in enumerate( WT_fnmaes):
    print( kf) 
    dat = pd.read_csv(kf, header = 1, index_col = 0, delimiter='\t' )
    if( WT_df.shape[1] == 0 ):
        WT_df.loc[:, WT_ids[i] ] = dat['final_count'] 
        continue
    else:
        #print( 'at merge' )
        WT_df = WT_df.merge( dat['final_count'], left_index= True,  right_index= True, how = 'outer' , suffixes=['', '_' + KO_ids[i]] ) 

WT_1-telescope_report.tsv
WT_2-telescope_report.tsv
WT_3-telescope_report.tsv
WT_4-telescope_report.tsv


In [41]:
WT_df.head()

Unnamed: 0_level_0,WT_1,final_count,final_count_KO_3,final_count_KO_4
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(A)n_15,31.0,26.0,31.0,31.0
(A)n_16,45.0,65.0,54.0,56.0
(A)n_17,79.0,67.0,105.0,73.0
(A)n_18,93.0,137.0,125.0,119.0
(A)n_19,174.0,212.0,178.0,176.0


In [42]:
WT_df.shape

(143660, 4)

In [43]:
WT_df[ WT_df.index.str.startswith('Lx2B2') ]

Unnamed: 0_level_0,WT_1,final_count,final_count_KO_3,final_count_KO_4
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lx2B2_1024,15.0,18.0,10.0,6.0
Lx2B2_1033,,,,1.0
Lx2B2_1036,,7.0,,
Lx2B2_10485,,1.0,,
Lx2B2_1054,16.0,20.0,9.0,4.0
...,...,...,...,...
Lx2B2_944,,3.0,,
Lx2B2_950,,2.0,,1.0
Lx2B2_962,,1.0,,
Lx2B2_9753,7.0,20.0,24.0,6.0


In [45]:
WT_df.columns = WT_ids

In [46]:
WT_df.head()

Unnamed: 0_level_0,WT_1,WT_2,WT_3,WT_4
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
(A)n_15,31.0,26.0,31.0,31.0
(A)n_16,45.0,65.0,54.0,56.0
(A)n_17,79.0,67.0,105.0,73.0
(A)n_18,93.0,137.0,125.0,119.0
(A)n_19,174.0,212.0,178.0,176.0


## Get individual loci result

In [48]:
print( KO_df.shape)
print( WT_df.shape)

(155300, 4)
(143660, 4)


In [56]:
indiv_df = WT_df.merge( KO_df, left_index=True, right_index= True, how = 'outer' )
indiv_df.shape

(186775, 8)

In [57]:
indiv_df.head()

Unnamed: 0_level_0,WT_1,WT_2,WT_3,WT_4,KO_1,KO_2,KO_3,KO_4
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(A)n_15,31.0,26.0,31.0,31.0,22.0,18.0,20.0,18.0
(A)n_16,45.0,65.0,54.0,56.0,44.0,54.0,53.0,50.0
(A)n_17,79.0,67.0,105.0,73.0,75.0,51.0,76.0,64.0
(A)n_18,93.0,137.0,125.0,119.0,112.0,112.0,113.0,121.0
(A)n_19,174.0,212.0,178.0,176.0,210.0,208.0,170.0,185.0


In [52]:
#indiv_df.fillna( 0 , inplace= True)

## get subF result

In [58]:
subF = indiv_df.copy()

In [62]:
repnames = [ "_".join( s.split("_")[0:-1]) for s in indiv_df.index ]
#repnames

In [66]:
#repnames[-500: -490]

In [68]:
subF.loc[: ,'repName'] = repnames

In [69]:
subF.head()

Unnamed: 0_level_0,WT_1,WT_2,WT_3,WT_4,KO_1,KO_2,KO_3,KO_4,repName
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
(A)n_15,31.0,26.0,31.0,31.0,22.0,18.0,20.0,18.0,(A)n
(A)n_16,45.0,65.0,54.0,56.0,44.0,54.0,53.0,50.0,(A)n
(A)n_17,79.0,67.0,105.0,73.0,75.0,51.0,76.0,64.0,(A)n
(A)n_18,93.0,137.0,125.0,119.0,112.0,112.0,113.0,121.0,(A)n
(A)n_19,174.0,212.0,178.0,176.0,210.0,208.0,170.0,185.0,(A)n


In [70]:
subF = subF.groupby("repName").sum()

In [71]:
subF.head()

Unnamed: 0_level_0,WT_1,WT_2,WT_3,WT_4,KO_1,KO_2,KO_3,KO_4
repName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(A)n,4258.0,4761.0,4701.0,4254.0,4281.0,4280.0,4587.0,4142.0
(AAAAAAC)n,16.0,14.0,8.0,10.0,13.0,7.0,6.0,7.0
(AAAAAAG)n,5.0,8.0,4.0,1.0,5.0,4.0,6.0,7.0
(AAAAAC)n,177.0,210.0,210.0,188.0,200.0,169.0,162.0,151.0
(AAAAACA)n,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0


## save data

In [77]:
logi = pd.DataFrame( index = ['nb_created' , 'nb_updated' , 'explain'] )

In [73]:
indiv_df.to_csv("./count_table_indiv.csv", header = True, index = True )

In [74]:
subF.to_csv("./count_table_subF.csv", header = True, index = True )

In [75]:
rmsk.to_csv("./mm39_rmsk_full.tsv", sep='\t', header = True, index = False )

In [78]:
logi['count_table_indiv.csv'] = [ '1' , 'NA' , 'count table obtained from combining the final_count column from each sample s telescope report']

logi['count_table_subF.csv'] = [ '1' , 'NA' , 'count table obtained from the indiv count table. Summed up all elements with the same repName']

logi['mm39_rmsk_full.tsv'] = [ 'downloaded from ucsc table browser' , '1' , 'added the column of transcript_id, which is the ID used to run individual locus for telescope']


In [79]:
logi.head()

Unnamed: 0,count_table_indiv.csv,count_table_subF.csv,mm39_rmsk_full.tsv
nb_created,1,1,downloaded from ucsc table browser
nb_updated,,,1
explain,count table obtained from combining the final_...,count table obtained from the indiv count tabl...,"added the column of transcript_id, which is th..."


In [80]:
logi.to_csv("./OUTPUT_EXPLAIN.csv", header = True, index = True )

In [2]:
#logi

In [4]:
deseq_coldata = [ f'WT_{i+1}' for i in range(4)] + [ f'KO_{i+1}' for i in range(4)]

In [5]:
deseq_coldata

['WT_1', 'WT_2', 'WT_3', 'WT_4', 'KO_1', 'KO_2', 'KO_3', 'KO_4']

In [6]:
deseq_coldata_condition = [ f'WT' for i in range(4)] + [ f'KO' for i in range(4)]

In [7]:
coldata_df = pd.DataFrame()
coldata_df['samples'] = deseq_coldata
coldata_df['conditions'] = deseq_coldata_condition

In [8]:
coldata_df

Unnamed: 0,samples,conditions
0,WT_1,WT
1,WT_2,WT
2,WT_3,WT
3,WT_4,WT
4,KO_1,KO
5,KO_2,KO
6,KO_3,KO
7,KO_4,KO


In [9]:
coldata_df.to_csv("./deseq_coldata.csv", header = True, index = False )