## Manipolazione di un file GTF (Gene Transfer Format) attraverso la libreria `Pandas`

#### 1) Importare `Pandas`

In [78]:
import pandas as pd

#### 2) Leggere il file GTF

    df = pd.read_csv(gtf_file_name, sep='\t', header = None)

In [79]:
# Load the tab-separated GTF file. header=None tells pandas not to treat the
# first line as column names, so the columns get integer labels.
df = pd.read_csv(
    filepath_or_buffer='./input.gtf',
    sep='\t',
    header=None,
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,ENm006,VEGA_Known,exon,71783,71788,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
1,ENm006,VEGA_Known,CDS,71783,71788,.,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
2,ENm006,VEGA_Known,exon,70312,70440,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
3,ENm006,VEGA_Known,CDS,70312,70440,.,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
4,ENm006,VEGA_Known,exon,69989,70210,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
...,...,...,...,...,...,...,...,...,...
405,ENm006,VEGA_Known,CDS,56303,56327,.,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
406,ENm006,VEGA_Known,exon,56689,57573,.,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
407,ENm006,VEGA_Known,CDS,56689,57573,.,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
408,ENm006,VEGA_Known,exon,57680,58323,.,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."


#### 3) Cambiare i nomi delle colonne

I nomi delle colonne devono essere:
- reference
- source
- feature
- start
- end
- score
- strand
- frame
- attributes

In [80]:
# Column names listed in positional order; enumerate() pairs each name with
# the integer label (0..8) the column currently carries.
replace_dict = dict(enumerate([
    'reference', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attributes',
]))
df.rename(columns=replace_dict, inplace=True)

In [81]:
df

Unnamed: 0,reference,source,feature,start,end,score,strand,frame,attributes
0,ENm006,VEGA_Known,exon,71783,71788,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
1,ENm006,VEGA_Known,CDS,71783,71788,.,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
2,ENm006,VEGA_Known,exon,70312,70440,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
3,ENm006,VEGA_Known,CDS,70312,70440,.,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
4,ENm006,VEGA_Known,exon,69989,70210,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
...,...,...,...,...,...,...,...,...,...
405,ENm006,VEGA_Known,CDS,56303,56327,.,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
406,ENm006,VEGA_Known,exon,56689,57573,.,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
407,ENm006,VEGA_Known,CDS,56689,57573,.,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
408,ENm006,VEGA_Known,exon,57680,58323,.,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."


#### 4) Eliminare le colonne `source` e `score` e sostituire l'identificatore `ENm006` con l'identificatore `ENCODE_REGION` in tutti i campi della colonna `reference`

In [82]:
# Remove the two columns in place; `columns=` is the keyword equivalent of
# passing the labels together with axis=1.
df.drop(columns=['source', 'score'], inplace=True)

In [83]:
df

Unnamed: 0,reference,feature,start,end,strand,frame,attributes
0,ENm006,exon,71783,71788,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
1,ENm006,CDS,71783,71788,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
2,ENm006,exon,70312,70440,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
3,ENm006,CDS,70312,70440,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
4,ENm006,exon,69989,70210,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
406,ENm006,exon,56689,57573,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
407,ENm006,CDS,56689,57573,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
408,ENm006,exon,57680,58323,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."


In [84]:
# Replace only the `ENm006` identifier, as the exercise asks. Assigning the
# constant to the whole column would also overwrite any other reference
# value that might be present in the file.
df['reference'] = df['reference'].replace('ENm006', 'ENCODE_REGION')

In [85]:
df

Unnamed: 0,reference,feature,start,end,strand,frame,attributes
0,ENCODE_REGION,exon,71783,71788,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
1,ENCODE_REGION,CDS,71783,71788,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
2,ENCODE_REGION,exon,70312,70440,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
3,ENCODE_REGION,CDS,70312,70440,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
4,ENCODE_REGION,exon,69989,70210,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
...,...,...,...,...,...,...,...
405,ENCODE_REGION,CDS,56303,56327,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
406,ENCODE_REGION,exon,56689,57573,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
407,ENCODE_REGION,CDS,56689,57573,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
408,ENCODE_REGION,exon,57680,58323,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."


#### 5) Sostituire la colonna degli attributi con le due colonne  `transcript` e `gene`

Le colonne `transcript` e `gene` dovranno contenere solo l'ID del trascritto e del gene, rispettivamente.

In [86]:
# Create the two destination columns, `gene` first and then `transcript`,
# both filled with empty strings.
df['gene'] = df['transcript'] = ''

In [87]:
import re

In [88]:
# Pull both IDs out of `attributes` with vectorized string methods instead of
# iterating over the rows and writing one cell at a time through `.loc`.
# Each pattern captures the text between the key and the next `;`; the double
# quotes around the value are then removed.
# A row that lacks one of the keys gets NaN instead of raising AttributeError.
df['transcript'] = (
    df['attributes']
    .str.extract(r'transcript_id\s+(.+?);', expand=False)
    .str.replace('"', '', regex=False)
)
df['gene'] = (
    df['attributes']
    .str.extract(r'gene_id\s+(.+?);', expand=False)
    .str.replace('"', '', regex=False)
)

In [89]:
df

Unnamed: 0,reference,feature,start,end,strand,frame,attributes,gene,transcript
0,ENCODE_REGION,exon,71783,71788,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",ARHGAP4,U52112.4-005
1,ENCODE_REGION,CDS,71783,71788,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",ARHGAP4,U52112.4-005
2,ENCODE_REGION,exon,70312,70440,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",ARHGAP4,U52112.4-005
3,ENCODE_REGION,CDS,70312,70440,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",ARHGAP4,U52112.4-005
4,ENCODE_REGION,exon,69989,70210,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",ARHGAP4,U52112.4-005
...,...,...,...,...,...,...,...,...,...
405,ENCODE_REGION,CDS,56303,56327,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",AVPR2,U52112.2-001
406,ENCODE_REGION,exon,56689,57573,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",AVPR2,U52112.2-001
407,ENCODE_REGION,CDS,56689,57573,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",AVPR2,U52112.2-001
408,ENCODE_REGION,exon,57680,58323,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",AVPR2,U52112.2-001


In [90]:
# Drop the raw `attributes` column in place.
df.drop(columns='attributes', inplace=True)

In [92]:
df

Unnamed: 0,reference,feature,start,end,strand,frame,gene,transcript
0,ENCODE_REGION,exon,71783,71788,-,.,ARHGAP4,U52112.4-005
1,ENCODE_REGION,CDS,71783,71788,-,0,ARHGAP4,U52112.4-005
2,ENCODE_REGION,exon,70312,70440,-,.,ARHGAP4,U52112.4-005
3,ENCODE_REGION,CDS,70312,70440,-,0,ARHGAP4,U52112.4-005
4,ENCODE_REGION,exon,69989,70210,-,.,ARHGAP4,U52112.4-005
...,...,...,...,...,...,...,...,...
405,ENCODE_REGION,CDS,56303,56327,+,0,AVPR2,U52112.2-001
406,ENCODE_REGION,exon,56689,57573,+,.,AVPR2,U52112.2-001
407,ENCODE_REGION,CDS,56689,57573,+,2,AVPR2,U52112.2-001
408,ENCODE_REGION,exon,57680,58323,+,.,AVPR2,U52112.2-001


#### 6) Aggiungere la colonna `length` contenente la lunghezza della feature

In [93]:
# length = end - start + 1, computed element-wise over the two columns.
df['length'] = df['end'].sub(df['start']).add(1)

In [94]:
df

Unnamed: 0,reference,feature,start,end,strand,frame,gene,transcript,length
0,ENCODE_REGION,exon,71783,71788,-,.,ARHGAP4,U52112.4-005,6
1,ENCODE_REGION,CDS,71783,71788,-,0,ARHGAP4,U52112.4-005,6
2,ENCODE_REGION,exon,70312,70440,-,.,ARHGAP4,U52112.4-005,129
3,ENCODE_REGION,CDS,70312,70440,-,0,ARHGAP4,U52112.4-005,129
4,ENCODE_REGION,exon,69989,70210,-,.,ARHGAP4,U52112.4-005,222
...,...,...,...,...,...,...,...,...,...
405,ENCODE_REGION,CDS,56303,56327,+,0,AVPR2,U52112.2-001,25
406,ENCODE_REGION,exon,56689,57573,+,.,AVPR2,U52112.2-001,885
407,ENCODE_REGION,CDS,56689,57573,+,2,AVPR2,U52112.2-001,885
408,ENCODE_REGION,exon,57680,58323,+,.,AVPR2,U52112.2-001,644


In [97]:
# Rebuild the frame with `length` placed right after `end`; the other
# columns keep their relative order.
df = df.reindex(
    labels=[
        'reference', 'feature', 'start', 'end', 'length',
        'strand', 'frame', 'gene', 'transcript',
    ],
    axis='columns',
)

In [98]:
df

Unnamed: 0,reference,feature,start,end,length,strand,frame,gene,transcript
0,ENCODE_REGION,exon,71783,71788,6,-,.,ARHGAP4,U52112.4-005
1,ENCODE_REGION,CDS,71783,71788,6,-,0,ARHGAP4,U52112.4-005
2,ENCODE_REGION,exon,70312,70440,129,-,.,ARHGAP4,U52112.4-005
3,ENCODE_REGION,CDS,70312,70440,129,-,0,ARHGAP4,U52112.4-005
4,ENCODE_REGION,exon,69989,70210,222,-,.,ARHGAP4,U52112.4-005
...,...,...,...,...,...,...,...,...,...
405,ENCODE_REGION,CDS,56303,56327,25,+,0,AVPR2,U52112.2-001
406,ENCODE_REGION,exon,56689,57573,885,+,.,AVPR2,U52112.2-001
407,ENCODE_REGION,CDS,56689,57573,885,+,2,AVPR2,U52112.2-001
408,ENCODE_REGION,exon,57680,58323,644,+,.,AVPR2,U52112.2-001


#### 7) Rimuovere tutte le features di lunghezza minore o uguale a 6 basi

#### 8) Ottenere un data frame ordinato per coordinate crescenti delle features

#### 9) Ottenere un data frame ordinato per lunghezza crescente delle features

#### 10) Determinare la lista dei geni annotati

#### 11) Determinare la lista degli identificatori dei trascritti annotati

#### 12) Calcolare la lunghezza media delle features

#### 13) Determinare la lunghezza minima degli esoni e i trascritti che contengono un esone di lunghezza minima

a) Determinare la lunghezza minima degli esoni

b) Estrarre la lista dei trascritti che contengono un esone di lunghezza minima

#### 14) Contare quanti trascritti sono annotati per il gene `ARHGAP4`

#### 15) Estrarre la lista dei geni con strand `+`

#### 16) Estrarre il set degli esoni (distinti) del gene `ATP6AP1`

**NB**: il set deve essere composto da tuple (start, end).

#### 17) Contare il numero di trascritti del gene `ARHGAP4` che hanno una CDS annotata

#### 18) Estrarre lo strand del gene  `ATP6AP1`

#### 19) Estrarre per ogni trascritto del gene  `ATP6AP1` la lista delle tuple (start, end) dei suoi esoni

a) Estrarre il data frame delle sole righe relative alle features `exon` dei trascritti del gene `ATP6AP1` e delle sole colonne `start`, `end` e `transcript`.

b) Estrarre la lista dei trascritti

c) Estrarre per ogni trascritto la lista delle sue features (start, end)

d) Produrre in output per ogni trascritto la lista delle sue features (start, end)

#### 20) Contare per ogni gene quante sono le features annotate per ognuno dei tipi presenti nel GTF