## Manipolazione di un file GTF (Gene Transfer Format) con `Pandas`

### Importare `Pandas` e leggere il file `GTF`

In [2]:
import pandas as pd

In [6]:
# Load the tab-separated GTF file; it has no header row, so columns get
# the default integer labels.
df = pd.read_table('./input.gtf', header = None)

### Cambiare i nomi delle colonne

I nomi delle colonne devono essere:
- reference
- source
- feature
- start
- end
- score
- strand
- frame
- attributes

In [7]:
# Map each positional column label (0-8) to its field name.
replace_dict = dict(enumerate(['reference', 'source', 'feature', 'start', 'end',
                               'score', 'strand', 'frame', 'attributes']))

# Rename the columns of the existing frame in place.
df.rename(columns = replace_dict, inplace = True)

In [8]:
# Display the frame to check the new column names.
df

Unnamed: 0,reference,source,feature,start,end,score,strand,frame,attributes
0,ENm006,VEGA_Known,exon,71783,71788,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
1,ENm006,VEGA_Known,CDS,71783,71788,.,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
2,ENm006,VEGA_Known,exon,70312,70440,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
3,ENm006,VEGA_Known,CDS,70312,70440,.,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
4,ENm006,VEGA_Known,exon,69989,70210,.,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
...,...,...,...,...,...,...,...,...,...
405,ENm006,VEGA_Known,CDS,56303,56327,.,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
406,ENm006,VEGA_Known,exon,56689,57573,.,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
407,ENm006,VEGA_Known,CDS,56689,57573,.,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
408,ENm006,VEGA_Known,exon,57680,58323,.,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."


### Eliminare le colonne `source` e `score`

In [9]:
# Remove the `source` and `score` columns from the frame in place.
df.drop(columns = ['source', 'score'], inplace = True)

In [10]:
# Display the frame to check that `source` and `score` are gone.
df

Unnamed: 0,reference,feature,start,end,strand,frame,attributes
0,ENm006,exon,71783,71788,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
1,ENm006,CDS,71783,71788,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
2,ENm006,exon,70312,70440,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
3,ENm006,CDS,70312,70440,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
4,ENm006,exon,69989,70210,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";"
...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
406,ENm006,exon,56689,57573,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
407,ENm006,CDS,56689,57573,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."
408,ENm006,exon,57680,58323,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";..."


### Sostituire la colonna degli attributi con le due colonne  `gene` e `transcript`

Le colonne `gene` e `transcript` dovranno contenere solo il nome del gene e l'ID del trascritto.

In [12]:
import re

In [13]:
# Pull the quoted value that follows `transcript_id` out of the attributes
# string. `.str.extract` takes the first match per row and yields NaN for a
# row without the attribute, instead of failing on `None.group(1)`.
df['transcript_id'] = df['attributes'].str.extract(r'transcript_id\s+"(.+?)";', expand = False)

In [15]:
# Pull the quoted value that follows `gene_id` out of the attributes string.
# `.str.extract` takes the first match per row and yields NaN for a row
# without the attribute, instead of failing on `None.group(1)`.
df['gene'] = df['attributes'].str.extract(r'gene_id\s+"(.+?)";', expand = False)

In [16]:
# Display the frame to check the extracted `transcript_id` and `gene` columns.
df

Unnamed: 0,reference,feature,start,end,strand,frame,attributes,transcript_id,gene
0,ENm006,exon,71783,71788,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",U52112.4-005,ARHGAP4
1,ENm006,CDS,71783,71788,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",U52112.4-005,ARHGAP4
2,ENm006,exon,70312,70440,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",U52112.4-005,ARHGAP4
3,ENm006,CDS,70312,70440,-,0,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",U52112.4-005,ARHGAP4
4,ENm006,exon,69989,70210,-,.,"transcript_id ""U52112.4-005""; gene_id ""ARHGAP4"";",U52112.4-005,ARHGAP4
...,...,...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,+,0,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",U52112.2-001,AVPR2
406,ENm006,exon,56689,57573,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",U52112.2-001,AVPR2
407,ENm006,CDS,56689,57573,+,2,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",U52112.2-001,AVPR2
408,ENm006,exon,57680,58323,+,.,"transcript_id ""U52112.2-001""; gene_id ""AVPR2"";...",U52112.2-001,AVPR2


In [17]:
# Remove the raw `attributes` column from the frame in place.
df.drop(columns = 'attributes', inplace = True)

In [18]:
# Display the frame to check that `attributes` is gone.
df

Unnamed: 0,reference,feature,start,end,strand,frame,transcript_id,gene
0,ENm006,exon,71783,71788,-,.,U52112.4-005,ARHGAP4
1,ENm006,CDS,71783,71788,-,0,U52112.4-005,ARHGAP4
2,ENm006,exon,70312,70440,-,.,U52112.4-005,ARHGAP4
3,ENm006,CDS,70312,70440,-,0,U52112.4-005,ARHGAP4
4,ENm006,exon,69989,70210,-,.,U52112.4-005,ARHGAP4
...,...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,+,0,U52112.2-001,AVPR2
406,ENm006,exon,56689,57573,+,.,U52112.2-001,AVPR2
407,ENm006,CDS,56689,57573,+,2,U52112.2-001,AVPR2
408,ENm006,exon,57680,58323,+,.,U52112.2-001,AVPR2


### Aggiungere la colonna `length` contenente la lunghezza della *feature*

In [19]:
# Feature length counts both endpoints, hence end - start + 1.
df['length'] = df['end'].sub(df['start']).add(1)

In [20]:
# Display the frame to check the new `length` column.
df

Unnamed: 0,reference,feature,start,end,strand,frame,transcript_id,gene,length
0,ENm006,exon,71783,71788,-,.,U52112.4-005,ARHGAP4,6
1,ENm006,CDS,71783,71788,-,0,U52112.4-005,ARHGAP4,6
2,ENm006,exon,70312,70440,-,.,U52112.4-005,ARHGAP4,129
3,ENm006,CDS,70312,70440,-,0,U52112.4-005,ARHGAP4,129
4,ENm006,exon,69989,70210,-,.,U52112.4-005,ARHGAP4,222
...,...,...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,+,0,U52112.2-001,AVPR2,25
406,ENm006,exon,56689,57573,+,.,U52112.2-001,AVPR2,885
407,ENm006,CDS,56689,57573,+,2,U52112.2-001,AVPR2,885
408,ENm006,exon,57680,58323,+,.,U52112.2-001,AVPR2,644


In [21]:
# Reorder the columns so that `length` follows `end` and `gene` precedes
# `transcript_id`.
df = df.reindex(
    ['reference', 'feature', 'start', 'end', 'length',
     'strand', 'frame', 'gene', 'transcript_id'],
    axis = 'columns',
)

In [22]:
# Display the frame to check the new column order.
df

Unnamed: 0,reference,feature,start,end,length,strand,frame,gene,transcript_id
0,ENm006,exon,71783,71788,6,-,.,ARHGAP4,U52112.4-005
1,ENm006,CDS,71783,71788,6,-,0,ARHGAP4,U52112.4-005
2,ENm006,exon,70312,70440,129,-,.,ARHGAP4,U52112.4-005
3,ENm006,CDS,70312,70440,129,-,0,ARHGAP4,U52112.4-005
4,ENm006,exon,69989,70210,222,-,.,ARHGAP4,U52112.4-005
...,...,...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,25,+,0,AVPR2,U52112.2-001
406,ENm006,exon,56689,57573,885,+,.,AVPR2,U52112.2-001
407,ENm006,CDS,56689,57573,885,+,2,AVPR2,U52112.2-001
408,ENm006,exon,57680,58323,644,+,.,AVPR2,U52112.2-001


### Rimuovere tutte le *features* di lunghezza ≤ 6

In [24]:
# Keep only the rows whose length is strictly greater than 6.
df = df.loc[df['length'].gt(6)]

In [25]:
# Display the frame to check that features of length <= 6 were removed.
df

Unnamed: 0,reference,feature,start,end,length,strand,frame,gene,transcript_id
2,ENm006,exon,70312,70440,129,-,.,ARHGAP4,U52112.4-005
3,ENm006,CDS,70312,70440,129,-,0,ARHGAP4,U52112.4-005
4,ENm006,exon,69989,70210,222,-,.,ARHGAP4,U52112.4-005
5,ENm006,CDS,69989,70210,222,-,0,ARHGAP4,U52112.4-005
6,ENm006,exon,64935,65036,102,-,.,ARHGAP4,U52112.4-005
...,...,...,...,...,...,...,...,...,...
405,ENm006,CDS,56303,56327,25,+,0,AVPR2,U52112.2-001
406,ENm006,exon,56689,57573,885,+,.,AVPR2,U52112.2-001
407,ENm006,CDS,56689,57573,885,+,2,AVPR2,U52112.2-001
408,ENm006,exon,57680,58323,644,+,.,AVPR2,U52112.2-001


### Estrarre il *data frame* dei 20 esoni più lunghi e degli ultimi 20 esoni localizzati sulla *reference sequence*

In [27]:
# Restrict the frame to rows whose feature is exactly 'exon'.
exon_df = df.loc[df['feature'].eq('exon')]

In [29]:
# Sort exons by decreasing length and keep the first 20 rows.
first_df = exon_df.sort_values(by = 'length', ascending = False).iloc[:20]

In [32]:
# Sort exons by increasing `end` coordinate and keep the last 20 rows.
second_df = exon_df.sort_values(by = 'end').iloc[-20:]

In [33]:
# Outer-join the two selections on all their shared columns, so rows from
# either frame are kept.
first_df.merge(second_df, how = 'outer')

Unnamed: 0,reference,feature,start,end,length,strand,frame,gene,transcript_id
0,ENm006,exon,56689,57573,885,+,.,AVPR2,U52112.2-001
1,ENm006,exon,56689,57938,1250,+,.,AVPR2,U52112.2-003
2,ENm006,exon,57176,57573,398,+,.,AVPR2,U52112.2-002
3,ENm006,exon,57680,58322,643,+,.,AVPR2,U52112.2-002
4,ENm006,exon,57680,58323,644,+,.,AVPR2,U52112.2-001
5,ENm006,exon,58524,59119,596,-,.,ARHGAP4,U52112.4-024
6,ENm006,exon,58533,59119,587,-,.,ARHGAP4,U52112.4-001
7,ENm006,exon,58534,59119,586,-,.,ARHGAP4,U52112.4-002
8,ENm006,exon,58534,59119,586,-,.,ARHGAP4,U52112.4-003
9,ENm006,exon,58572,59119,548,-,.,ARHGAP4,U52112.4-016


### Estrarre i nomi dei geni e gli identificatori dei trascritti annotati nel file GTF

### Determinare, per ogni gene, la lunghezza media, massima e minima degli esoni

### Determinare la lunghezza minima degli esoni ed estrarre tutti i trascritti che contengono un esone di lunghezza minima

a) Determinare la lunghezza minima degli esoni

b) Estrarre la lista dei trascritti che contengono un esone di lunghezza minima

### Contare quanti trascritti sono annotati per il gene `ARHGAP4`

### Estrarre la lista dei geni con strand `+`

### Estrarre gli esoni (distinti) del gene `ATP6AP1` in una lista di tuple (start, end)

### Contare il numero di trascritti del gene `ARHGAP4` che hanno una CDS annotata

### Estrarre lo strand del gene  `ATP6AP1`

### Determinare il trascritto che ha più esoni

### Estrarre per ogni trascritto del gene  `ARHGAP4` la lista delle tuple (start, end) dei suoi esoni ordinate per start crescente

### Determinare, per ogni gene e ogni trascritto, il numero di esoni che lo compongono

### Determinare, per ogni gene, la lunghezza del suo *locus*