## Manipolazione di un file GTF (Gene Transfer Format) attraverso la libreria `Pandas`

#### 1) Importare `Pandas`

In [30]:
import pandas as pd

#### 2) Leggere il file GTF

    df = pd.read_csv(gtf_file_name, sep='\t', header = None)

In [31]:
df = pd.read_csv('./input.gtf', sep='\t', header = None)
#df

**NB**: `read_csv()` ha un attributo `names` che permette di specificare la lista dei nomi delle colonne del data frame restutuito.

#### 3) Cambiare i nomi delle colonne

I nomi delle colonne devono essere:
- reference
- source
- feature
- start
- end
- score
- strand
- frame
- attributes

In [32]:
replace_dict = {0: 'reference', 1: 'source', 2: 'feature', 3: 'start', 4: 'end', 5: 'score', 6: 'strand', 7: 'frame', 8: 'attributes'}
df.rename(columns = replace_dict, inplace = True)
#df

#### 4) Eliminare le colonne `source` e `score`

In [33]:
df.drop(['source', 'score'], axis = 1, inplace = True)
#df

#### 5) Sostituire la colonna degli attributi con le due colonne  `transcript` e `gene`

La colonne `transcript` e `gene` dovranno contenere solo l'ID del trascritto e del gene.

In [36]:
import re

In [37]:
df['gene'] = ''
df['transcript'] = ''

In [38]:
for (index, record) in df.iterrows():
    transcript = re.search('transcript_id\s+(.+?);', record['attributes']).group(1).replace('"', '')
    gene = re.search('gene_id\s+(.+?);', record['attributes']).group(1).replace('"', '')
    df.at[index, 'gene'] =  gene
    df.at[index, 'transcript'] =  transcript

df.drop(['attributes'], axis=1, inplace = True)

df

Unnamed: 0,reference,feature,start,end,strand,frame,gene,transcript
0,ENm006,exon,71783,71788,-,.,ARHGAP4,U52112.4-005
1,ENm006,CDS,71783,71788,-,0,ARHGAP4,U52112.4-005
2,ENm006,exon,70312,70440,-,.,ARHGAP4,U52112.4-005
3,ENm006,CDS,70312,70440,-,0,ARHGAP4,U52112.4-005
4,ENm006,exon,69989,70210,-,.,ARHGAP4,U52112.4-005
5,ENm006,CDS,69989,70210,-,0,ARHGAP4,U52112.4-005
6,ENm006,exon,64935,65036,-,.,ARHGAP4,U52112.4-005
7,ENm006,CDS,64935,65036,-,0,ARHGAP4,U52112.4-005
8,ENm006,exon,64566,64673,-,.,ARHGAP4,U52112.4-005
9,ENm006,CDS,64566,64673,-,0,ARHGAP4,U52112.4-005


#### Aggiungo dopo `end` la colonna `length` con la lunghezza della feature

In [8]:
df['length'] = df['end'] - df['start'] + 1
columns_reindex = ['reference', 'source', 'feature', 'start', 'end', 'length', 'strand', 'gene', 'transcript']
df = df.reindex(columns = columns_reindex)
df

Unnamed: 0,reference,source,feature,start,end,length,strand,gene,transcript
0,ENm006,VEGA_Known,exon,71783,71788,6,-,ARHGAP4,U52112.4-005
1,ENm006,VEGA_Known,CDS,71783,71788,6,-,ARHGAP4,U52112.4-005
2,ENm006,VEGA_Known,exon,70312,70440,129,-,ARHGAP4,U52112.4-005
3,ENm006,VEGA_Known,CDS,70312,70440,129,-,ARHGAP4,U52112.4-005
4,ENm006,VEGA_Known,exon,69989,70210,222,-,ARHGAP4,U52112.4-005
5,ENm006,VEGA_Known,CDS,69989,70210,222,-,ARHGAP4,U52112.4-005
6,ENm006,VEGA_Known,exon,64935,65036,102,-,ARHGAP4,U52112.4-005
7,ENm006,VEGA_Known,CDS,64935,65036,102,-,ARHGAP4,U52112.4-005
8,ENm006,VEGA_Known,exon,64566,64673,108,-,ARHGAP4,U52112.4-005
9,ENm006,VEGA_Known,CDS,64566,64673,108,-,ARHGAP4,U52112.4-005


#### Rimuovo tutte le feature di al più 6 basi

In [9]:
df.drop(df[df.length <= 6].index, inplace = True)
df

Unnamed: 0,reference,source,feature,start,end,length,strand,gene,transcript
2,ENm006,VEGA_Known,exon,70312,70440,129,-,ARHGAP4,U52112.4-005
3,ENm006,VEGA_Known,CDS,70312,70440,129,-,ARHGAP4,U52112.4-005
4,ENm006,VEGA_Known,exon,69989,70210,222,-,ARHGAP4,U52112.4-005
5,ENm006,VEGA_Known,CDS,69989,70210,222,-,ARHGAP4,U52112.4-005
6,ENm006,VEGA_Known,exon,64935,65036,102,-,ARHGAP4,U52112.4-005
7,ENm006,VEGA_Known,CDS,64935,65036,102,-,ARHGAP4,U52112.4-005
8,ENm006,VEGA_Known,exon,64566,64673,108,-,ARHGAP4,U52112.4-005
9,ENm006,VEGA_Known,CDS,64566,64673,108,-,ARHGAP4,U52112.4-005
10,ENm006,VEGA_Known,exon,64385,64459,75,-,ARHGAP4,U52112.4-005
11,ENm006,VEGA_Known,CDS,64385,64459,75,-,ARHGAP4,U52112.4-005


#### Ordino il data frame per coordinate crescenti delle features

In [10]:
df.sort_values('start', inplace = True)
df

Unnamed: 0,reference,source,feature,start,end,length,strand,gene,transcript
399,ENm006,VEGA_Known,exon,53688,54049,362,+,AVPR2,U52112.2-002
403,ENm006,VEGA_Known,exon,55892,55928,37,+,AVPR2,U52112.2-001
404,ENm006,VEGA_Known,exon,56131,56327,197,+,AVPR2,U52112.2-001
400,ENm006,VEGA_Known,exon,56131,56327,197,+,AVPR2,U52112.2-002
395,ENm006,VEGA_Known,exon,56271,56327,57,+,AVPR2,U52112.2-003
405,ENm006,VEGA_Known,CDS,56303,56327,25,+,AVPR2,U52112.2-001
396,ENm006,VEGA_Known,CDS,56303,56327,25,+,AVPR2,U52112.2-003
407,ENm006,VEGA_Known,CDS,56689,57573,885,+,AVPR2,U52112.2-001
406,ENm006,VEGA_Known,exon,56689,57573,885,+,AVPR2,U52112.2-001
398,ENm006,VEGA_Known,CDS,56689,57593,905,+,AVPR2,U52112.2-003


#### Ordino il data frame per lunghezza decrescente della feature

In [11]:
df.sort_values('length', ascending = False, inplace = True)
df

Unnamed: 0,reference,source,feature,start,end,length,strand,gene,transcript
366,ENm006,VEGA_Known,exon,543097,545706,2610,+,ATP6AP1,XX-FW83563B9.4-002
397,ENm006,VEGA_Known,exon,56689,57938,1250,+,AVPR2,U52112.2-003
398,ENm006,VEGA_Known,CDS,56689,57593,905,+,AVPR2,U52112.2-003
407,ENm006,VEGA_Known,CDS,56689,57573,885,+,AVPR2,U52112.2-001
406,ENm006,VEGA_Known,exon,56689,57573,885,+,AVPR2,U52112.2-001
408,ENm006,VEGA_Known,exon,57680,58323,644,+,AVPR2,U52112.2-001
402,ENm006,VEGA_Known,exon,57680,58322,643,+,AVPR2,U52112.2-002
297,ENm006,VEGA_Known,exon,58524,59119,596,-,ARHGAP4,U52112.4-024
259,ENm006,VEGA_Known,exon,58533,59119,587,-,ARHGAP4,U52112.4-001
112,ENm006,VEGA_Known,exon,58534,59119,586,-,ARHGAP4,U52112.4-002


#### Determino i geni presenti

In [25]:
list(df['gene'].unique())

['ATP6AP1', 'AVPR2', 'ARHGAP4']

#### Determino i trascritti totali presenti

In [27]:
list(df['transcript'].unique())

['XX-FW83563B9.4-002',
 'U52112.2-003',
 'U52112.2-001',
 'U52112.2-002',
 'U52112.4-024',
 'U52112.4-001',
 'U52112.4-002',
 'U52112.4-003',
 'U52112.4-016',
 'U52112.4-006',
 'U52112.4-013',
 'U52112.4-012',
 'U52112.4-011',
 'U52112.4-007',
 'U52112.4-014',
 'U52112.4-023',
 'U52112.4-021',
 'U52112.4-022',
 'XX-FW83563B9.4-004',
 'XX-FW83563B9.4-006',
 'XX-FW83563B9.4-003',
 'XX-FW83563B9.4-001',
 'U52112.4-008',
 'U52112.4-010',
 'U52112.4-005',
 'U52112.4-015',
 'U52112.4-004',
 'U52112.4-018',
 'U52112.4-017',
 'U52112.4-019',
 'U52112.4-009',
 'U52112.4-020']

#### Calcolo la lunghezza media delle features

In [24]:
df['length'].mean()

159.84803921568627

#### Determino l'esone più corto e i trascritti che lo contengono

a) Determino la lunghezza minina degli esoni

In [14]:
feature = 'exon'

In [15]:
min_length = min(df[df.feature == feature]['length'])

In [16]:
df[(df.length == min_length) & (df.feature == feature)]

Unnamed: 0,reference,source,feature,start,end,length,strand,gene,transcript
313,ENm006,VEGA_Known,exon,60665,60692,28,-,ARHGAP4,U52112.4-008
339,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-011
237,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-001
173,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-003
12,ENm006,VEGA_Known,exon,79484,79511,28,-,ARHGAP4,U52112.4-018
27,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-014
203,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-012
82,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-013
104,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-002
275,ENm006,VEGA_Known,exon,64181,64208,28,-,ARHGAP4,U52112.4-024


#### Conto i trascritti per il gene `ARHGAP4`

In [29]:
gene = 'ARHGAP4'

len(set(df[df['gene'] == gene]['transcript']))
len(df[df['gene'] == gene]['transcript'].unique())

24

#### Estraggo i geni con strand `+`

In [32]:
list(df[df['strand'] == '+']['gene'].unique())

['ATP6AP1', 'AVPR2']

#### Produco la lista di tutti gli esoni (distinti) del gene `ATP6AP1`

In [35]:
gene = 'ATP6AP1'

set(list(zip(df[df['gene'] == gene]['start'], df[df['gene'] == gene]['end'])))

{(542687, 542902),
 (542694, 542902),
 (542747, 542902),
 (542790, 542902),
 (542894, 543223),
 (543097, 543223),
 (543097, 545706),
 (545879, 545953),
 (546315, 546425),
 (546315, 546508),
 (546980, 547020),
 (547684, 547769),
 (547709, 547769),
 (548257, 548495)}

#### Conto il numero di trascritti del gene `ATP6AP1` che hanno una CDS annotata

In [20]:
gene = 'ATP6AP1'

df_temp = df[df['gene'] == gene][['feature', 'transcript']]
len(set(df_temp[df_temp['feature'] == 'CDS']['transcript']))

0

#### Ricostruisco i trascritti per il gene `ATP6AP1`

In [21]:
gene = 'ATP6AP1'
feature = 'exon'

a) Ottengo lo strand

In [22]:
strand = list(set(df[df['gene'] == gene]['strand']))[0]
strand

'+'

b) Ricostruisco i trascritti

In [23]:
df_temp = df[df['gene'] == gene]
df_temp = df_temp[df_temp['feature'] == feature]
df_temp= df_temp[['start', 'end', 'transcript']]

transcript_set = set(df_temp['transcript'])

for transcript in transcript_set:
    start_column = df_temp.loc[df_temp['transcript'] == transcript]['start']
    end_column = df_temp.loc[df_temp['transcript'] == transcript]['start']
    ft_list = list(zip(start_column, end_column))
    #... chiamo la funzione dell'esercizio precedente

Usare df['attribute'].str.contains()
Usare df['attribute'].values.contains()