In [1]:
import os

import pandas as pd

In [3]:
# Silence pandas' chained-assignment (SettingWithCopy) check for this notebook.
pd.set_option('mode.chained_assignment', None)

---------------------------

In [4]:
# Directory holding the annotation inputs and receiving the output table.
# The RNADEG_ANNOT_DIR environment variable overrides the default location so
# the notebook can run on machines where this absolute path does not exist.
annot_dir = os.environ.get('RNADEG_ANNOT_DIR', '/data/parastou/RNAdeg/annotation/')

In [5]:
# Input files, both resolved relative to the annotation directory.
gff_name = 'schizosaccharomyces_pombe.chr.extended.gff3'
lengths_name = 'schizosaccharomyces_pombe.gene_lengths.V2.csv'
in_gff = os.path.join(annot_dir, gff_name)
in_lengths = os.path.join(annot_dir, lengths_name)

In [6]:
def gff_to_gdf(in_gff):
    """Build a per-gene table from a GFF3 annotation file.

    The file is read as nine tab-separated columns without a header row.
    Only rows whose feature type contains the substring 'gene' are kept, and
    the gene id, gene name and biotype are parsed out of the attribute
    ('info') column.

    Parameters
    ----------
    in_gff : str or file-like
        Path (or open text handle) of the GFF3 file. Text following a '#'
        is treated as a comment by the reader.

    Returns
    -------
    pandas.DataFrame
        One row per kept feature with the columns 'gene-id', 'gene-name',
        'chr', 'type', 'start', 'end' and 'bio_type'.

    Raises
    ------
    IndexError
        If a kept row lacks the '=gene:', ';Name=' or ';biotype=' attribute.
    """
    columns = ['chr','1','type','start','end','2','3','4','info']

    def attribute(info, key):
        # Text between `key` and the next ';', stripped of stray quotes,
        # commas, semicolons and spaces.
        return info.split(key)[1].split(';')[0].strip(' ";,')

    df = pd.read_csv(in_gff, sep='\t', comment='#', names=columns)
    df = df[['chr', 'type', 'start', 'end', 'info']]

    # na=False: a row with a missing feature type is dropped instead of
    # making the boolean mask invalid.
    # .copy(): the new columns below are written to an independent frame, so
    # the function does not rely on the chained-assignment check being
    # disabled globally.
    df = df[df['type'].str.contains('gene', na=False)].copy()

    df['gene-id'] = df['info'].apply(lambda info: attribute(info, '=gene:'))
    df['gene-name'] = df['info'].apply(lambda info: attribute(info, ';Name='))
    df['bio_type'] = df['info'].apply(lambda info: attribute(info, ';biotype='))

    df = df[['gene-id', 'gene-name', 'chr','type','start','end', 'bio_type']]

    return df

-----------------------

### Create gene-specification table from .gff file 

In [7]:
# Parse the annotation file into the per-gene table used by the cells below.
df = gff_to_gdf(in_gff=in_gff)

In [8]:
# Gene lengths table (tab-separated); only the id and length columns are kept.
l_df = pd.read_csv(in_lengths, sep='\t').loc[:, ['gene-id', 'length']]

### Add length information

In [9]:
# Attach the length of every gene by its id.
# NOTE(review): with an outer join, ids present only in the lengths table
# would appear as rows without coordinates or gene name - confirm that the
# lengths table never contains such ids.
df = pd.merge(df, l_df, on=['gene-id'], how='outer')

In [10]:
import math

In [11]:
# Genes without an entry in the lengths table fall back to the span of their
# annotated coordinates. A single vectorised fillna replaces the row-by-row
# loop and gives the same values.
# NOTE(review): GFF coordinates are 1-based and inclusive, so the exact
# feature length would be end - start + 1. The original end - start is kept
# here - confirm which convention the lengths table follows.
df['length'] = df['length'].fillna(df['end'] - df['start'])

### Add category column

In [12]:
# Repeat annotation: read with the same nine column names as the GFF table,
# with no header row in the file.
columns = ['chr','1','type','start','end','2','3','4','info']
repeats_path = os.path.join(annot_dir, 'repeats_subtelI_dg_dh_V2')
rep_df = pd.read_csv(repeats_path, sep='\t', names=columns)

# The repeat name is the text between ';Name=' and the following ';'.
rep_info = rep_df['info']
rep_df['gene-name'] = rep_info.apply(
    lambda info: info.split(';Name=')[1].split(';')[0].strip(' ";,')
)

In [13]:
# Names of all annotated repeats, used by category() for the 'repeat' label.
repeat_names = rep_df['gene-name'].tolist()

In [14]:
def category(gene):
    """Classify a gene name as 'ribosomal', 'repeat' or 'gene'.

    A name containing 'rpl' or 'rps' is 'ribosomal'; otherwise a name listed
    in the module-level ``repeat_names`` is 'repeat'; anything else is 'gene'.
    """
    if any(tag in gene for tag in ('rpl', 'rps')):
        return 'ribosomal'

    if gene in repeat_names:
        return 'repeat'

    return 'gene'

In [15]:
# Label every row from its gene name.
df['category'] = df['gene-name'].apply(category)

In [None]:
# Remove rows that are identical in every column, keeping the first of each.
df = df.drop_duplicates(keep='first')

### Remove redundant rows (genes with identical coordinates)

In [None]:
# Genes annotated at exactly the same locus are treated as redundant: the
# first entry of each locus is kept and the names of all others are collected.
# The chromosome is part of the key so that genes on different chromosomes
# which merely share start/end values are not mistaken for duplicates.
## IMPORTANT: manually check the duplicates
duplicates = []
for locus, group in df.groupby(['chr', 'start', 'end']):
    names = list(group['gene-name'])
    # Everything after the first entry (not only the second) is redundant.
    duplicates.extend(names[1:])

In [29]:
# Keep only the rows whose gene name was not collected as a duplicate.
is_duplicate = df['gene-name'].isin(duplicates)
df = df[~is_duplicate]

###  Save PombeGene table

In [32]:
# Written tab-separated and without the index column.
out_table = os.path.join(annot_dir, 'schizosaccharomyces_pombe.chr.extended.csv')
df.to_csv(out_table, sep='\t', index=False)