In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

from ftplib import FTP

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-dark-palette')
%matplotlib inline

Below is code to automatically download a file from the RNAcentral FTP server. `ftplib` doesn't handle large files well, though, so I ended up downloading the files manually.

In [None]:
# HOSTNAME = 'ftp.ebi.ac.uk'

# ftp = FTP(HOSTNAME)

# ftp.login()

# ftp.cwd('pub/databases/RNAcentral/current_release/json')

# filename = 'ensembl-xref-1-100001.json'

# with open(filename, "w") as rna_json:
#     ftp.retrlines(f"RETR {filename}", rna_json.write)


Instead of automating the downloads, I manually grabbed a few of the JSON files from the FTP server; they can be read in with the following function.

In [None]:
def load_rna(fname, inc_other=True, merge_other=True, simple_descrips=False, drop=True):
    """Load one RNAcentral JSON export into a tidy DataFrame.

    Parameters
    ----------
    fname : str, path or file-like
        Passed straight to ``pandas.read_json``.
    inc_other : bool, default True
        If False, rows whose RNA type is 'other', 'misc RNA', 'ncRNA' or
        'lncRNA' are removed.
    merge_other : bool, default True
        If True, the labels 'misc RNA', 'lncRNA' and 'ncRNA' are all
        relabelled as 'other'.
    simple_descrips : bool, default False
        If True, each description is cut down to its first two
        space-separated words. Has no visible effect when ``drop`` is True,
        because the description column is removed anyway.
    drop : bool, default True
        If True, the 'Taxon ID', 'RNA ID' and 'Description' columns are
        removed from the result.

    Returns
    -------
    pandas.DataFrame
        The renamed (and optionally filtered / trimmed) records.

    Raises
    ------
    KeyError
        If the file lacks a column that is dropped or transformed here.
    """
    unclassified = ['misc RNA', 'lncRNA', 'ncRNA']

    rna = pd.read_json(fname)

    # drop extra columns which reference external databases -- always
    rna = rna.drop(['xrefs', 'md5'], axis=1)

    # make the column names pretty
    cols = {'rnacentral_id': 'RNA ID', 'description': 'Description', 'sequence': 'Sequence',
            'rna_type': 'RNA Type', 'taxon_id': 'Taxon ID'}
    rna = rna.rename(columns=cols)

    # remove underscores from RNA type (vectorised instead of a row-wise apply)
    rna['RNA Type'] = rna['RNA Type'].str.replace('_', ' ', regex=False)

    # combine the 'misc', 'nc' and 'lnc' types into 'other'.
    # Whole labels are compared, so a type that merely contains one of these
    # strings is never partially rewritten.
    if merge_other:
        rna['RNA Type'] = rna['RNA Type'].mask(rna['RNA Type'].isin(unclassified), 'other')

    # drop rows with unclassified RNAs?
    if not inc_other:
        rna = rna[~rna['RNA Type'].isin(['other'] + unclassified)]

    # Keep only the first two words of the description (e.g. the latin name).
    # Slicing with [:2] tolerates one-word descriptions, which previously
    # raised IndexError. Skipped when the column is about to be dropped.
    if simple_descrips and not drop:
        short = rna['Description'].str.split(' ').str[:2].str.join(' ')
        # assign() returns a new frame, avoiding a write into a filtered view
        rna = rna.assign(Description=short)

    # drop extra columns with unnecessary info -- optional
    if drop:
        rna = rna.drop(['Taxon ID', 'RNA ID', 'Description'], axis=1)

    return rna



In [None]:
# load a batch of data files from RNAcentral or load in a single file?
load_all = True

# All inputs live in ./data; building paths with pathlib keeps this portable
# (the old 'data\ensembl-...' literal only worked on Windows and relied on the
# invalid escape sequence '\e').
p = Path() / 'data'

if load_all:
    # sorted() makes the load order, and therefore the row order, reproducible
    files = sorted(p.glob('ensembl-*'))
    frames = []

    for file in files:
        try:
            print(f'Loading data from {str(file)}')
            frames.append(load_rna(str(file), inc_other=False, simple_descrips=True))
        except Exception as err:
            # Best effort: skip unreadable files, but say why. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt the loop.
            print(f'Could not load data from {str(file)}: {err!r}')

    if frames:
        # One concat at the end instead of re-copying the frame on every
        # iteration; ignore_index avoids duplicate row labels across files.
        rna = pd.concat(frames, ignore_index=True)
    else:
        # Keep the columns later cells rely on, even when nothing was loaded.
        rna = pd.DataFrame(columns=['Sequence', 'RNA Type'])
        print(f'Warning: no data files were loaded from {str(p)}')

    print('All data loaded.')

else:
    rna = load_rna(str(p / 'ensembl-xref-34100001-34200001.json'))


In [None]:
def contract_families(types):
    """Map each RNA type to a coarse class and to a family.

    Parameters
    ----------
    types : iterable of str
        RNA type labels (for example the 'RNA Type' column of a DataFrame).

    Returns
    -------
    (list, list)
        ``classification`` and ``family``, each with one entry per input
        type, in input order. A type that appears in no group is passed
        through unchanged in the corresponding list.
    """
    classes = {'cis-reg': ['IRES', 'leader', 'riboswitch'],
               'intron': ['autocatalytically spliced intron', 'intron gpI', 'intron gpII', 'intron'],
               'gene': ['tRNA', 'rRNA', 'snRNA', 'snoRNA', 'pre miRNA', 'hammerhead ribozyme', 'SRP RNA', 'sRNA',
                        'Y RNA', 'RNase P RNA', 'lncRNA', 'tmRNA', 'RNase MRP RNA', 'vault RNA', 'ribozyme',
                        'miRNA', 'antisense RNA', 'telomerase RNA', 'scRNA', 'precursor RNA', 'scaRNA', 'piRNA'],
               'artificial': ['guide RNA', 'siRNA']}

    # 'snRNA' is a one-element list (a bare string made the membership test a
    # substring search), and 'antisense' lists the 'antisense RNA' label that
    # `classes` uses, so that family can actually be matched.
    families = {'tRNA': ['tRNA', 'tmRNA'], 'rRNA': ['rRNA', 'SRP RNA', 'Y RNA'], 'snRNA': ['snRNA'],
                'snoRNA': ['snoRNA', 'scRNA', 'scaRNA'], 'miRNA': ['pre miRNA', 'miRNA', 'piRNA'],
                'precursor RNA': ['precursor RNA'],
                'ribozyme': ['hammerhead ribozyme', 'RNase P RNA', 'RNase MRP RNA', 'ribozyme',
                             'autocatalytically spliced intron'],
                'sRNA': ['sRNA'], 'antisense': ['antisense RNA', 'antisense'], 'vault RNA': ['vault RNA'],
                'telomerase RNA': ['telomerase RNA'], 'artificial': ['guide RNA', 'siRNA']}

    def _invert(groups):
        # member -> group label; setdefault keeps the first group that lists a
        # member, matching the original first-match-wins scan order.
        lookup = {}
        for label, members in groups.items():
            for member in members:
                lookup.setdefault(member, label)
        return lookup

    class_of = _invert(classes)
    family_of = _invert(families)

    # Materialise once so one-shot iterables can be read for both outputs.
    types = list(types)

    classification = [class_of.get(t, t) for t in types]
    family = [family_of.get(t, t) for t in types]

    return classification, family

In [None]:
# Derive the class and family label for every row from its RNA type, then
# attach them as two new columns and preview the result.
rna_classes, rna_families = contract_families(rna['RNA Type'])
rna['RNA Class'] = rna_classes
rna['RNA Family'] = rna_families
rna.head()

In [None]:
# Collapse rows that share a sequence into one row each, keeping the first
# RNA type encountered for that sequence, then report the before/after sizes.
by_sequence = rna.groupby("Sequence")
merge_sequence = by_sequence.agg({'RNA Type':'first'}).reset_index()
for frame in (rna, merge_sequence):
    print(len(frame))

In [None]:
def visual_comp_merged(original_seq, merged_seq, cat='RNA Type'):
    """Plot side-by-side log-scale bar counts of ``cat`` for two DataFrames.

    Parameters
    ----------
    original_seq : pandas.DataFrame
        Frame whose ``cat`` counts are drawn as the left bar of each pair.
    merged_seq : pandas.DataFrame
        Frame whose ``cat`` counts are drawn as the right bar of each pair.
    cat : str, default 'RNA Type'
        Name of the column to count; must exist in both frames.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    typcount = original_seq[cat].value_counts()
    mtypcount = merged_seq[cat].value_counts()

    # value_counts() orders each result by its own frequencies, so the two
    # series must be aligned on one shared category list before plotting;
    # otherwise bars can land under the wrong label (or the lengths differ
    # and bar() fails). Categories absent from one frame count as 0.
    categories = list(typcount.index)
    categories += [c for c in mtypcount.index if c not in typcount.index]
    typcount = typcount.reindex(categories, fill_value=0)
    mtypcount = mtypcount.reindex(categories, fill_value=0)

    xpos = np.arange(0, 2 * len(categories), 2)

    fig, ax = plt.subplots()
    # Negative width with align='edge' draws to the left of the tick and
    # positive width to the right, so each pair straddles its tick.
    ax.bar(xpos, typcount.values, width=-0.8, align='edge', log=True, label='original')
    ax.bar(xpos, mtypcount.values, width=0.8, align='edge', log=True, label='merged')

    ax.set_xticks(xpos)
    ax.set_xticklabels(categories, rotation=90)
    ax.set_xlabel(f'{cat}')
    ax.set_ylabel('Log count')
    ax.legend()

In [None]:
# Label the de-duplicated table with class/family, then draw the
# original-vs-merged comparison once per grouping level.
merged_classes, merged_families = contract_families(merge_sequence['RNA Type'])
merge_sequence['RNA Class'] = merged_classes
merge_sequence['RNA Family'] = merged_families

for category in ('RNA Type', 'RNA Class', 'RNA Family'):
    visual_comp_merged(rna, merge_sequence, category)