This script quantifies the distance from 2D and 3D cultures wrt. healthy cartilage obtained from GSE114007.

In [1]:
import pandas

# user-defined variables

In [2]:
healthy_data_file = '/Users/adrian/research/akureyri/data/transcriptomics/GSE114007_normal_normalized.counts.txt'
cultures_data_file = '/Users/adrian/research/akureyri/results/sleuth_pipeline/sleuth_scaled_reads_bygene.csv'
annotation_file = '/Users/adrian/research/akureyri/results/deseq2_pipeline/annotation.tsv'
results_file = '/Users/adrian/research/akureyri/results/distances/compiled_original_expression.tsv'

# read info

In [3]:
hdf = pandas.read_csv(healthy_data_file, sep='\t', index_col=0)
hdf.drop(['Average Normal', 'Max'], axis=1, inplace=True)

In [4]:
cdf = pandas.read_csv(cultures_data_file, sep='\t', index_col=0)

In [5]:
annotation = pandas.read_csv(annotation_file, sep='\t', index_col=0)

# analysis

In [6]:
healthy_symbols = hdf.index
print(healthy_symbols)

Index(['FN1', 'COMP', 'MALAT1', 'CHI3L2', 'CLU', 'DCN', 'PRELP', 'CILP',
       'CHI3L1', 'GPX3',
       ...
       'XAGE1E', 'XAGE2', 'XAGE2B', 'XGPY2', 'XKRY', 'XKRY2', 'ZBED1',
       'ZNF658B', 'ZNF705B', 'ZNF705D'],
      dtype='object', name='symbol', length=23710)


In [7]:
ensembl_ids = cdf.index
print(ensembl_ids)

Index(['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419',
       'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938',
       'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084',
       'ENSG00000001167',
       ...
       'ENSG00000292430', 'ENSG00000293527', 'ENSG00000293542',
       'ENSG00000293543', 'ENSG00000293546', 'ENSG00000293548',
       'ENSG00000293550', 'ENSG00000293553', 'ENSG00000293563',
       'ENSG00000293597'],
      dtype='object', length=39400)


In [8]:
annotation.head()

Unnamed: 0,ensembl_transcript_id,ensembl_gene_id,external_gene_name,gene_biotype,description,geneLength
1,ENST00000387314,ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...,70
2,ENST00000389680,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,953
3,ENST00000387342,ENSG00000210077,MT-TV,Mt_tRNA,mitochondrially encoded tRNA-Val (GUN) [Source...,68
4,ENST00000387347,ENSG00000210082,MT-RNR2,Mt_rRNA,mitochondrially encoded 16S rRNA [Source:HGNC ...,1558
5,ENST00000386347,ENSG00000209082,MT-TL1,Mt_tRNA,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...,74


In [9]:
annotation.shape

(277081, 6)

## mapp ids

In [10]:
%%time
# this block takes circa 3 min

print(len(healthy_symbols))
rosetta = {}
for symbol in healthy_symbols:
    mini = annotation[annotation['external_gene_name'] == symbol]
    results = list(set(mini['ensembl_gene_id']))
    
    if len(results) == 1:
        rosetta[results[0]] = symbol
    if len(results) > 1:
        #print('WARNING: multiple ({}) mapping for {}: {}'.format(len(results), symbol, results[:3]))
        # check how many ensembl id I have in my expression df
        bi_results = []
        for result in results:
            if result in ensembl_ids:
                bi_results.append(result)
        if len(bi_results) == 1:
            rosetta[bi_results[0]] = symbol
        if len(bi_results) > 1:
            print('WARNING: multiple ({}) mapping for {}: {}. Dropping altogether.'.format(len(bi_results), symbol, bi_results[:3]))
        
print(len(rosetta))

23710
20419
CPU times: user 3min 4s, sys: 82.2 ms, total: 3min 5s
Wall time: 3min 6s


In [12]:
for element in rosetta.keys():
    #print(element)
    retrieved_length_values = list(set(annotation[annotation['ensembl_gene_id'] == element]['geneLength']))
    if len(retrieved_length_values) != 1:
        print('we have a problem')
    rosetta[]

In [None]:
annotation['ensembl_gene_id']


In [None]:
kjkh

## join dataframes

In [None]:
df = pandas.DataFrame.from_dict(rosetta, orient='index', columns=['symbol'])
print(df.shape)
df.head()

In [None]:
[df.index]

In [None]:
asd

In [None]:
new = df.join(hdf, on='symbol')
print(new.shape)
new.head()

In [None]:
full = new.join(cdf)
print(full.shape)
full.head()

In [None]:
print(hdf.loc['FN1', :])
print(cdf.loc['ENSG00000115414', :])
print()
print(df.loc['ENSG00000115414', :])
print(new.loc['ENSG00000115414', :])
print(full.loc['ENSG00000115414', :])
full.head()

# store

In [None]:
full.to_csv(results_file, sep='\t')