This script quantifies the distance from 2D and 3D cultures wrt. healthy cartilage obtained from GSE114007.

In [1]:
import pandas

# user-defined variables

In [2]:
healthy_data_file = '/Users/adrian/research/akureyri/data/GSE114007_normal_normalized.counts.txt'
cultures_data_file = '/Users/adrian/research/akureyri/results/sleuth_pipeline/sleuth_scaled_reads_bygene.csv'

annotation_file = '/Users/adrian/research/akureyri/results/deseq2_pipeline/annotation.tsv'

results_file = '/Users/adrian/research/akureyri/results/distances/expression.tsv'

# read info

In [3]:
hdf = pandas.read_csv(healthy_data_file, sep='\t', index_col=0)
hdf.drop(['Average Normal', 'Max'], axis=1, inplace=True)

In [4]:
cdf = pandas.read_csv(cultures_data_file, sep='\t', index_col=0)

In [5]:
annotation = pandas.read_csv(annotation_file, sep='\t', index_col=0)

# analysis

In [6]:
healthy_symbols = hdf.index
print(healthy_symbols)

Index(['FN1', 'COMP', 'MALAT1', 'CHI3L2', 'CLU', 'DCN', 'PRELP', 'CILP',
       'CHI3L1', 'GPX3',
       ...
       'XAGE1E', 'XAGE2', 'XAGE2B', 'XGPY2', 'XKRY', 'XKRY2', 'ZBED1',
       'ZNF658B', 'ZNF705B', 'ZNF705D'],
      dtype='object', name='symbol', length=23710)


In [7]:
ensembl_ids = cdf.index
print(ensembl_ids)

Index(['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419',
       'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938',
       'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084',
       'ENSG00000001167',
       ...
       'ENSG00000291317', 'ENSG00000292430', 'ENSG00000293527',
       'ENSG00000293542', 'ENSG00000293543', 'ENSG00000293546',
       'ENSG00000293548', 'ENSG00000293550', 'ENSG00000293552',
       'ENSG00000293553'],
      dtype='object', length=39521)


In [8]:
annotation.head()

Unnamed: 0,ensembl_transcript_id,ensembl_gene_id,external_gene_name,gene_biotype,description
1,ENST00000387314,ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...
2,ENST00000389680,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...
3,ENST00000387342,ENSG00000210077,MT-TV,Mt_tRNA,mitochondrially encoded tRNA-Val (GUN) [Source...
4,ENST00000387347,ENSG00000210082,MT-RNR2,Mt_rRNA,mitochondrially encoded 16S rRNA [Source:HGNC ...
5,ENST00000386347,ENSG00000209082,MT-TL1,Mt_tRNA,mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...


## mapp ids

In [9]:
%%time
# this block takes circa 3 min

print(len(healthy_symbols))
rosetta = {}
for symbol in healthy_symbols:
    mini = annotation[annotation['external_gene_name'] == symbol]
    results = list(set(mini['ensembl_gene_id']))
    
    if len(results) == 1:
        rosetta[results[0]] = symbol
    if len(results) > 1:
        #print('WARNING: multiple ({}) mapping for {}: {}'.format(len(results), symbol, results[:3]))
        # check how many ensembl id I have in my expression df
        bi_results = []
        for result in results:
            if result in ensembl_ids:
                bi_results.append(result)
        if len(bi_results) == 1:
            #print('fixed multiple mapping')
            rosetta[results[0]] = symbol
        if len(bi_results) > 1:
            print('WARNING: multiple ({}) mapping for {}: {}. Dropping altogether.'.format(len(bi_results), 
                                                                                           symbol, bi_results[:3]))
print(len(rosetta))

23710
20420
CPU times: user 2min 59s, sys: 115 ms, total: 2min 59s
Wall time: 3min


## join dataframes

In [10]:
df = pandas.DataFrame.from_dict(rosetta, orient='index', columns=['symbol'])
print(df.shape)
df.head()

(20420, 1)


Unnamed: 0,symbol
ENSG00000115414,FN1
ENSG00000105664,COMP
ENSG00000251562,MALAT1
ENSG00000064886,CHI3L2
ENSG00000120885,CLU


In [11]:
new = df.join(hdf, on='symbol')
print(new.shape)
new.head()

(20420, 19)


Unnamed: 0,symbol,Normal_Cart_10_8,Normal_Cart_2_2,Normal_Cart_3_3,Normal_Cart_4_4,Normal_Cart_5_5,Normal_Cart_6_6,Normal_Cart_7_3,Normal_Cart_9_7,normal_01,normal_02,normal_03,normal_04,normal_05,normal_06,normal_07,normal_08,normal_09,normal_10
ENSG00000115414,FN1,16.277134,15.429753,15.428266,16.305868,14.635041,14.389434,15.410056,16.595074,15.680574,15.479558,15.706845,14.016121,15.731748,13.345441,15.401275,15.036921,14.951695,14.357657
ENSG00000105664,COMP,15.371944,14.51526,14.813281,14.776144,14.048698,12.793369,15.035222,16.146382,12.445524,11.82096,11.85461,11.714171,12.537045,10.31864,12.408678,12.273848,12.131808,10.739429
ENSG00000251562,MALAT1,15.441039,14.574888,15.053004,14.793931,14.773987,13.30675,14.967599,15.261263,15.153383,16.130006,15.126309,15.298698,15.049196,14.57865,15.621322,14.854685,14.887417,15.708208
ENSG00000064886,CHI3L2,7.645584,5.860772,6.055734,8.496841,6.743966,6.479437,4.659511,6.882385,9.678269,15.533499,7.524565,8.466126,9.348776,6.498835,10.958577,6.425965,5.50549,9.585698
ENSG00000120885,CLU,15.105566,14.493329,14.849689,14.704724,15.092099,15.483067,14.591833,15.392971,13.750018,13.279954,13.878884,12.809382,13.387182,12.428223,13.881811,13.734158,13.560484,13.62774


In [12]:
full = new.join(cdf)
print(full.shape)
full.head()

(20420, 31)


Unnamed: 0,symbol,Normal_Cart_10_8,Normal_Cart_2_2,Normal_Cart_3_3,Normal_Cart_4_4,Normal_Cart_5_5,Normal_Cart_6_6,Normal_Cart_7_3,Normal_Cart_9_7,normal_01,...,test03,test04,test05,test06,test07,test08,test09,test10,test11,test12
ENSG00000115414,FN1,16.277134,15.429753,15.428266,16.305868,14.635041,14.389434,15.410056,16.595074,15.680574,...,144555.857795,136202.567499,158394.53044,263460.790473,99233.799586,114716.922365,156588.519228,267493.400498,87149.863743,134312.309349
ENSG00000105664,COMP,15.371944,14.51526,14.813281,14.776144,14.048698,12.793369,15.035222,16.146382,12.445524,...,9.319409,2.31602,10.02553,0.969931,8.620417,7.731874,8.574195,4.977306,5.675533,4.031672
ENSG00000251562,MALAT1,15.441039,14.574888,15.053004,14.793931,14.773987,13.30675,14.967599,15.261263,15.153383,...,496.83973,636.863456,698.497986,995.558126,751.705658,682.57836,633.723345,932.498768,589.426906,901.068743
ENSG00000064886,CHI3L2,7.645584,5.860772,6.055734,8.496841,6.743966,6.479437,4.659511,6.882385,9.678269,...,3.994119,8.475188,0.507806,3.747833,6.877326,16.206062,6.55435,11.482193,0.444921,15.13407
ENSG00000120885,CLU,15.105566,14.493329,14.849689,14.704724,15.092099,15.483067,14.591833,15.392971,13.750018,...,3565.719528,5735.185955,5750.440648,8434.590269,3922.591202,9104.274777,2228.27247,4076.185218,1986.977753,3867.103432


In [13]:
print(hdf.loc['FN1', :])
print(cdf.loc['ENSG00000115414', :])
print()
print(df.loc['ENSG00000115414', :])
print(new.loc['ENSG00000115414', :])
print(full.loc['ENSG00000115414', :])
full.head()

Normal_Cart_10_8    16.277134
Normal_Cart_2_2     15.429753
Normal_Cart_3_3     15.428266
Normal_Cart_4_4     16.305868
Normal_Cart_5_5     14.635041
Normal_Cart_6_6     14.389434
Normal_Cart_7_3     15.410056
Normal_Cart_9_7     16.595074
normal_01           15.680574
normal_02           15.479558
normal_03           15.706845
normal_04           14.016121
normal_05           15.731748
normal_06           13.345441
normal_07           15.401275
normal_08           15.036921
normal_09           14.951695
normal_10           14.357657
Name: FN1, dtype: float64
test01    105736.078992
test02    378666.044597
test03    144555.857795
test04    136202.567499
test05    158394.530440
test06    263460.790473
test07     99233.799586
test08    114716.922365
test09    156588.519228
test10    267493.400498
test11     87149.863743
test12    134312.309349
Name: ENSG00000115414, dtype: float64

symbol    FN1
Name: ENSG00000115414, dtype: object
symbol                    FN1
Normal_Cart_10_8    16.277

Unnamed: 0,symbol,Normal_Cart_10_8,Normal_Cart_2_2,Normal_Cart_3_3,Normal_Cart_4_4,Normal_Cart_5_5,Normal_Cart_6_6,Normal_Cart_7_3,Normal_Cart_9_7,normal_01,...,test03,test04,test05,test06,test07,test08,test09,test10,test11,test12
ENSG00000115414,FN1,16.277134,15.429753,15.428266,16.305868,14.635041,14.389434,15.410056,16.595074,15.680574,...,144555.857795,136202.567499,158394.53044,263460.790473,99233.799586,114716.922365,156588.519228,267493.400498,87149.863743,134312.309349
ENSG00000105664,COMP,15.371944,14.51526,14.813281,14.776144,14.048698,12.793369,15.035222,16.146382,12.445524,...,9.319409,2.31602,10.02553,0.969931,8.620417,7.731874,8.574195,4.977306,5.675533,4.031672
ENSG00000251562,MALAT1,15.441039,14.574888,15.053004,14.793931,14.773987,13.30675,14.967599,15.261263,15.153383,...,496.83973,636.863456,698.497986,995.558126,751.705658,682.57836,633.723345,932.498768,589.426906,901.068743
ENSG00000064886,CHI3L2,7.645584,5.860772,6.055734,8.496841,6.743966,6.479437,4.659511,6.882385,9.678269,...,3.994119,8.475188,0.507806,3.747833,6.877326,16.206062,6.55435,11.482193,0.444921,15.13407
ENSG00000120885,CLU,15.105566,14.493329,14.849689,14.704724,15.092099,15.483067,14.591833,15.392971,13.750018,...,3565.719528,5735.185955,5750.440648,8434.590269,3922.591202,9104.274777,2228.27247,4076.185218,1986.977753,3867.103432


# store

In [14]:
full.to_csv(results_file, sep='\t')