In [2]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn.decomposition import PCA
from scipy.stats import spearmanr as scor

## Dimension reduction and clustering of the CTRP-L1000 dataset

In [3]:
# Read the two comma-separated CTRP tables; in both files the first row is
# the header and the first column becomes the row index.
# (presumably sig_info holds per-signature metadata and signatures holds the
# expression values -- confirm against the files)
sig_info = pd.read_csv(
    '../results/CTRP/sig_info_merged_lm.csv',
    header=0,
    index_col=[0],
)
signatures = pd.read_csv(
    '../results/CTRP/signatures_merged_lm.csv',
    header=0,
    index_col=[0],
)

Let's first do a PCA and color the points by the matched cell viability.

In [4]:
# Fit a two-component PCA (fixed random_state) on the signatures and keep the
# projected coordinates, one row per signature.
model = PCA(n_components=2, random_state=19890904)
signatures_pca = pd.DataFrame(
    model.fit_transform(signatures),
    index=signatures.index,
)

# Explained variance ratio of each component, rounded to 3 decimals and
# converted to text so it can be embedded in the column labels.
evr1, evr2 = model.explained_variance_ratio_.round(3).astype(str)
signatures_pca = signatures_pca.rename(columns={
    0: 'PC1\nexplained variance ratio: ' + evr1,
    1: 'PC2\nexplained variance ratio: ' + evr2,
})

# Persist the 2-D coordinates as a comma-separated file.
signatures_pca.to_csv('../results/model/dimension_reduction/pca.csv')

In [74]:
def make_pca_plot(sig_data,sig_info,color_by='viability',fname=None):
    """Scatter-plot the signatures in PC1/PC2 space, colored by `color_by`.

    The coordinates are obtained by projecting the module-level
    ``signatures`` frame with the already fitted module-level PCA ``model``.

    Parameters
    ----------
    sig_data
        Not read by the code shown here.
        NOTE(review): the projection uses the global ``signatures`` instead;
        confirm whether `sig_data` was meant to be used.
    sig_info
        Table with a 'cpd_avg_pv' column, used for the point colors and for
        the Spearman correlations against PC1 and PC2.
    color_by
        One of {'viability','cell_id','pert_id','pert_itime','bin_viability'}.
        Only 'viability' is handled by the code shown here.
    fname
        Intended: if given, save the plot as a pdf to ../figures/.
        NOTE(review): no saving is performed by the code shown here.
    """
    exp_var1,exp_var2=model.explained_variance_ratio_
    signatures_pca=pd.DataFrame(model.transform(signatures),index=signatures.index,
                                columns=['PC1','PC2'])
    # An unfinished `sig_info[]` statement used to sit here; it was a syntax
    # error that stopped the whole cell from parsing and has been removed.
    # NOTE(review): presumably it was meant to align sig_info's rows with the
    # signatures -- the code below relies on both sharing the same row order.
    f, ax = plt.subplots()
    if color_by=='viability':
        viability = sig_info['cpd_avg_pv']
        cmap = sns.cubehelix_palette(8, start=.5, rot=-.75,as_cmap=True)
        points = ax.scatter(signatures_pca['PC1'],signatures_pca['PC2'],
                            c=viability, s=50, cmap=cmap,marker='.')
        f.colorbar(points,label='Cell viability')
        print('Cell viaiblity - PC1 Spearman Rho:',scor(viability,signatures_pca['PC1']))
        print('Cell viaiblity - PC2 Spearman Rho:',scor(viability,signatures_pca['PC2']))

    # Label the scatter axes explicitly rather than relying on pyplot's
    # notion of the "current" axes.
    ax.set_xlabel('PC1\nexplained variance ratio: %s' % round(exp_var1,3))
    ax.set_ylabel('PC2\nexplained variance ratio: %s' % round(exp_var2,3))
    