In [None]:
## Working with DepMap data

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import cufflinks as cf
import cufflinks as cf
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
%matplotlib inline

### Read in DepMap data tables and cell line info

In [2]:
# Load the DepMap tables used in this notebook. Each CSV is read from the
# current working directory; pd.read_csv raises FileNotFoundError when a file
# is missing.
# The mutation and CRISPR tables are loaded as well because later cells call
# depmap_mutations.head() and reshape depmap_crispr_raw; leaving these reads
# commented out makes those cells fail with NameError on a fresh kernel.
depmap_mutations = pd.read_csv("CCLE_mutations.csv", low_memory = False)
depmap_info = pd.read_csv("sample_info.csv")
depmap_crispr_raw = pd.read_csv("Achilles_gene_effect.csv")
depmap_expression_raw = pd.read_csv("CCLE_expression.csv")

FileNotFoundError: [Errno 2] File b'sample_info.csv' does not exist: b'sample_info.csv'

In [None]:
# Preview the first five rows of the mutation table.
depmap_mutations.head(5)

In [None]:
# Preview the first five rows of the cell line annotation table.
depmap_info.head(5)

In [None]:
# Preview the first five rows of the CRISPR gene-effect table.
depmap_crispr_raw.head(5)

In [3]:
# Move the cell line identifier to the last column under the name "DepMap_ID".
# The identifier is read from the column labelled "Unnamed: 0".
# DataFrame.rename returns a new frame rather than modifying in place, so the
# move is done with pop() (removes and returns the column) plus assignment.
# The guard keeps the cell safe to re-run once the column has been moved.
if "Unnamed: 0" in depmap_crispr_raw.columns:
    depmap_crispr_raw["DepMap_ID"] = depmap_crispr_raw.pop("Unnamed: 0")

# Keep the IDs available as a standalone Series.
depmap_ids = depmap_crispr_raw["DepMap_ID"]


NameError: name 'depmap_crispr_raw' is not defined

In [None]:
# Column count before the reshuffle.
print(len(depmap_expression_raw.columns))

# Move the identifier column ("Unnamed: 0") to the end of the frame and name
# it "DepMap_ID" so it can serve as the merge key. The guard makes the cell
# re-runnable: an unconditional pop() raises KeyError on the second run.
if "Unnamed: 0" in depmap_expression_raw.columns:
    depmap_expression_raw["DepMap_ID"] = depmap_expression_raw.pop("Unnamed: 0")
depmap_expression_cellIDs = depmap_expression_raw["DepMap_ID"]

# Column count after the reshuffle (unchanged: one column was renamed/moved).
print(len(depmap_expression_raw.columns))

# Every column except the trailing DepMap_ID is counted as a gene column.
nGenes = len(depmap_expression_raw.columns) - 1

### Retrieve cell lines of interest
The cell line names are contained in the `depmap_info` data frame, so we need to match each cell line name with its ACH-___ ID (the `DepMap_ID` column) in the `depmap_*` data frames.

In [None]:
# Left-join the cell line annotations onto the expression table by DepMap_ID,
# so every expression row is kept even when no annotation row matches.
depmap_expression = depmap_expression_raw.merge(depmap_info, how="left", on="DepMap_ID")

In [None]:
# Preview the first five rows of the merged expression + annotation table.
depmap_expression.head(5)

In [None]:
# Keep only the row(s) whose CCLE_Name is exactly "A375_SKIN".
is_a375 = depmap_expression["CCLE_Name"] == "A375_SKIN"
a375_all_data = depmap_expression[is_a375]

In [None]:
# Expression values of the first A375 row, one entry per gene.
# The first nGenes columns are the columns carried over from the expression
# table ahead of DepMap_ID. They are sliced out *before* the row is taken:
# taking the row across the whole merged frame first mixes numeric and text
# columns (e.g. CCLE_Name) and produces an object-dtype Series, whereas
# column-first slicing keeps the dtype of the expression columns.
a375_expression = a375_all_data.iloc[:, 0:nGenes].iloc[0]
a375_expression

In [None]:
# Histogram of all A375 expression values with the mean and median marked.
sns.set(style="whitegrid", palette="dark")

# Line colours; mean_color is reused by later plotting cells.
mean_color = '#434d6e'
median_color = '#0a1338'

fig = sns.distplot(a375_expression, kde = False, rug = False, label = 'Gene count')

# One vertical marker per summary statistic, drawn mean first, then median.
for marker_value, marker_label, marker_style in (
    (a375_expression.mean(), 'Mean', {"color": mean_color}),
    (a375_expression.median(), 'Median', {"color": median_color, "dashes": (2,1,2,1)}),
):
    fig.axvline(marker_value, label = marker_label, **marker_style)

fig.set_xlabel('Transcripts per million')
fig.set_ylabel('Frequency')
fig.set_title("A375 Gene Expression Distribution")
fig.legend()
plt.show()

# Report the population mean numerically as well.
print(a375_expression.mean())
#fig.figure.savefig("L4_population_distribution.pdf")

In [None]:
# Draw one random sample of n expression values (without replacement),
# print its mean, and plot its histogram.
n = 10
sample_1 = np.random.choice(a375_expression, size = n, replace = False)
sample_1_mean = sample_1.mean()
print(sample_1_mean)

# norm_hist = True plots relative frequency instead of raw counts.
fig_1 = sns.distplot(sample_1, kde = False, rug = False, norm_hist = True, label = 'Gene count')
fig_1.axvline(sample_1_mean, color = mean_color, label = 'Mean')
#fig_1.axvline(np.median(sample_1), label = 'Median', color = median_color, dashes = (2,1,2,1))
fig_1.set_xlabel('Transcripts per million')
fig_1.set_ylabel('Relative frequency')
fig_1.set_title("A375 Gene Expression Distribution")
fig_1.legend()
plt.show()

# Save through the Axes' parent Figure, which is still referenced after show().
fig_1.figure.savefig("L4_expression_sample_n10_4.pdf")

In [None]:
## Repeat the n-point draw many times and record the mean of every sample
n = 10
n_replicates = 1000

# Row vector (shape 1 x n_replicates) with one slot per replicate.
sample_mean_vec = np.zeros((1,n_replicates))

# Each replicate draws n values without replacement and keeps only their mean;
# the draws happen one after another, in replicate order.
sample_mean_vec[0, :] = [
    np.mean(np.random.choice(a375_expression, n, replace = False))
    for _ in range(n_replicates)
]

print("done with experiment")
    



In [None]:
# Histogram of the recorded sample means, with the population mean and the
# mean of the sample means overlaid.

bar_style = {"linewidth": 3, "alpha": 0.7, "color": mean_color}
fig_2 = sns.distplot(sample_mean_vec, kde = False, rug = False, label = 'Sample mean', hist_kws = bar_style)

fig_2.axvline(a375_expression.mean(), color = '#ff3800', label = 'Population mean')
fig_2.axvline(np.mean(sample_mean_vec), color = mean_color, dashes = (2,1,2,1),
              label = 'Mean of computed sample means')

fig_2.set_xlabel('Sample mean (tpm)')
fig_2.set_ylabel('Frequency')
fig_2.set_title("Sampling distribution")
fig_2.set_ylim((0,120))

fig_2.legend()
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# Re-create the legend outside the plotting area, to the right of the axes.
lgd = plt.legend(loc=2, bbox_to_anchor=(1.05,1), borderaxespad = 0.0)
plt.show()
#fig_2.figure.savefig("L4_expression_distofsamplemeans_n1000.pdf",bbox_extra_artists=(lgd,), bbox_inches='tight')

In [None]:
## Horizontal interval plot: mean +/- 2 SEM for each of several samples
n = 100
n_replicates = 20

# One column per replicate, one row per sampled value.
sample_vec = np.zeros((n,n_replicates))

for rep in range(n_replicates):
    # Sample n points from the population without replacement and store them.
    sample_i = np.random.choice(a375_expression, n, replace = False)
    sample_vec[:, rep] = sample_i

print("done with experiment")

# Reference line at the true population mean. It is computed from the
# population itself rather than from the sample means left over from an
# earlier cell, so the 'Population Mean' label is accurate and this cell does
# not depend on that cell's state.
population_mean = a375_expression.mean()
plt.plot([population_mean, population_mean], [0, n_replicates + 1], label = 'Population Mean',
         color = "#df5500", dashes = (2,1,2,1))

for rep in range(n_replicates):
    rep_mean = np.mean(sample_vec[:, rep])

    # Standard error of the mean; +/- 2 SEM is used as the interval half-width.
    sem_sample = sp.stats.sem(sample_vec[:,rep])
    ci_left = rep_mean - 2*sem_sample
    ci_right = rep_mean + 2*sem_sample
    plt.plot([ci_left, ci_right], [rep, rep], marker = '|', color = '#8989aa')

    # Label only the first marker so the legend shows a single 'Sample Mean' entry.
    label_kwargs = {"label": 'Sample Mean'} if rep == 0 else {}
    plt.plot([rep_mean], [rep], marker = 'o', color = '#777799', **label_kwargs)

plt.xlabel('Gene expression (tpm)')
plt.ylabel('Sample number')

# Single legend, placed outside the axes; passed to savefig so the tight
# bounding box includes it.
lgd = plt.legend(loc=2, bbox_to_anchor=(1.05,1), borderaxespad = 0.0)
plt.savefig("L4_uncertainty_20samples.pdf",bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
## Compare uncertainty summaries for one sample: +/- 1 SE, +/- 2 SE, 95% CI, +/- 1 stdev

# Work on the first replicate stored in sample_vec.
first_sample = sample_vec[:, 0]

# Raw data points are drawn at x = 0.
fig_4 = sns.stripplot(first_sample, jitter = 0.05, size = 4, orient = 'v', label = 'data')

sem_sample = sp.stats.sem(first_sample, ddof = 1)
print(sem_sample)
sample_mean = np.mean(first_sample)
print(sample_mean)

# Interval of mean +/- 1.96 SEM, plotted below as the 95% confidence interval.
ci_left = sample_mean - 1.96*sem_sample
ci_right = sample_mean + 1.96*sem_sample

# Sample standard deviation (ddof = 1); print the same value that is plotted.
stdev_sample = np.std(first_sample, ddof = 1)
print(stdev_sample)
print(ci_left, ci_right)

# Each summary is a vertical bar at its own x position; the positions
# (0.5, 1, 1.5, 2) line up with the tick labels set below.
plt.plot([1.5, 1.5], [ci_left, ci_right], label = '95% confidence interval', lw = 2, marker = '_', markersize = 12)
plt.plot([0.5, 0.5], [sample_mean - sem_sample, sample_mean + sem_sample], label = 'pm 1 se', lw = 2, marker = '_', markersize = 12)
plt.plot([1, 1], [sample_mean - 2*sem_sample, sample_mean + 2*sem_sample], label = 'pm 2 se', lw = 2, marker = '_', markersize = 12)
plt.plot([2, 2], [sample_mean - stdev_sample, sample_mean + stdev_sample], label = 'pm 1 stdev', lw = 2, marker = '_', markersize = 12)

lgd = plt.legend(loc=2, bbox_to_anchor=(1.05,1), borderaxespad = 0.0)
plt.ylabel('Gene expression (tpm)')

# Ticks at 0, 0.5, 1, 1.5, 2. Every mathtext fragment needs a closing '$',
# otherwise the label is rendered as literal text.
plt.xticks(np.arange(5)/2, ('Data', r'$\pm$ 1 SE', r'$\pm$ 2 SE', '95% CI', r'$\pm$ 1 stdev'))
plt.savefig('L4_expression_uncertainty.pdf',bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
# Strip plot of every A375 expression value, written to a PDF.
fig_5 = sns.stripplot(a375_expression, orient = 'v', size = 1, jitter = 0.05)
fig_5.set_xlabel('A375')
fig_5.set_ylabel('Transcripts per million')
fig_5.set_title('A375 gene expression')
fig_5.figure.savefig('L4_A375_expression_all.pdf')