In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before each cell runs, so edits to the
# local helper library are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
#Common imports
from Bio import SeqIO
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd

#My helper library for this analysis
import sequencing_analysis_library as SAL

In [11]:
# Reference genome (GenBank). Only the first record in the file is used.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
genome_file = '/Users/adamhockenberry/Projects/Neisseria/Data/Genomes/fa1090.gb'
genbank_records = list(SeqIO.parse(genome_file, 'genbank'))
genome = genbank_records[0]
organism = 'Neisseria'

# Alternative inputs kept for reference:
#   genome_file = '/Users/adamhockenberry/Projects/Neisseria/Code/temp_new_fa1090.gb'
#
#   genome_file = '/Users/adamhockenberry/Projects/Neisseria/Data/Genomes/na1000.gb'
#   genome = list(SeqIO.parse(genome_file, 'genbank'))[0]
#   organism = 'Caulobacter'

# Feature lookup built from the genome record by the helper library.
genome_features_dict = SAL.get_genome_features(genome)

In [None]:
# Samples to load; each has a forward-strand (_f) and reverse-strand (_r)
# wiggle file under ../Data/<organism>/.
sample_files = ['SQ-1', 'SQ-2', 'SQ-3', 'SQ-4', 'SQ-5', 'SQ-6']
organism = 'Neisseria'
file_ending = '_best_20_100_all'

# One [sample name, forward wiggle path, reverse wiggle path] entry per sample.
data_to_read = [
    [
        sample_file,
        '../Data/{}/{}{}_f.wig'.format(organism, sample_file, file_ending),
        '../Data/{}/{}{}_r.wig'.format(organism, sample_file, file_ending),
    ]
    for sample_file in sample_files
]

wiggle_tracks = SAL.read_multiple_wiggles(data_to_read)
sequencing_dict_meta_f, sequencing_dict_meta_r = wiggle_tracks

# Per-sample expression values computed by the helper library from the genome
# features and both strands of coverage.
rpkm_dict_meta = SAL.get_rpkm_dict(
    genome_features_dict,
    sequencing_dict_meta_f,
    sequencing_dict_meta_r,
)

In [None]:
# Build a per-gene table comparing control and treatment replicates:
# the six RPKM values, per-group mean and standard deviation, t-test p-values
# (on raw and on log2-transformed RPKM) and the log2 fold change of the means.
control_labels = ['SQ-1', 'SQ-3', 'SQ-5']
treatment_labels = ['SQ-2', 'SQ-4', 'SQ-6']

data_table = []
gene_names = []
# Genes are taken from the first control sample; every other sample is
# indexed with the same gene keys.
for gene in rpkm_dict_meta[control_labels[0]].keys():
    control_exp = [rpkm_dict_meta[label][gene] for label in control_labels]
    treatment_exp = [rpkm_dict_meta[label][gene] for label in treatment_labels]

    # Two-sample t-test on the raw RPKM values; only the p-value is reported.
    _t_stat, p_val = stats.ttest_ind(control_exp, treatment_exp, equal_var=True)

    # Same test on log2(RPKM). An RPKM of 0 becomes -inf here, which makes the
    # resulting p-value NaN for that gene.
    _t_stat_log, p_val_log = stats.ttest_ind(np.log2(control_exp), np.log2(treatment_exp))

    # log2(mean treatment / mean control); +/-inf or NaN when a group mean is 0.
    log_fold_diff = np.log2(np.mean(treatment_exp)) - np.log2(np.mean(control_exp))

    # NOTE(review): np.std defaults to ddof=0 (population SD), whereas the
    # t-test uses the sample variance -- confirm this is the intended
    # "St.Dev." for three replicates.
    summary_stats = [
        np.mean(control_exp), np.std(control_exp),
        np.mean(treatment_exp), np.std(treatment_exp),
        p_val, p_val_log, log_fold_diff,
    ]
    data_table.append(control_exp + treatment_exp + summary_stats)
    gene_names.append(gene)

# Column headers are derived from the label lists so they cannot drift out of
# sync with the order in which the values are appended above.
control_tag = ', '.join(control_labels)
treatment_tag = ', '.join(treatment_labels)
col_names = ['RPKM({})'.format(label) for label in control_labels + treatment_labels]
col_names += [
    'Avg ({})'.format(control_tag), 'St.Dev. ({})'.format(control_tag),
    'Avg ({})'.format(treatment_tag), 'St.Dev. ({})'.format(treatment_tag),
    'T-test(p-value)', 'T-test(p-value) on log transformed RPKM', 'log2(fold change)',
]

df = pd.DataFrame(data_table, index=gene_names, columns=col_names)
sort_df = df.sort_index()
sort_df

# Temporarily culling this DataFrame

In [None]:
# Annotation table; its 'Old ID' column is read by row label in the culling step.
# NOTE(review): read_excel is called without index_col -- confirm the resulting
# index holds the gene identifiers used as the index of sort_df.
annotation_workbook = 'new_to_old_annotation.xlsx'
df_annotations = pd.read_excel(annotation_workbook, sheet_name='Sheet1')

In [None]:
# Keep only the genes that have no 'Old ID' in the annotation table.
# This is done in one vectorized pass instead of dropping rows one at a time,
# and a missing ID is detected with notna() rather than a `type(x) != float`
# check on the cell value.

# Fail loudly (as the per-row .loc lookup did) if a gene is absent from the
# annotation table, instead of silently keeping it.
missing_genes = sort_df.index.difference(df_annotations.index)
if len(missing_genes) > 0:
    raise KeyError(
        'Genes missing from the annotation table: {}'.format(list(missing_genes))
    )

# Row labels of the annotation table whose 'Old ID' is present (not NaN).
genes_with_old_id = df_annotations.index[df_annotations['Old ID'].notna().values]

# Drop every gene that already has an old identifier.
sort_df = sort_df.loc[~sort_df.index.isin(genes_with_old_id)]

In [None]:
# Number of genes (rows) left in the table.
sort_df.shape[0]

# Saving and re-reading

In [None]:
# Write the current (possibly culled) table to an Excel workbook.
results_path = '../Results/Neisseria_gonorrhea_RNA_seq_best_20_100_March13_ONLY_NEW.xlsx'
sort_df.to_excel(results_path, sheet_name='Sheet1')

In [5]:
# Load a previously saved results table; this replaces `df` for the cells below.
# NOTE(review): this reads the Feb23 workbook, not the March13_ONLY_NEW file
# written by the save step -- confirm that is the intended input.
saved_results_path = '../Results/Neisseria_gonorrhea_RNA_seq_best_20_100_Feb23.xlsx'
df = pd.read_excel(saved_results_path, sheet_name='Sheet1')

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the re-read table.
# NOTE: 'log2(fold change)' contains -inf/inf (see min/max in the output), so
# its mean and std come out undefined; the p-value columns have fewer
# non-missing entries than the RPKM columns.
df.describe()

Unnamed: 0,RPKM(SQ-1),RPKM(SQ-3),RPKM(SQ-5),RPKM(SQ-2),RPKM(SQ-4),RPKM(SQ-6),"Avg (SQ-1, SQ-3, SQ-5)","St.Dev. (SQ-1, SQ-3, SQ-5)","Avg (SQ-2, SQ-4, SQ-6)","St.Dev. (SQ-2, SQ-4, SQ-6)",T-test(p-value),T-test(p-value) on log transformed RPKM,log2(fold change)
count,1975.0,1975.0,1975.0,1975.0,1975.0,1975.0,1975.0,1975.0,1975.0,1975.0,1962.0,1899.0,1962.0
mean,574.975916,570.841031,568.504572,578.923371,573.166633,548.132912,571.440507,46.894452,566.740972,65.667679,0.347844,0.344189,
std,3118.361187,2883.65454,2800.823261,2476.204386,2488.615008,2338.913682,2927.686935,233.537615,2421.954887,248.533403,0.305791,0.307077,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-05,1.2e-05,-inf
25%,28.848664,26.735433,26.638341,32.740689,31.515681,28.598442,28.242838,2.252586,32.121249,3.613763,0.07194,0.064213,-0.148845
50%,126.644099,118.989469,125.617068,135.491098,129.907911,126.458827,123.765936,8.88699,131.997431,12.891814,0.259191,0.252493,0.071719
75%,406.921329,401.089275,399.910741,435.354855,439.496706,415.418049,405.617588,29.263576,433.836819,39.578867,0.582982,0.583459,0.31471
max,102950.900051,87499.282873,84267.401225,78487.494336,80769.830305,70656.740714,91572.52805,8153.190655,76638.021785,5811.704811,0.999822,0.999406,inf


In [10]:
# 5th percentile of the averaged RPKM of the SQ-1/SQ-3/SQ-5 group.
control_avg_rpkm = df['Avg (SQ-1, SQ-3, SQ-5)']
np.percentile(control_avg_rpkm, 5)

0.98674751877671429

In [None]:
# Spearman rank correlation between the RPKM columns of sample pairs:
# within the control group, within the treatment group, and between groups.
# Uses `control_labels` / `treatment_labels` from the table-building cell and
# the `df` currently in scope.

def _print_pairwise_spearman(table, label_pairs):
    """Print the Spearman correlation of the RPKM columns for each label pair.

    table       -- DataFrame with one 'RPKM(<label>)' column per sample label.
    label_pairs -- iterable of (label_a, label_b) tuples to compare.

    A pair whose two labels are identical is skipped.
    """
    for label_a, label_b in label_pairs:
        if label_a == label_b:
            continue
        correlation = stats.spearmanr(table['RPKM({})'.format(label_a)],
                                      table['RPKM({})'.format(label_b)])
        print(label_a, label_b, correlation)


def _pairs_within(labels):
    """Return each (earlier, later) pair of labels, in list order."""
    return [(first, second)
            for position, first in enumerate(labels)
            for second in labels[position + 1:]]


print("Comparison within controls:")
_print_pairwise_spearman(df, _pairs_within(control_labels))
print("Comparison within treatments:")
_print_pairwise_spearman(df, _pairs_within(treatment_labels))
print("Comparison between control and treatment:")
_print_pairwise_spearman(
    df,
    [(control, treatment) for control in control_labels for treatment in treatment_labels],
)

In [None]:
# Per-pair log2 ratios: a = SQ-2 vs SQ-1, b = SQ-4 vs SQ-3, c = SQ-6 vs SQ-5.
# An RPKM of 0 gives -inf/+inf, and NaN when both samples of a pair are 0.
# The divide-by-zero warnings are silenced because those positions are
# excluded from the correlations below.
with np.errstate(divide='ignore', invalid='ignore'):
    a = np.log2(df['RPKM(SQ-2)']) - np.log2(df['RPKM(SQ-1)'])
    b = np.log2(df['RPKM(SQ-4)']) - np.log2(df['RPKM(SQ-3)'])
    c = np.log2(df['RPKM(SQ-6)']) - np.log2(df['RPKM(SQ-5)'])


def _spearman_finite(x, y):
    """Spearman correlation of x and y over positions where both are finite.

    spearmanr propagates NaN by default, so a single non-finite log ratio
    would otherwise turn the whole result into nan.
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    both_finite = np.isfinite(x_values) & np.isfinite(y_values)
    return stats.spearmanr(x_values[both_finite], y_values[both_finite])


print(_spearman_finite(a, b))
print(_spearman_finite(a, c))
print(_spearman_finite(b, c))

# Scatter of the first two pairs' log ratios.
plt.figure()
plt.xlabel('log2(RPKM(SQ-2) / RPKM(SQ-1))')
plt.ylabel('log2(RPKM(SQ-4) / RPKM(SQ-3))')
plt.plot(a, b, 'bo')

In [None]:
# Compare the two group averages using the helper library's combined
# correlation plot; save_file_name=False is passed through unchanged.
to_compare_a, to_compare_b = 'Avg (SQ-1, SQ-3, SQ-5)', 'Avg (SQ-2, SQ-4, SQ-6)'
SAL.plot_correlations_combined(
    to_compare_a,
    to_compare_b,
    df,
    save_file_name=False,
)