In [4]:
# data preprocessing

# require(xlsx)
library(readxl)
# read the file: cdhit pattern and ris data
df <- read_excel("../../cdhitResult/cdhitPattern.xlsx")
ris <- read_excel("/home/hermuba/resistanceExp/data/anno_sps_df.xlsx")
# select all E.coli data
ris = ris[ris$Species == 'Escherichia',]

# change column name to avoid empty(index in python), and space
colnames(ris)[1] = 'Cluster' #Cluster Number
colnames(ris)[4] = 'Genome_ID' # Genome ID
colnames(ris)[7] = 'ris' # Resistant Phenotype

colnames(df)[1] = "Genome_ID"



In [14]:
colnames(ris)

In [31]:
join_data <- function(drug){
    selected_drug_ris = ris[ris$Antibiotic == drug,] # select drug ris
    join_df <- merge(selected_drug_ris[,c('Genome_ID', 'ris')], df, by = "Genome_ID", all = FALSE) # merge with CD-hit pattern
    join_df$ris <- factor(join_df$ris, levels = c("Susceptible", "Non-resistant", "Intermediate", "Non-susceptible", "Resistant")) # tell S-I-R order
    return(join_df)
}

do_PCA <- function(drug){
    path = '/home/hermuba/resistanceExp/EcoliGenomes/figures/'
    pdf(paste(path, gsub('/','',drug), '_pca.pdf', sep=''))
    
    df = join_data(drug)
    
    clusters <- df[,3:ncol(df)]
    #print(clusters)
    # PCA analysis scaled and centered
    pca <- prcomp(clusters,
                 center = FALSE,
                 scale. = FALSE)
    
    # plot variance
    plot(pca, type = 'l', main = ("importance"))
    
    # summary
    print(summary(pca))
    
    # color
    df$color = "black"
    df$color[df$ris == "Resistant"]="red"
    df$color[df$ris == "Non-susceptible"]="orange"
    df$color[df$ris == "Intermediate"]="yellow"
    df$color[df$ris == "Non-resistant"]="green"
    df$color[df$ris == "Susceptible"]="blue"



    
    
    # plot
    plot(pca$x[,1:2], type = 'p', pch = 16, col = df$color,
        main = paste('PCA for ', drug, sep = ''))
    
    dev.off()
}

In [32]:
do_PCA('cefepime')

Importance of components:
                           PC1     PC2     PC3     PC4    PC5     PC6     PC7
Standard deviation     63.0753 18.1613 12.8517 8.61012 6.6340 5.84603 5.62951
Proportion of Variance  0.8046  0.0667  0.0334 0.01499 0.0089 0.00691 0.00641
Cumulative Proportion   0.8046  0.8713  0.9047 0.91970 0.9286 0.93551 0.94192
                           PC8    PC9    PC10    PC11    PC12    PC13    PC14
Standard deviation     5.55117 5.3104 5.19962 4.60140 4.47997 4.34570 4.18429
Proportion of Variance 0.00623 0.0057 0.00547 0.00428 0.00406 0.00382 0.00354
Cumulative Proportion  0.94815 0.9538 0.95932 0.96360 0.96766 0.97148 0.97502
                          PC15    PC16   PC17   PC18    PC19    PC20    PC21
Standard deviation     3.99976 3.70505 3.5851 3.4468 3.36403 3.28745 2.58123
Proportion of Variance 0.00324 0.00278 0.0026 0.0024 0.00229 0.00219 0.00135
Cumulative Proportion  0.97826 0.98103 0.9836 0.9860 0.98832 0.99051 0.99186
                          PC22    PC23   P

In [33]:
drug = c('cefepime','ceftazidime', 'ampicillin/sulbactam', 'cefazolin', 'ampicillin', 'trimethoprim/sulfamethoxazole','ciprofloxacin', 'gentamicin', 'meropenem')
for(i in drug){
    do_PCA(i)
}

Importance of components:
                           PC1     PC2     PC3     PC4    PC5     PC6     PC7
Standard deviation     63.0753 18.1613 12.8517 8.61012 6.6340 5.84603 5.62951
Proportion of Variance  0.8046  0.0667  0.0334 0.01499 0.0089 0.00691 0.00641
Cumulative Proportion   0.8046  0.8713  0.9047 0.91970 0.9286 0.93551 0.94192
                           PC8    PC9    PC10    PC11    PC12    PC13    PC14
Standard deviation     5.55117 5.3104 5.19962 4.60140 4.47997 4.34570 4.18429
Proportion of Variance 0.00623 0.0057 0.00547 0.00428 0.00406 0.00382 0.00354
Cumulative Proportion  0.94815 0.9538 0.95932 0.96360 0.96766 0.97148 0.97502
                          PC15    PC16   PC17   PC18    PC19    PC20    PC21
Standard deviation     3.99976 3.70505 3.5851 3.4468 3.36403 3.28745 2.58123
Proportion of Variance 0.00324 0.00278 0.0026 0.0024 0.00229 0.00219 0.00135
Cumulative Proportion  0.97826 0.98103 0.9836 0.9860 0.98832 0.99051 0.99186
                          PC22    PC23   P