# Extract relevant genes and generate files in intermediate_data_01/

## Workflow description

1. Run **I. Essentials** and **II. Custom Functions**
2. For each dataset in `lipid_selection/data/raw_data/source_data/`:

    1. Extract basic information:
        - genome_version 
        - database_source 
        - inclusion_criteria 
        - first_author
        - publication_year
      
    2. Append basic information to `lipid_selection/data/intermediate_data_01/basic_info.txt`
        - Use `append_basic_info()`
    
    3. Extract candidate and non-candidate genes.
        - Use `check_excel_data()`, `import_messy_excel()`
    
    4. Export candidate and non-candidate genes to `<first_author>_<year>.txt`.
        - Use `export_genes()` or `export_proteins()`

## I. Essentials

#### Check directories and load library packages

Working directory is `lipid_selection/data/raw_data/source_data`.

In [1]:
current_dir = getwd()
source_data_dir = "../../data/raw_data/source_data"
setwd(source_data_dir)

#Set target folder for candidate gene info from step 2.D
target_folder = "../../intermediate_data_01/"

Load essential library packages

In [2]:
library("readxl")
library("dplyr")
library("tidyverse")


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1     [32m✔[39m [34mreadr  [39m 1.3.1
[32m✔[39m [34mtibble [39m 2.1.1     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mggplot2[39m 3.1.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


In [3]:
getwd()

## II. Custom functions

* `check_excel_data`
    - **Usage**: Check if excel dataset has more than one sheet; check if file type is .xls or .xlsx
    - **input**: file_path (str)
    - **output**: list of sheet names as strings
    
    
* `import_messy_excel`
    - **Usage**: Remove non-data table rows and import a cleaner dataframe from an excel dataset
    - **input**: file_path (str), sheet_name (str)
    - **output**: dataframe
    
    
* `export_genes`
    - **Usage**: Export candidate and non-candidate genes to `<first_author>_<year>.txt` in `target_folder`
    - **input**: gene_data (dataframe), 
    - **output**: `<first_author>_<year>.txt` in `target_folder`
    
    - **Usage**: 
    - **input**: 
    - **output**: 

#### check_excel_data

In [4]:
#Check if excel dataset has more than one sheet. 
#Check if file type is .xls or .xlsx
check_excel_data <- function (file_path){
    
    #If file type is not .xls or .xlsx, return FALSE
    #Return sheet name(s) as a list of strings
    
    #check if "readxl" is loaded
    require("readxl") 
    
    #check if file type is .xls or .xlsx. excel_sheets() only works with these file types
    if (strsplit(file_path, "[.]")[[1]][2] %in% c("xls", "xlsx")){
        list_of_sheets <- excel_sheets(file_path)
        return (list_of_sheets)
    }
    else {
        stop("File type is not .xls or .xlsx")} 
    
}

In [5]:
#Test code with excel dataset with more than one sheet
check_excel_data("Bajhaiya_2016.xls")  

#Test code with excel dataset with one sheet
check_excel_data("Boyle_2012.xls") 

#Test code with incorrect file type
check_excel_data("Li_2016.xlsb")

ERROR: Error in check_excel_data("Li_2016.xlsb"): File type is not .xls or .xlsx


#### import_messy_excel

In [6]:
#Remove non-data table rows and import a cleaner dataframe from an excel dataset

import_messy_excel <- function(file_path, sheet_name){
    
    #check if "readxl" is loaded
    require("readxl") 
    require("dplyr")
    require("stringr")
    
    #Checks to see if sheet name exists
    if (sheet_name %in% excel_sheets(file_path) == FALSE){
            stop("Sheet name does not exist")
    }
    
    #Remove rows in excel sheet if more than half of the columns have NAs
    df <- read_excel(file_path, sheet = sheet_name) %>% 
        filter(rowSums(is.na(.))/ncol(.) < 0.5)
    
    #Assumes Row 1 of the subset dataframe is the column names
    #Assign Row 1 as column names and remove Row 1
    colnames(df)<- df[1,] %>% str_replace_all(" ", "_")
    df<- df[-1,]
    message("Check if column names are correct.")
    
    return(df)
    
}

#### export_genes

In [7]:
#Export candidate and non-candidate genes to `<first_author>_<year>.txt` in target_folder
#Input: gene_data as dataframe

export_genes <- function(df = gene_data, target_folder = target_folder){
    
    require("dplyr")
    
    clean_data <- gene_data %>% 
        mutate(protein_id = NA,
               protein_name = NA)
    
    write.table(gene_data, 
            paste(target_folder, first_author,"_",publication_year,".txt", sep=""), 
            quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE)
    
    return(clean_data)
    
}

#### append_basic_info

In [8]:
#Input basic_info.txt as output_filepath
#basic_info is a dataframe

append_basic_info <- function(basic_info = basic_info, output_filepath){
    
    colnames(basic_info)<- NULL
    #Append basic info to output_filepath
    write.table(basic_info, output_filepath, 
                append = TRUE, sep = "\t", quote = FALSE, 
                row.names = FALSE)
    
}

### Create basic_info.txt in intermediate_data_01

In [10]:
basic_info <- data.frame(matrix(ncol = 5, nrow = 0))
colnames(basic_info) <- c("first_author",
                         "publication_year",
                         "genome_version",
                         "database_source", 
                         "inclusion_criteria")
write.table(basic_info, "../../intermediate_data_01/basic_info.txt", 
            sep = "\t", quote = FALSE, row.names = FALSE)

## II. Add source data to intermediate_data_01

### Bajhaiya_2016.xls

#### 1. Extract basic information
#### 2. Append basic information to `lipid_selection/data/intermediate_data_01/basic_info.txt`

In [11]:
#Basic information

file_name = "Bajhaiya_2016.xls"
genome_version = 5.3
database_source = "Phytozome 9.1"
inclusion_criteria = "fold difference greater than 2"
first_author = "Bajhaiya"
publication_year = 2016

basic_info <- data.frame(first_author, 
                         publication_year, 
                         genome_version, 
                         database_source, 
                         inclusion_criteria, 
                         stringsAsFactors = FALSE)
str(basic_info)

append_basic_info(basic_info, "../../intermediate_data_01/basic_info.txt")

'data.frame':	1 obs. of  5 variables:
 $ first_author      : chr "Bajhaiya"
 $ publication_year  : num 2016
 $ genome_version    : num 5.3
 $ database_source   : chr "Phytozome 9.1"
 $ inclusion_criteria: chr "fold difference greater than 2"


In [12]:
#temp<- read.table("../../intermediate_data_01/basic_info.txt", fill = TRUE)

In [13]:
#Check file type and number of excel sheets
sheets <- check_excel_data(file_name)  
print(length(sheets))

[1] 2


#### 3. Extract candidate and non-candidate genes based on inclusion criteria.

**Inclusion criteria**: Within each strain, if fold-difference between high P and low P is >2 count as candidate gene. 

**Sheet 1: "Day 3" **

In [14]:
df <- import_messy_excel(file_name, sheets[1]) 
#Calculate fold difference using normalized expression
temp1 <- df %>% 
    select(starts_with("Gene"), ends_with("norm")) %>%
    mutate_at(vars(ends_with("norm")),list(as.numeric)) %>%
    filter(WT_HP_D3_DESeq_norm/WT_LP_D3_DESeq_norm >= 2 |
          WT_LP_D3_DESeq_norm/WT_HP_D3_DESeq_norm >= 2 |
          psr1_HP_D3_DESeq_norm/psr1_LP_D3_DESeq_norm >= 2 |
          psr1_LP_D3_DESeq_norm/psr1_HP_D3_DESeq_norm >= 2) %>%
    select(starts_with("Gene"))
           
temp1[1:3,]

New names:
* `` -> ...2
* `` -> ...3
* `` -> ...4
* `` -> ...5
* `` -> ...6
* … and 13 more problems
Check if column names are correct.


Gene_id,Gene_name
<chr>,<chr>
Cre09.g404900,Cre09.g404900
Cre04.g216700,PHOX
Cre01.g044300,Cre01.g044300


** Sheet 2: "Day 5" **

In [15]:
df <- import_messy_excel(file_name, sheets[2]) 
#Calculate fold difference using normalized expression
temp2 <- df %>% 
    select(starts_with("Gene"), ends_with("norm")) %>%
    mutate_at(vars(ends_with("norm")),list(as.numeric)) %>%
    filter(WT_HP_D5_DESeq_norm/WT_LP_D5_DESeq_norm >= 2 |
          WT_LP_D5_DESeq_norm/WT_HP_D5_DESeq_norm >= 2 |
          psr1_HP_D5_DESeq_norm/psr1_LP_D5_DESeq_norm >= 2 |
          psr1_LP_D5_DESeq_norm/psr1_HP_D5_DESeq_norm >= 2) %>%
    select(starts_with("Gene"))
temp2[1:3,]

New names:
* `` -> ...2
* `` -> ...3
* `` -> ...4
* `` -> ...5
* `` -> ...6
* … and 13 more problems
Check if column names are correct.


Gene_id,Gene_name
<chr>,<chr>
Cre09.g404900,Cre09.g404900
Cre04.g216700,PHOX
Cre01.g044300,Cre01.g044300


##### Join Sheet 1 and 2 data by creating two dataframes:
* `candidate_genes`
* `not_candidate_genes`

In [16]:
candidate_temp <- left_join(temp1, temp2, by = c("Gene_id", "Gene_name")) 

not_candidate_genes_df <- anti_join(df, candidate_temp) %>% select(starts_with("Gene")) %>%
    mutate(candidate_gene = FALSE)

candidate_genes <- candidate_temp %>% 
    mutate(candidate_gene = TRUE)

gene_data <- rbind(candidate_genes, not_candidate_genes_df) %>%
    rename(gene_id = Gene_id,
          gene_name = Gene_name)

print(gene_data[1:3,])
print(summary(gene_data))

Joining, by = c("Gene_id", "Gene_name")


[90m# A tibble: 3 x 3[39m
  gene_id       gene_name     candidate_gene
  [3m[90m<chr>[39m[23m         [3m[90m<chr>[39m[23m         [3m[90m<lgl>[39m[23m         
[90m1[39m Cre09.g404900 Cre09.g404900 TRUE          
[90m2[39m Cre04.g216700 PHOX          TRUE          
[90m3[39m Cre01.g044300 Cre01.g044300 TRUE          
   gene_id           gene_name         candidate_gene 
 Length:17737       Length:17737       Mode :logical  
 Class :character   Class :character   FALSE:9441     
 Mode  :character   Mode  :character   TRUE :8296     


### 4. Export candidate and non-candidate genes to `Bajhaiya_2016.txt`.

In [17]:
temp<- export_genes(df = gene_data, target_folder = target_folder)
temp[1:5,]

gene_id,gene_name,candidate_gene,protein_id,protein_name
<chr>,<chr>,<lgl>,<lgl>,<lgl>
Cre09.g404900,Cre09.g404900,True,,
Cre04.g216700,PHOX,True,,
Cre01.g044300,Cre01.g044300,True,,
g16424,g16424,True,,
g2975,MPA1,True,,


## Blaby_2013_DS2.xlsx

**Omit `Blaby_2013_DS8.xlsx`**

#### 1. Extract basic information
#### 2. Append basic information to `lipid_selection/data/intermediate_data_01/basic_info.txt`

In [18]:
#Basic information

file_name = "Blaby_2013_DS2.xlsx"
genome_version = 5.0
database_source = "Augustus u10.2"
inclusion_criteria = "fold difference greater than 2"
first_author = "Blaby"
publication_year = 2013

basic_info <- data.frame(first_author, 
                         publication_year, 
                         genome_version, 
                         database_source, 
                         inclusion_criteria, 
                         stringsAsFactors = FALSE)
str(basic_info)

append_basic_info(basic_info, "../../intermediate_data_01/basic_info.txt")

'data.frame':	1 obs. of  5 variables:
 $ first_author      : chr "Blaby"
 $ publication_year  : num 2013
 $ genome_version    : num 5
 $ database_source   : chr "Augustus u10.2"
 $ inclusion_criteria: chr "fold difference greater than 2"


In [19]:
#Check file type and number of excel sheets
sheets <- check_excel_data(file_name)  
print(length(sheets))

[1] 1


#### 3. Extract candidate and non-candidate genes based on inclusion criteria.

**Inclusion criteria**: Within each strain, if fold-difference between 0 hour and *n* hours after N starvation >2 count as candidate gene. 

* Gene expression differences due to *sta-6* mutation is less important than gene expression differences due to nutrient starvation

In [20]:
require("readxl")
require("tidyverse")
require("dplyr")

#Data manipulation to separate data by strains
df <- import_messy_excel(file_name, sheets[1]) 
CC_4349 <- df[,1:11] %>% mutate(strain = "CC_4349") %>% rename ( '0' = '0b') 
#CC_4349[1:3,]
sta_6 <- df[, c(1:3, 12:19)] %>% mutate(strain = "sta_6")
#sta_6[1:3,]

New names:
* `` -> ...2
* `` -> ...3
* `` -> ...4
* `` -> ...5
* `` -> ...6
* … and 13 more problems
Check if column names are correct.


In [25]:
gene_data <- rbind(CC_4349, sta_6) %>% 

    #reshape dataframe to assign candidate_gene label based on fold difference
    gather(.,'0.5', '2', '4', '8', '12', '24', '48', 
      key = "time", value = 'expression') %>%
    rename(time_0h = '0') %>%
    mutate(time_0h = as.numeric(time_0h)) %>%
    mutate( candidate_gene = case_when(
        expression >= 2*time_0h ~ TRUE,
        time_0h>= 2*expression ~ TRUE,
        TRUE ~ FALSE)) %>%
    
    #aggregate dataframe and remove unnecessary columns for append_genes()
    group_by(Gene, Augustus_u10.2_ID) %>% 
    summarise(candidate_gene = any(candidate_gene)) %>%
    rename (gene_name = Gene, gene_id = Augustus_u10.2_ID)

gene_data[1:3,]
summary(gene_data)
temp<- export_genes(df = gene_data, target_folder = target_folder)

gene_name,gene_id,candidate_gene
<chr>,<chr>,<lgl>
,Cre01.g001100,True
,Cre01.g004750,True
,Cre01.g011450,True


  gene_name           gene_id          candidate_gene 
 Length:450         Length:450         Mode :logical  
 Class :character   Class :character   FALSE:2        
 Mode  :character   Mode  :character   TRUE :448      

### Boyle_2012.xls

#### 1. Extract basic information
#### 2. Append basic information to `lipid_selection/data/intermediate_data_01/basic_info.txt`

In [26]:
file_name = "Boyle_2012.xls"
genome_version = 4.0
database_source = "Augustus 10.2"
inclusion_criteria = "fold difference greater than 2"
first_author = "Boyle"
publication_year = 2012

basic_info <- data.frame(first_author, 
                         publication_year, 
                         genome_version, 
                         database_source, 
                         inclusion_criteria, 
                         stringsAsFactors = FALSE)
str(basic_info)

append_basic_info(basic_info, "../../intermediate_data_01/basic_info.txt")

'data.frame':	1 obs. of  5 variables:
 $ first_author      : chr "Boyle"
 $ publication_year  : num 2012
 $ genome_version    : num 4
 $ database_source   : chr "Augustus 10.2"
 $ inclusion_criteria: chr "fold difference greater than 2"


In [27]:
#Check file type and number of excel sheets
sheets <- check_excel_data(file_name)  
print(length(sheets))

[1] 1


#### 3. Extract candidate and non-candidate genes based on inclusion criteria.

**Inclusion criteria**: Within each strain, if fold-difference of RPKM between 0 hour and *n* hours after N starvation >2 count as candidate gene. 

In [28]:
require("readxl")
require("tidyverse")
require("dplyr")

#Import excel sheet and rename column names
df <- import_messy_excel(file_name, sheets[1])
colnames(df)[1:5]<- c("gene_id", "Au.5", "gene_name","protein_name", "time_0h")
df[1:3,]

New names:
* `` -> ...6
* `` -> ...7
* `` -> ...8
* `` -> ...9
Check if column names are correct.


gene_id,Au.5,gene_name,protein_name,time_0h,2_h,12_h,24_h,48_h
<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Cre12.g519100,513248,ACX1,?-Carboxyltransferase,140.5,68.2,81.29999999999998,71.6,70.2
Cre12.g484000,512497,BCX1,?-Carboxyltransferase,103.5,53.4,35.1,35.4,31.4
Cre17.g715250,517403,BCC1,Acetyl-CoA biotin carboxyl carrier,293.9,129.6,54.7,74.7,68.9


In [31]:
#Reshape dataframe such that it is easier to compare time 0h to 'n'h
gene_data<- df %>% gather(., '2_h', '12_h', "24_h", '48_h',
                   key = 'time', value = "expression") %>%
    mutate(time_0h = as.numeric(time_0h),
        expression= as.numeric(expression)) %>%
           
    #assign candidate_gene label based on fold difference
    mutate(candidate_gene = case_when(
            expression >= 2*time_0h~ TRUE,
            time_0h >= 2* expression~ TRUE,
            TRUE ~ FALSE)) %>%

    #aggregate dataframe and remove unnecessary columns for append_genes()
    group_by(gene_id, gene_name) %>% 
    summarise(candidate_gene = any(candidate_gene))

gene_data[1:3,]
summary(gene_data)
temp<-export_genes(df = gene_data, target_folder = target_folder)
temp[1:3,]
    

gene_id,gene_name,candidate_gene
<chr>,<chr>,<lgl>
Cre01.g037850,BCC2,True
Cre01.g038550,SQD2,True
Cre01.g045900,DGAT1,True


   gene_id           gene_name         candidate_gene 
 Length:25          Length:25          Mode :logical  
 Class :character   Class :character   FALSE:9        
 Mode  :character   Mode  :character   TRUE :16       

gene_id,gene_name,candidate_gene,protein_id,protein_name
<chr>,<chr>,<lgl>,<lgl>,<lgl>
Cre01.g037850,BCC2,True,,
Cre01.g038550,SQD2,True,,
Cre01.g045900,DGAT1,True,,
