In [1]:
library (tidyverse)
library (openxlsx)
library (statmod)
library (reshape2)
library (readr)

-- [1mAttaching packages[22m ------------------------------------------------------------------------ tidyverse 1.3.1 --

[32mv[39m [34mggplot2[39m 3.3.3     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.1.0     [32mv[39m [34mdplyr  [39m 1.0.5
[32mv[39m [34mtidyr  [39m 1.1.3     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.1

-- [1mConflicts[22m --------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'reshape2'


The following object is masked from 'package:tidyr':

    smiths




In [2]:
# Load the raw UniProt export: tab-separated, first row holds the column names.
d <- read.delim("./data/Uniprot_raw_final.tsv", header = TRUE, sep = "\t")

In [3]:
# Remove every "{...}" span (together with any whitespace directly in front of
# it) from the four gene-ontology columns of `d`.
GO_bp <- d$Gene.ontology..biological.process %>%
  as.character() %>%
  gsub(pattern = "\\s*\\{[^\\}]+\\}", replacement = "", x = .)
GO_cc <- d$Gene.ontology..cellular.component %>%
  as.character() %>%
  gsub(pattern = "\\s*\\{[^\\}]+\\}", replacement = "", x = .)
GO_mf <- d$Gene.ontology..molecular.function %>%
  as.character() %>%
  gsub(pattern = "\\s*\\{[^\\}]+\\}", replacement = "", x = .)
GO <- d$Gene.ontology..GO %>%
  as.character() %>%
  gsub(pattern = "\\s*\\{[^\\}]+\\}", replacement = "", x = .)

In [4]:
# Drop bracketed "[...]" spans from the cleaned GO vectors and store the
# results as columns on `d`.
d[["GO_bp"]] <- gsub(
  pattern = "\\[.*?\\]", replacement = "", x = as.character(GO_bp)
)
d[["GO_cc"]] <- gsub(
  pattern = "\\[.*?\\]", replacement = "", x = as.character(GO_cc)
)
d[["GO_mf"]] <- gsub(
  pattern = "\\[.*?\\]", replacement = "", x = as.character(GO_mf)
)
d[["GO"]] <- gsub(
  pattern = "\\[.*?\\]", replacement = "", x = as.character(GO)
)

In [5]:
# Protein names with parenthesised "(...)" spans (and the whitespace in front
# of them) removed first, then bracketed "[...]" spans removed.
protname <- d$Protein.names %>%
  as.character() %>%
  gsub(pattern = "\\s*\\([^\\)]+\\)", replacement = "", x = .) %>%
  gsub(pattern = "\\[.*?\\]", replacement = "", x = .)

In [6]:
# Preview the first cleaned protein names.
head(protname, n = 6L)

In [7]:
# Normalise the ";"-separated GO columns of `d`: remove the single space that
# directly follows or precedes a ";" and trim leading/trailing whitespace.
# GO_cc is handled here too; it is cleaned together with the other GO columns
# in the earlier cells but was left out of this step.
go_cols <- c("GO", "GO_mf", "GO_bp", "GO_cc")
d[go_cols] <- lapply(d[go_cols], function(go_values) {
  go_values <- as.character(go_values)
  # Both patterns are literal text, so match them as fixed strings.
  go_values <- gsub("; ", ";", go_values, fixed = TRUE)
  go_values <- gsub(" ;", ";", go_values, fixed = TRUE)
  str_trim(go_values, side = "both")
})

#head (d$GO_mf)
#head (d$GO_bp)

In [8]:
# Take the first column of `d` as a plain vector and preview it.
genes <- d[[1]]
head(genes, n = 6L)

In [9]:
# Preview the raw Keywords strings.
head(d[["Keywords"]], n = 6L)

### Prepare list of all terms for manual inspection

In [10]:
# Join all distinct Keywords strings of `d` into a single ";"-separated string.
d_v1 <- summarise(
  d,
  text = str_c(unique(Keywords), collapse = ";")
)

In [11]:
# Split the collapsed keyword string back into single terms:
# x3 holds every piece, x4 only the distinct ones.
x <- unlist(d_v1)
x2 <- unique(x)
x3 <- x2 %>%
  as.character() %>%
  strsplit(split = ";") %>%
  unlist() %>%
  data.frame(new_row = .)
x4 <- distinct(x3)

In [12]:
# Compare the number of terms before and after de-duplication, then preview.
dim(x3)
dim(x4)
head(x4, n = 6L)


Unnamed: 0_level_0,new_row
Unnamed: 0_level_1,<chr>
1,3D-structure
2,Actin-binding
3,Alternative splicing
4,Amyloidosis
5,Cell projection
6,Coiled coil


### Matching of the prepared terms (Juho style)

In [13]:
# Keep only the primary gene name and its Keywords string.
dc <- select(d, Gene.names...primary.., Keywords)

In [14]:
# One row per (gene, keyword): split Keywords on ";" into a list column and
# expand that list column into rows.
dc_v1 <- dc %>%
  mutate(term = strsplit(as.character(Keywords), split = ";")) %>%
  unnest(cols = term)

head(dc_v1, n = 6L)

Gene.names...primary..,Keywords,term
<chr>,<chr>,<chr>
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,3D-structure
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Actin-binding
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Alternative splicing
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Amyloidosis
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Cell projection
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Coiled coil


### This is my manually prepared list of terms

In [15]:
# Load the manually curated keyword table and expose its `new_row` column
# under the name `term`, which is the key used for matching below.
terms_KW <- read.delim("./data/Uniprot-KWs_solution.txt")
terms_KW[["term"]] <- terms_KW[["new_row"]]

In [16]:
# Preview the curated keyword table.
head(terms_KW, n = 6L)

Unnamed: 0_level_0,X,new_row,search.term,biochemistry,function.,term
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
1,100,Antibiotic,Antibiotic,antimicrobial protein,,Antibiotic
2,101,Antimicrobial,Antimicrobial,antimicrobial protein,,Antimicrobial
3,188,Fungicide,,antimicrobial protein,,Fungicide
4,237,Antiviral defense,,antimicrobial protein,,Antiviral defense
5,177,Antioxidant,,antioxidant,,Antioxidant
6,59,Cell adhesion,Cell adhesion,cell adhesion protein,,Cell adhesion


In [17]:
# Look up each keyword in the curated table and attach its `biochemistry`
# value as `category` and its `function.` value as `funct`.
# Keywords without a match get NA.
dc_v1 <- dc_v1 %>%
  mutate(
    category = terms_KW$biochemistry[match(term, terms_KW$term)],
    funct = terms_KW$function.[match(term, terms_KW$term)]
  )

head(dc_v1, n = 3)

Gene.names...primary..,Keywords,term,category,funct
<chr>,<chr>,<chr>,<chr>,<chr>
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,3D-structure,,
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Actin-binding,,
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Alternative splicing,,


In [18]:
# Size of the long (gene, keyword) table.
dim(dc_v1)

In [19]:
# Per gene: join the distinct non-NA categories into one ", "-separated string.
dc_test <- dc_v1 %>%
  group_by(Gene.names...primary..) %>%
  summarise(
    summary = paste(unique(na.omit(category)), collapse = ", ")
  )

dim(dc_test)

In [20]:
# Strip leading and trailing whitespace from the per-gene summary.
dc_test <- mutate(dc_test, summary = str_trim(summary, side = "both"))

In [21]:
# Preview the per-gene summary before separator cleanup.
head(dc_test, n = 6L)

Gene.names...primary..,summary
<chr>,<chr>
AATK,", enzyme"
ABCB4,", transporter, enzyme"
ABCC3,", enzyme, transporter"
ACP3,", enzyme"
ACSL1,", enzyme"
ACSM3,", enzyme"


In [22]:
# Tidy the separators that an empty category entry leaves in `summary`.
# The empty entry can sit at the start ("^, "), at the end (a trailing ","
# once the string has been trimmed) or between two real entries ("a, , b").
# For the middle case the pattern also consumes the space after the second
# comma; matching only ", ," and inserting ", " left a double space ("a,  b").
dc_test$summary <- dc_test$summary %>%
  str_replace_all("^, ", "") %>%
  str_replace_all(",$", "") %>%
  str_replace_all(", , ?", ", ")
head(dc_test)

Gene.names...primary..,summary
<chr>,<chr>
AATK,enzyme
ABCB4,"transporter, enzyme"
ABCC3,"enzyme, transporter"
ACP3,enzyme
ACSL1,enzyme
ACSM3,enzyme


In [23]:
# Read the tab-separated expression table; the first column becomes the row
# names. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
x <- read.table(
  "./data/f_time2020_inlog_FC15_FC2_evidence-threshold.tsv",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

In [24]:
# Move the row names into a regular "Genesymbol" column and preview the result.
x <- rownames_to_column(x, var = "Genesymbol")

head(x, n = 6L)

Unnamed: 0_level_0,Genesymbol,EtOH_04h_R1,EtOH_04h_R2,EtOH_04h_R3,EtOH_08h_R1,EtOH_08h_R2,EtOH_08h_R3,EtOH_24h_R1,EtOH_24h_R2,EtOH_24h_R3,⋯,ensembl_gene_id,entrezgene_id,external_gene_name,description,chromosome_name,start_position,end_position,percentage_gene_gc_content,gene_biotype,strand
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<chr>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<dbl>,<chr>,<int>
1,A1BG,0.6699661,1.5716342,1.343659,1.96288982,1.224317,1.85505743,2.9635154,2.8044871,2.2197667,⋯,ENSG00000121410,1,A1BG,alpha-1-B glycoprotein,19,58345178,58353492,55.8,protein coding,-1
2,A2M,-0.149328,-0.5732038,0.5753295,0.02594746,-1.177576,-0.04192958,0.3870933,0.8929844,-0.3338175,⋯,ENSG00000175899,2,A2M,alpha-2-macroglobulin,12,9067664,9116229,37.18,protein coding,-1
3,A4GALT,1.0360548,0.179104,-0.5061997,1.81327202,1.441122,1.49762235,0.799574,1.2555664,1.1367142,⋯,ENSG00000128274,53947,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",22,42692121,42721298,52.39,protein coding,-1
4,AAAS,4.6847524,4.7243974,5.0412317,5.25087279,4.960263,5.15534332,5.2016303,5.3925019,5.2358878,⋯,ENSG00000094914,8086,AAAS,aladin WD repeat nucleoporin,12,53307456,53324864,49.39,protein coding,-1
5,AACS,4.6052624,4.5081055,4.4018076,4.37823558,4.189747,4.20453034,4.6912469,4.9885575,4.3303553,⋯,ENSG00000081760,65985,AACS,acetoacetyl-CoA synthetase,12,125065434,125143333,51.2,protein coding,1
6,AAGAB,5.3934778,5.464263,5.2739382,5.42861575,5.434778,5.4801169,5.3105383,5.3367685,5.1670164,⋯,ENSG00000103591,79719,AAGAB,alpha and gamma adaptin binding protein,15,67200667,67255195,39.47,protein coding,-1


In [25]:
# Keep only the gene symbol, its description and the `howmany` column.
x2 <- select(x, Genesymbol, description, howmany)

In [26]:
# Preview the reduced gene table.
head(x2, n = 6L)

Unnamed: 0_level_0,Genesymbol,description,howmany
Unnamed: 0_level_1,<chr>,<chr>,<int>
1,A1BG,alpha-1-B glycoprotein,0
2,A2M,alpha-2-macroglobulin,0
3,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",0
4,AAAS,aladin WD repeat nucleoporin,0
5,AACS,acetoacetyl-CoA synthetase,0
6,AAGAB,alpha and gamma adaptin binding protein,0


In [27]:
# Rename the gene column to "Genesymbol" so it matches the merge key of `x2`.
dc_test <- rename(dc_test, Genesymbol = Gene.names...primary..)

In [28]:
# Preview the renamed per-gene summary.
head(dc_test, n = 6L)

Genesymbol,summary
<chr>,<chr>
AATK,enzyme
ABCB4,"transporter, enzyme"
ABCC3,"enzyme, transporter"
ACP3,enzyme
ACSL1,enzyme
ACSM3,enzyme


In [29]:
# Left join: keep every gene of `x2` and attach its keyword summary where one
# exists (unmatched genes get NA). `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mrg <- merge(x2, dc_test, by = "Genesymbol", all.x = TRUE)

In [30]:
#write.xlsx (dc_test, "GOmatching_mf_dev_v3.xlsx")
#write.xlsx (mrg, "GOmatching_mf_final_dev5.xlsx")
#write.xlsx (mrg, "KWmatching_mf_dev3.xlsx")
#write.xlsx (mrg, "KWmatching_mf_dev4.xlsx")

### Now load the GO molecular-function (GO_mf) results from the 2nd script and finalize

In [31]:
# Read the GO molecular-function result table (tab-separated).
# No `header` argument is given, so read.table() decides from the file layout
# whether the first line is a header.
x <- read.table(file = "./data/GO_mf_final.tsv", sep = "\t")

In [32]:
# Preview the GO molecular-function table.
head(x, n = 6L)

Unnamed: 0_level_0,Genesymbol,description,howmany,summary,sum2
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>
1,A1BG,alpha-1-B glycoprotein,0,,
2,A2M,alpha-2-macroglobulin,0,,
3,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",0,,
4,AAAS,aladin WD repeat nucleoporin,0,,
5,AACS,acetoacetyl-CoA synthetase,0,,
6,AAGAB,alpha and gamma adaptin binding protein,0,,


In [33]:
# Rename `summary` to `GO_mf` so it does not clash with the keyword-based
# `summary` column of `mrg`, then preview.
x <- rename(x, GO_mf = summary)

head(x, n = 6L)

Unnamed: 0_level_0,Genesymbol,description,howmany,GO_mf,sum2
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>
1,A1BG,alpha-1-B glycoprotein,0,,
2,A2M,alpha-2-macroglobulin,0,,
3,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",0,,
4,AAAS,aladin WD repeat nucleoporin,0,,
5,AACS,acetoacetyl-CoA synthetase,0,,
6,AAGAB,alpha and gamma adaptin binding protein,0,,


In [34]:
# Preview the keyword-based merge result.
head(mrg, n = 6L)

Unnamed: 0_level_0,Genesymbol,description,howmany,summary
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>
1,A1BG,alpha-1-B glycoprotein,0,
2,A2M,alpha-2-macroglobulin,0,
3,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",0,
4,AAAS,aladin WD repeat nucleoporin,0,
5,AACS,acetoacetyl-CoA synthetase,0,
6,AAGAB,alpha and gamma adaptin binding protein,0,


In [35]:
# Left join the GO molecular-function table onto the keyword-based table.
# Columns present in both inputs other than the key (description, howmany)
# come back with ".x"/".y" suffixes. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mrg2 <- merge(mrg, x, by = "Genesymbol", all.x = TRUE)

In [36]:
# final_fct: use the keyword-based `summary`, falling back to `GO_mf` when the
# summary is missing or empty.
# The is.na() test is needed because `summary == ""` evaluates to NA for NA
# summaries and case_when() treats an NA condition like FALSE, so those rows
# would otherwise take the default branch and stay NA.
mrg_sisa <- mrg2 %>%
  mutate(
    final_fct = case_when(
      is.na(summary) | summary == "" ~ GO_mf,
      TRUE ~ summary
    )
  )

In [37]:
# Preview the table with the combined `final_fct` column.
head(mrg_sisa, n = 6L)

Unnamed: 0_level_0,Genesymbol,description.x,howmany.x,summary,description.y,howmany.y,GO_mf,sum2,final_fct
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>
1,A1BG,alpha-1-B glycoprotein,0,,alpha-1-B glycoprotein,0,,,
2,A2M,alpha-2-macroglobulin,0,,alpha-2-macroglobulin,0,,,
3,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",0,,"alpha 1,4-galactosyltransferase (P blood group)",0,,,
4,AAAS,aladin WD repeat nucleoporin,0,,aladin WD repeat nucleoporin,0,,,
5,AACS,acetoacetyl-CoA synthetase,0,,acetoacetyl-CoA synthetase,0,,,
6,AAGAB,alpha and gamma adaptin binding protein,0,,alpha and gamma adaptin binding protein,0,,,


In [38]:
# (Re)compute `funct`: the `function.` value of the curated table entry whose
# term matches the keyword; NA when there is no match.
dc_v1 <- mutate(
  dc_v1,
  funct = terms_KW$function.[match(term, terms_KW$term)]
)

head(dc_v1, n = 3)

Gene.names...primary..,Keywords,term,category,funct
<chr>,<chr>,<chr>,<chr>,<chr>
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,3D-structure,,
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Actin-binding,,
MEFV,3D-structure;Actin-binding;Alternative splicing;Amyloidosis;Cell projection;Coiled coil;Cytoplasm;Cytoplasmic vesicle;Cytoskeleton;Disease variant;Immunity;Inflammatory response;Innate immunity;Metal-binding;Microtubule;Nucleus;Phosphoprotein;Reference proteome;Zinc;Zinc-finger,Alternative splicing,,


In [39]:
# Per gene: join the distinct non-NA `funct` values into one ", "-separated
# string.
dc_test <- dc_v1 %>%
  group_by(Gene.names...primary..) %>%
  summarise(
    summary_fct = paste(unique(na.omit(funct)), collapse = ", ")
  )
#summarise (GO_names = paste (category, collapse = ", "))
#head (dc_test)
dim(dc_test)

# Tidy the separators that an empty `funct` entry leaves behind: trim, then
# remove a leading ", ", a trailing "," and an empty entry between two real
# ones. For the middle case the pattern also consumes the space after the
# second comma; matching only ", ," and inserting ", " left a double space
# ("a,  b").
dc_test$summary_fct <- dc_test$summary_fct %>%
  str_trim(side = "both") %>%
  str_replace_all("^, ", "") %>%
  str_replace_all(",$", "") %>%
  str_replace_all(", , ?", ", ")
head(dc_test)

Gene.names...primary..,summary_fct
<chr>,<chr>
AATK,
ABCB4,
ABCC3,
ACP3,
ACSL1,(metabolism)
ACSM3,(metabolism)


In [40]:
# Same table with the gene column renamed to the "Genesymbol" merge key.
fct_only <- rename(dc_test, Genesymbol = Gene.names...primary..)

In [41]:
# Attach the per-gene function summary to the merged annotation table.
# merge() defaults to all = FALSE, so only gene symbols present in both
# tables are kept.
# NOTE(review): the other merges in this notebook keep all rows of the left
# table (all.x) — confirm that dropping unmatched genes here is intended.
mrg_sisa_fct <- merge(x = mrg_sisa, y = fct_only, by = "Genesymbol")

In [42]:
# Preview the table that now also carries `summary_fct`.
head(mrg_sisa_fct, n = 6L)

# TODO: paste the annotation columns together, keeping only non-repeating terms.

Unnamed: 0_level_0,Genesymbol,description.x,howmany.x,summary,description.y,howmany.y,GO_mf,sum2,final_fct,summary_fct
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>
1,AATK,apoptosis associated tyrosine kinase,1,enzyme,apoptosis associated tyrosine kinase,1,enzyme,enzyme,enzyme,
2,ABCB4,ATP binding cassette subfamily B member 4,1,"transporter, enzyme",ATP binding cassette subfamily B member 4,1,transporter,transporter,"transporter, enzyme",
3,ABCC3,ATP binding cassette subfamily C member 3,2,"enzyme, transporter",ATP binding cassette subfamily C member 3,2,"transporter, enzyme","transporter, enzyme","enzyme, transporter",
4,ACP3,acid phosphatase 3,1,enzyme,acid phosphatase 3,1,enzyme,"enzyme,",enzyme,
5,ACSL1,acyl-CoA synthetase long chain family member 1,1,enzyme,acyl-CoA synthetase long chain family member 1,1,enzyme,"enzyme,",enzyme,(metabolism)
6,ACSM3,acyl-CoA synthetase medium chain family member 3,1,enzyme,acyl-CoA synthetase medium chain family member 3,1,enzyme,enzyme,enzyme,(metabolism)


In [43]:
# Re-read the tab-separated expression table (`x` was overwritten above); the
# first column becomes the row names. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
x <- read.table(
  "./data/f_time2020_inlog_FC15_FC2_evidence-threshold.tsv",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

In [44]:
# Move the row names into a "Genesymbol" column, then keep only the gene
# symbol, its description and the `howmany` column.
x <- rownames_to_column(x, var = "Genesymbol")

x2 <- select(x, Genesymbol, description, howmany)

In [45]:
# Left join: keep every gene of `x2` and attach the combined annotation where
# available (unmatched genes get NA). `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mrg3 <- merge(x2, mrg_sisa_fct, by = "Genesymbol", all.x = TRUE)

In [46]:
# Size of the final merged table.
dim(mrg3)

In [47]:
# Show the rows with the largest `howmany` values first.
head(arrange(mrg3, desc(howmany)), n = 6L)

Unnamed: 0_level_0,Genesymbol,description,howmany,description.x,howmany.x,summary,description.y,howmany.y,GO_mf,sum2,final_fct,summary_fct
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>
1,ACVR1B,activin A receptor type 1B,4,activin A receptor type 1B,4,"enzyme, receptor",activin A receptor type 1B,4,"receptor, enzyme","receptor, enzyme","enzyme, receptor",
2,CLMN,calmin,4,calmin,4,,calmin,4,,,,
3,CYP24A1,cytochrome P450 family 24 subfamily A member 1,4,cytochrome P450 family 24 subfamily A member 1,4,enzyme,cytochrome P450 family 24 subfamily A member 1,4,enzyme,"enzyme,",enzyme,
4,ENTPD7,ectonucleoside triphosphate diphosphohydrolase 7,4,ectonucleoside triphosphate diphosphohydrolase 7,4,enzyme,ectonucleoside triphosphate diphosphohydrolase 7,4,enzyme,"enzyme,",enzyme,
5,G0S2,G0/G1 switch 2,4,G0/G1 switch 2,4,,G0/G1 switch 2,4,,,,(apoptosis)
6,HBEGF,heparin binding EGF like growth factor,4,heparin binding EGF like growth factor,4,"growth factor, receptor",heparin binding EGF like growth factor,4,growth factor,growth factor,"growth factor, receptor",


In [48]:
#write.xlsx (mrg3, "fct-final_v3.xlsx")