# 1. System commands can help you combine R and Python (or any other language/program)

In [None]:
# 1. System commands let you combine R and Python (or any other
#    language/program).
# ------------------------------------------------------------------
# See ?system for details. On Windows, shell(command) can be used instead;
# ?system2 is another alternative.
y <- "helloworld"
# Assemble the command line "python helloworld.py"; helloworld.py is expected
# to have been created already.
name <- paste0("python ", y, ".py")
system(name)

# 2. Scraping PubMed or Google Scholar can be done directly with R or using Python commands


## 2.1. Scraping PubMed with R

In [None]:
library(RISmed)
#https://www.ncbi.nlm.nih.gov/pubmed/?term=calcul+quebec
?RISmed

In [None]:
# Search term sent to PubMed.
g <- "Calcul quebec"

In [None]:
# Summarise the PubMed hits for the search term, restricted by publication
# date ("pdat") to 2000-2015 and capped at 500 records.
res <- EUtilsSummary(
  g,
  db = "pubmed",
  datetype = "pdat",
  mindate = 2000,
  maxdate = 2015,
  retmax = 500
)

In [None]:
# Access the data by calling EUtilsGet() on the search summary `res`.
# The download is slow, so it is done once and the resulting records are
# reused by every accessor below instead of being fetched three times.
records <- EUtilsGet(res)

In [None]:
# t: character vector containing all the publication titles.
t <- ArticleTitle(records)

In [None]:
# y: the years returned by YearPubmed() for each record.
y <- YearPubmed(records)

In [None]:
# r: the years returned by YearReceived() for each record.
r <- YearReceived(records)

In [None]:
library(ggplot2)
# Count the records per year. `y` holds the YearPubmed() values; table()
# needs an object to tabulate and fails when called empty.
count <- as.data.frame(table(y))

In [None]:
names(count) <- c("Year", "Counts")

In [None]:
# Bar chart of the yearly counts. The title reports the search term and the
# total number of records; the x-axis label carries the query timestamp on a
# second line (hence the "\n").
ggplot(data = count, aes(x = Year, y = Counts)) +
  geom_bar(stat = "identity") +
  labs(title = paste0("PubMed articles containing '", g, "', n = ",
                      sum(count$Counts))) +
  ylab("Number of papers") +
  xlab(paste0("Year \n Query date: ", Sys.time())) +
  labs(colour = "") +
  theme_bw()

In [None]:
y #nb 21 was published before Calcul Quebec even existed

## 2.2. Scraping Google Scholar with Python and R

In [None]:
# with python
#############
## load libraries
library(reshape2)
library(plyr)

## step 1: data extraction

In [None]:
## Read the PI list used by the scraping step.
pilist <- read.csv(
  file = "../files/scholar_python/pi_list2.csv",
  header = TRUE
)
test <- list()
# Display the table that was just read.
pilist

In [None]:
## Run the scholar.py script as a system command for each selected PI, each
## year and each keyword mode, and save every non-empty result as a CSV file
## under ../files/scholar_python/scholar_output/.
##
## Arguments:
##   pi_list     data frame whose first column holds the PI names.
##   start.year  first year of the loop (inclusive).
##   end.year    last year of the loop (inclusive). The year labels the output
##               rows, file names and messages.
##   n.authors   number of rows available in `pi_list`; when supplied, entries
##               of `pi.rows` outside 1..n.authors are skipped.
##   pi.rows     rows of `pi_list` to query. Defaults to 121:126, the batch
##               that used to be hard-coded.
##   after.year  value handed to scholar.py's --after option. Defaults to
##               2013, the value that used to be hard-coded.
##
## Returns NULL invisibly; the function is called for its side effects
## (console messages and CSV files).
scholar.it <- function(pi_list, start.year, end.year, n.authors,
                       pi.rows = 121:126, after.year = 2013) {
  script <- "../files/scholar_python/scholar.py"
  out.dir <- "../files/scholar_python/scholar_output/"

  rows <- pi.rows
  if (!missing(n.authors)) {
    # Never index past the end of the PI list.
    rows <- rows[rows >= 1 & rows <= n.authors]
  }

  # One entry per keyword mode: the scholar.py flag ("--all"/"--none"), the
  # text of the "nothing found" message, the pause in seconds after each
  # query, and the suffix of the output file.
  # Change "all" or "none" according to your needs.
  modes <- list(
    list(grab = "all",
         nothing = " published nothing related to CQ in ",
         pause = 20,
         suffix = "_CQ.csv"),
    list(grab = "none",
         nothing = " didn't publish in ",
         pause = 60,
         suffix = "_noCQ.csv")
  )

  for (y in start.year:end.year) {
    for (x in rows) {
      print(x)
      piname <- as.character(pi_list[x, 1])
      for (cfg in modes) {
        # system2() takes the program and its arguments separately and can
        # capture stdout; system() accepts neither a vector of arguments nor
        # a `stdout` argument. Multi-word values are shell-quoted so that
        # they reach scholar.py as single arguments.
        scholar <- system2(
          "python",
          args = c(
            script,
            "-a", shQuote(piname),
            "-t",
            paste0("--", cfg$grab), shQuote("calcul quebec"),
            "--after", after.year,
            "--no-patents", "--no-citations", "--csv-header"
          ),
          stdout = TRUE
        )

        if (length(scholar) == 0) {
          print(paste0("The PI named ", piname, cfg$nothing, y, "."))
        } else {
          schol.df <- data.frame(scholar)
          schol.df$author <- piname
          schol.df$year <- as.character(y)
          schol.df$keywd <- cfg$grab
          # The first output line is the header requested with --csv-header;
          # write the names of the three added columns into that same row.
          schol.df[1, 2] <- "authors"
          schol.df[1, 3] <- "year"
          schol.df[1, 4] <- "keywd"
          write.csv(schol.df, paste0(out.dir, y, "_", piname, cfg$suffix))
        }
        # Pause between queries.
        Sys.sleep(cfg$pause)
      }
    }
  }
  invisible(NULL)
}

In [None]:
# Run the scraper for the single year 2014.
scholar.it(
  pi_list = pilist,
  start.year = 2014,
  end.year = 2014,
  n.authors = nrow(pilist)
)

## step 2: data analysis

In [None]:
## Merge all CSV files of a directory into one data frame.
##
## mypath: directory to scan. Only files whose name ends in ".csv" (any case)
##   are read, so stray files in the directory are ignored.
##
## Each file is read with its first line skipped and the following line used
## as the header. Returns the row-bound data frame, or NULL when the
## directory holds no CSV file.
multmerge <- function(mypath) {
  filenames <- list.files(
    path = mypath,
    pattern = "\\.csv$",
    full.names = TRUE,
    ignore.case = TRUE
  )
  datalist <- lapply(filenames, function(x) {
    read.csv(file = x, header = TRUE, skip = 1)
  })
  do.call(rbind, datalist)
}

In [None]:
schol.merge <- multmerge("../files/scholar_python/scholar_output")

In [None]:
## Split the "scholar" column into multiple relevant columns using the "|"
## separator. The column name is the header line of the scraped output as it
## appears after being read back by read.csv(); it is kept in one variable
## because the field names are derived from it again below.
scholar.header <- "title.url.year.num_citations.num_versions.cluster_id.url_pdf.url_citations.url_versions.url_citation.excerpt"
fields <- strsplit(as.character(schol.merge[[scholar.header]]), "\\|")

In [None]:
library("plyr")


In [None]:
# Turn the list of split records into a data frame: one row per record, one
# column per field.
df <- ldply(fields)


In [None]:
# Name the columns after the individual fields of the header.
colnames(df) <- unlist(strsplit(scholar.header, "\\."))

In [None]:
# Columns 3 to 5 of the merged table are the ones added during scraping.
schol.res <- cbind(schol.merge[, 3:5], df)

In [None]:
# Convert the citation counts to numbers. Going through character first makes
# a factor column convert by its labels rather than by its integer codes.
schol.res$num_citations <- as.numeric(as.character(schol.res$num_citations))

In [None]:
## compare citations between papers that do/don't mention CQ for the same authors
## (mean number of citations per author and keyword mode)
schol.citsum <- aggregate(
  num_citations ~ authors + keywd,
  data = schol.res,
  FUN = mean
)

In [None]:
# keywd is a grouping label. As a factor, plot() draws one box per keyword
# mode; a plain character vector cannot be used as the x coordinate.
plot(x = factor(schol.citsum$keywd), y = schol.citsum$num_citations)

# 3. Once you have retrieved the data you need, or if you already have files saved locally, you can do some text mining.

In [None]:
# 3.1. General text mining
# Packages needed for this section ("tm" used to be listed twice).
Needed <- c("tm", "RColorBrewer", "ggplot2", "wordcloud", "biclust",
            "cluster", "igraph", "fpc", "pdftools")

In [None]:
# Attach every package listed in `Needed`.
lapply(Needed, library, character.only = TRUE)

In [None]:
# NOTE(review): setwd() changes the working directory for the rest of the
# session; the relative file names below depend on it.
setwd("../data/scraping_text_mining/pdf/")

In [None]:
# Only pick up the PDF files of the directory.
files <- list.files(pattern = "\\.pdf$", ignore.case = TRUE)

In [None]:
# Extract the text of every PDF.
filetext <- lapply(files, pdf_text)

In [None]:
# Remove curly double quotes and em dashes from the extracted text.
filetext <- lapply(filetext, function(x) gsub("(\u201c|\u201d|\u2014)", "", x))

In [None]:
# Build a corpus with one document per PDF.
corp <- Corpus(VectorSource(filetext))

In [None]:
filetext.tdm2 <- TermDocumentMatrix(corp)

In [None]:
# Look at the first ten terms of the term-document matrix.
inspect(filetext.tdm2[1:10, ])

In [None]:
# Tag the first document with the name of the file it was read from.
meta(corp[[1]], tag = "id") <- files[1]

In [None]:
# Do the same for every document. seq_len() yields an empty sequence for an
# empty corpus, whereas seq(0) would iterate over 1 and 0.
for (i in seq_len(length(corp))) {
  meta(corp[[i]], tag = "id") <- files[i]
}

In [None]:
# Convert every document to a plain text document.
# NOTE(review): this conversion may discard the ids set on `corp` - confirm.
docs <- tm_map(corp, PlainTextDocument)

In [None]:
# Cleaning: punctuation, digits, case and English stop words.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
# tolower() is a plain character function, not a tm transformation; wrapping
# it in content_transformer() keeps the documents as text documents.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))

In [None]:
# Stem the words and collapse repeated whitespace.
docs <- tm_map(docs, stemDocument)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, PlainTextDocument)

In [None]:
# NOTE(review): same conversion as the previous statement; it looks
# redundant - confirm before removing.
docs <- tm_map(docs, PlainTextDocument)


In [None]:
# Document-term matrix: documents in rows, terms in columns.
dtm <- DocumentTermMatrix(docs)

In [None]:
# The transposed view: terms in rows, documents in columns.
tdm <- TermDocumentMatrix(docs)

In [None]:
# Total count of each term over all documents.
freq <- colSums(as.matrix(dtm))

In [None]:
# Ordering of the terms by increasing total count.
ord <- order(freq)

In [None]:
# Drop sparse terms, using a sparsity threshold of 0.1.
dtms <- removeSparseTerms(dtm, 0.1)

In [None]:
# Term totals of the reduced matrix.
freq <- colSums(as.matrix(dtms))


In [None]:
# Term totals of the full matrix, most frequent first. This is the `freq`
# used from here on.
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)

In [None]:
# List the frequent terms. An empty `lowfreq=` falls back to the default and
# lists every term; 50 is the cutoff the bar chart in this file applies
# (freq > 50). Adjust as needed.
findFreqTerms(dtm, lowfreq = 50)

In [None]:
# Word/frequency table used for plotting.
wf <- data.frame(word = names(freq), freq = freq)
head(wf)

In [None]:
# Bar chart of the words whose frequency exceeds 50, with the axis labels
# tilted by 45 degrees.
p <- ggplot(subset(wf, freq > 50), aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p

In [None]:
# Terms correlated with "question" and "analysi" at a correlation of at
# least 0.98.
findAssocs(dtm, terms = c("question", "analysi"), corlimit = 0.98)

In [None]:
# Same for "contrast" in the reduced matrix, at a correlation of at least
# 0.90.
findAssocs(dtms, terms = "contrast", corlimit = 0.90)

In [None]:
library(wordcloud)

In [None]:
# Fix the seed so the layout of the cloud is reproducible.
set.seed(142)
# An empty `min.freq=` silently falls back to the package default; the cutoff
# is made explicit here and set to the same value as the bar chart in this
# file (50). Adjust as needed.
wordcloud(names(freq), freq, min.freq = 50)

In [None]:
# Keep only the least sparse terms, using a sparsity threshold of 0.15.
dtmss <- removeSparseTerms(dtm, 0.15)


In [None]:
library(cluster)

In [None]:
# Euclidean distances between terms (the matrix is transposed so that terms
# are the rows being compared).
d <- dist(t(dtmss), method = "euclidean")

In [None]:
# "ward" is the former name of "ward.D"; hclust() still accepts it but emits
# a message about the rename, so the current name is used directly.
fit <- hclust(d = d, method = "ward.D")

In [None]:
# Dendrogram with all labels hanging from the same level.
plot(fit, hang = -1)

In [None]:
plot.new()
plot(fit, hang = -1)

In [None]:
# Outline the five clusters obtained by cutting the tree.
rect.hclust(fit, k = 5, border = "red")