# In [None]:
library("sqldf")
library("stringr")
library("ggplot2")
library("syuzhet")
library('SentimentAnalysis')

# Load the three CSV inputs. Text columns stay character vectors and the
# literal string "NA" is read as a missing value.
newsDF      <- read.csv("data/RedditNews.csv", stringsAsFactors = FALSE, na.strings = "NA")
djiaDF      <- read.csv("data/DJIA_table.csv", stringsAsFactors = FALSE, na.strings = "NA")
boolNewsDF  <- read.csv("data/Combined_News_DJIA.csv", stringsAsFactors = FALSE, na.strings = "NA")

# Headline columns are every column except the first two
# (presumably the date and label columns -- verify against the CSV header).
boolNewsDFcolumns <- tail(colnames(boolNewsDF), -2)

# Normalise each headline column:
#   1. drop a leading b' / b" wrapper, but only where one is actually present,
#      so headlines stored without the wrapper keep their first characters
#      (a fixed-position substring would silently truncate them);
#   2. remove everything that is not a letter, digit or space -- this also
#      removes the wrapper's closing quote;
#   3. lower-case the result.
for (column in boolNewsDFcolumns) {
  boolNewsDF[[column]] <- tolower(
    gsub("[^[:alnum:] ]", "", sub("^b[\"']", "", boolNewsDF[[column]]))
  )
}


################################################################################

# Assemble the SQL statement clause by clause, then let sqldf run it against
# the djiaDF and boolNewsDF data frames. Rows are paired where the Date values
# of both tables are equal and returned in ascending djiaDF.Date order.
SELECT <- "SELECT *"
FROM   <- "FROM djiaDF,boolNewsDF"
WHERE  <- "WHERE djiaDF.Date == boolNewsDF.Date"
SALT   <- "ORDER BY djiaDF.Date ASC"

# Single space between clauses.
myStmt <- paste(c(SELECT, FROM, WHERE, SALT), collapse = " ")

masterDF <- sqldf(myStmt)

################################################################################

# Concatenate columns Top1 ... Top25 of every row into one space-separated
# string, stored in a new column named "all".
masterDF$all <- do.call(
  paste,
  c(unname(as.list(masterDF[paste0("Top", 1:25)])), sep = " ")
)

headlines <- masterDF[["all"]]

# Score the combined text with syuzhet's NRC lexicon, then split the result:
# columns 1-8 go to emotionMatrix, columns 9-10 to directionMatrix.
sentimentMatrix <- get_nrc_sentiment(headlines)
emotionMatrix   <- sentimentMatrix[, 1:8]
directionMatrix <- sentimentMatrix[, 9:10]

# Placeholder third column; filled in later with a direction label.
directionMatrix$flag <- ""

# The source data frames are not needed past this point.
rm(list = c("djiaDF", "boolNewsDF", "masterDF", "newsDF"))

#################################################

# Principal component analysis of emotionMatrix, without rescaling the
# columns to unit variance.
pcaResults <- prcomp(emotionMatrix, scale. = FALSE)

# pcaResults = prcomp(emotionMatrix, scale=TRUE)

#################################################

# Proportion of Variance Explained:
# each component's variance as a share of the total variance.
pcaVariance <- (pcaResults$sdev)^2
proportionVarExp <- prop.table(pcaVariance)
proportionVarExp

#################################################

# Scree-style plot: one point per component, joined by lines.
plot(
  proportionVarExp,
  type = "b",
  ylim = c(0, 1),
  xlab = "PCs",
  ylab = "Proportion of Variance Explained"
)

#################################################

# Label every row of directionMatrix by comparing its two count columns:
#   'Positive' when column posIndex is strictly greater than column negIndex,
#   'Negative' when column negIndex is strictly greater than column posIndex,
#   'Neutral'  otherwise (ties).
# The comparison is vectorised over all rows, so a zero-row directionMatrix is
# handled without error (a `1:len` loop would iterate over c(1, 0) and fail).
# Rows whose counts cannot be compared (NA) keep the 'Neutral' label.
len <- nrow(directionMatrix)
posIndex <- 2
negIndex <- 1
flagIndex <- 3

posCount <- directionMatrix[[posIndex]]
negCount <- directionMatrix[[negIndex]]

# Start from the tie label, then overwrite the rows where one side wins.
directionMatrix[[flagIndex]] <- rep("Neutral", len)
directionMatrix[which(posCount > negCount), flagIndex] <- "Positive"
directionMatrix[which(negCount > posCount), flagIndex] <- "Negative"

#################################################

# Print, for each label, the dimensions of the matching rows restricted to the
# first column: the first number is the row count for that label, the second
# is always 1.
dim(directionMatrix[directionMatrix$flag == "Positive", 1, drop = FALSE])
dim(directionMatrix[directionMatrix$flag == "Neutral", 1, drop = FALSE])
dim(directionMatrix[directionMatrix$flag == "Negative", 1, drop = FALSE])

#################################################

# Score the combined headlines with the SentimentAnalysis package and convert
# the scores to direction labels.
sentimentsAll <- analyzeSentiment(headlines)
directionsSAP <- convertToDirection(sentimentsAll)

# Keep one direction vector per dictionary column.
sentimentLM   <- directionsSAP[["SentimentLM"]]
sentimentQDAP <- directionsSAP[["SentimentQDAP"]]
sentimentGI   <- directionsSAP[["SentimentGI"]]
sentimentHE   <- directionsSAP[["SentimentHE"]]

# First two columns of the PCA rotation matrix, printed for inspection.
myPCAs <- pcaResults[["rotation"]][, 1:2]
myPCAs

#################################################

# Plotting frame: the PCA scores (pcaResults$x) plus one label column per
# sentiment source, appended in the order NRC, HE, LM, QDAP, GI.
Z <- as.data.frame(pcaResults[["x"]])

Z[c("NRC", "HE", "LM", "QDAP", "GI")] <- list(
  directionMatrix[["flag"]],
  sentimentHE,
  sentimentLM,
  sentimentQDAP,
  sentimentGI
)

#################################################

# Scatter plots of PC1 against PC2, one per sentiment source; each plot
# colours the points by that source's label column in Z.

# NRC labels
ggplot(Z, aes(x = PC1, y = PC2, color = NRC)) +
  geom_point()

#############

# HE labels
ggplot(Z, aes(x = PC1, y = PC2, color = HE)) +
  geom_point()

#############

# LM labels
ggplot(Z, aes(x = PC1, y = PC2, color = LM)) +
  geom_point()

#############

# QDAP labels
ggplot(Z, aes(x = PC1, y = PC2, color = QDAP)) +
  geom_point()

############

# GI labels
ggplot(Z, aes(x = PC1, y = PC2, color = GI)) +
  geom_point()
