In [4]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import sqlite3
import re
import time
import random
from pprint import pprint
import requests
import json
import praw
import pandas as pd
import pickle
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import collections
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

#To import jupyter notebooks
import nbimporter

from seleniumFunctions import *
from seleniumRedditFunctions import *
from redditFunctionsPublic import *
from twitterFunctionsPublic import *
from visualizationFunctions import *

#Private notebooks with sensitive information. 
from redditFunctionsPrivate import *
from twitterFunctionsPrivate import *

Importing Jupyter notebook from redditFunctionsPrivate.ipynb
Importing Jupyter notebook from twitterFunctionsPrivate.ipynb


In [5]:
def getWordWeights(allText, searchTerms):
    """Score every non-stop-word token by how often it occurs in the text.

    Stop words (NLTK's English list plus a few scraping artefacts such as
    digits, "Sign in" and "Feedback") are removed case-insensitively, the
    remaining whitespace-separated tokens are counted case-sensitively, and
    the counts are min-max scaled.

    Args:
        allText: Either a single string or an iterable of sentence strings.
        searchTerms: Accepted for interface compatibility; not used here.

    Returns:
        dict mapping token -> weight in [0.0, 1.0], inserted in order of
        decreasing count. The rarest token(s) score 0.0 and the most
        frequent 1.0. Empty when no token survives filtering; when every
        token occurs equally often all weights are 0.0.
    """
    from collections import Counter
    from nltk.corpus import stopwords

    stop_words = set(stopwords.words('english'))
    stop_words.update(["-", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", ".", "Sign in", "Feedback"])

    # Join sentences with a space so the last word of one sentence does not
    # fuse with the first word of the next.
    if isinstance(allText, str):
        combinedText = allText
    else:
        combinedText = " ".join(allText)

    # One escaped alternation instead of one regex pass per stop word.
    # The lookarounds require whitespace (or the text boundary) on both sides
    # without consuming it, so neighbouring words are never glued together
    # and "." only matches a literal, free-standing full stop.
    orderedStopWords = sorted(stop_words, key=lambda word: (-len(word), word))
    alternatives = "|".join(re.escape(word) for word in orderedStopWords)
    stopWordRegex = re.compile(r"(?<!\S)(?:" + alternatives + r")(?!\S)", re.IGNORECASE)
    filteredText = stopWordRegex.sub("", combinedText)

    wordCounts = Counter(filteredText.split())
    if not wordCounts:
        return {}

    minValue = min(wordCounts.values())
    maxValue = max(wordCounts.values())
    spread = maxValue - minValue

    wordWeights = {}
    for word, occurrences in wordCounts.most_common():
        # A zero spread means every token is equally frequent; follow the
        # usual min-max convention and score them all 0.0 instead of
        # dividing by zero.
        wordWeights[word] = float(occurrences - minValue) / spread if spread else 0.0
    return wordWeights

def split_into_sentences(text):
    """Split free text into sentences and keep those longer than 20 characters.

    Periods that do not end a sentence (titles, company suffixes, initials,
    acronyms, web domains, "Ph.D.") are temporarily replaced with ``<prd>``,
    real sentence ends are tagged with ``<stop>``, and the text is cut on
    that tag. Whatever follows the final terminator is discarded.

    Args:
        text: The text to split; newlines are treated as spaces.

    Returns:
        list of stripped sentence strings, each more than 20 characters long.
    """
    capital = r"([A-Z])"
    titles = r"(Mr|St|Mrs|Ms|Dr)[.]"
    companySuffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    openers = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronym = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    domains = r"[.](com|net|org|io|gov)"

    marked = (" " + text + "  ").replace("\n", " ")
    marked = re.sub(titles, r"\1<prd>", marked)
    marked = re.sub(domains, r"<prd>\1", marked)
    marked = marked.replace("Ph.D.", "Ph<prd>D<prd>")

    # Order matters: each pattern assumes the earlier ones already ran.
    regexSteps = (
        (r"\s" + capital + "[.] ", r" \1<prd> "),
        (acronym + " " + openers, r"\1<stop> \2"),
        (capital + "[.]" + capital + "[.]" + capital + "[.]", r"\1<prd>\2<prd>\3<prd>"),
        (capital + "[.]" + capital + "[.]", r"\1<prd>\2<prd>"),
        (" " + companySuffixes + "[.] " + openers, r" \1<stop> \2"),
        (" " + companySuffixes + "[.]", r" \1<prd>"),
        (" " + capital + "[.]", r" \1<prd>"),
    )
    for pattern, replacement in regexSteps:
        marked = re.sub(pattern, replacement, marked)

    # Move terminators outside closing quotes, tag the sentence ends, then
    # restore the protected periods.
    literalSteps = (
        (".”", "”."),
        (".\"", "\"."),
        ("!\"", "\"!"),
        ("?\"", "\"?"),
        (".", ".<stop>"),
        ("?", "?<stop>"),
        ("!", "!<stop>"),
        ("<prd>", "."),
    )
    for needle, replacement in literalSteps:
        marked = marked.replace(needle, replacement)

    # The final piece is trailing text with no terminator; drop it.
    pieces = [piece.strip() for piece in marked.split("<stop>")[:-1]]
    return [piece for piece in pieces if len(piece) > 20]

def getSentencesSentiment(sentenceArray):
    """Run the VADER analyzer over each sentence.

    Args:
        sentenceArray: Iterable of sentence strings.

    Returns:
        dict mapping each sentence to a fresh dict copy of the scores
        returned by ``polarity_scores``. A sentence that appears more than
        once ends up with a single entry.
    """
    analyzer = SentimentIntensityAnalyzer()
    return {
        sentence: dict(analyzer.polarity_scores(sentence))
        for sentence in sentenceArray
    }

def allTextToSentiment(allText, searchTerms, sortBy="compound"):
    """Score sentences by sentiment combined with word-frequency importance.

    Each sentence gets its sentiment scores from ``getSentencesSentiment``
    and four derived values:

    * ``frequency``: the sum of the ``getWordWeights`` weights of its words,
      divided by the sentence's word count.
    * ``freqPos`` / ``freqNeg``: frequency-boosted positive / negative score.
    * ``weightedCompound``: a weighted blend of frequency, pos, neg and
      compound.

    Sentences with fewer than five words are dropped.

    Args:
        allText: Iterable of sentence strings.
        searchTerms: Forwarded unchanged to ``getWordWeights``.
        sortBy: Name of the score the result is ordered by (descending).

    Returns:
        list of ``(sentence, scores)`` tuples sorted by ``scores[sortBy]``,
        highest first. Ties keep most-positive-first order.

    Raises:
        KeyError: if ``sortBy`` is not one of the score names.
    """
    sentiment = getSentencesSentiment(allText)
    wordWeights = getWordWeights(allText, searchTerms)

    # Visit sentences most-positive first: the final sort is stable, so this
    # decides the order of sentences that tie on `sortBy`.
    byPositivity = sorted(sentiment, key=lambda s: sentiment[s]["pos"], reverse=True)

    newSentiment = {}
    for sentence in byPositivity:
        wordList = re.sub(r"[^\w]", " ", sentence).split()
        if len(wordList) < 5:
            continue

        sentimentHash = sentiment[sentence]

        # Words without a weight (e.g. stop words) add nothing to the sum but
        # still count towards the sentence length in the denominator.
        # A membership test replaces the former bare `except:` so that
        # Ctrl-C and genuine errors are no longer swallowed.
        knownWeights = [float(wordWeights[word]) for word in wordList if word in wordWeights]
        importance = sum(knownWeights, 0.0) / float(len(wordList))

        sentimentHash["frequency"] = importance
        # The multipliers below are heuristic weights carried over unchanged.
        sentimentHash["freqPos"] = importance * 1.5 + sentimentHash["pos"] - 0.5 * sentimentHash["neu"]
        sentimentHash["freqNeg"] = importance * 1.5 + sentimentHash["neg"] - 0.5 * sentimentHash["neu"]
        sentimentHash["weightedCompound"] = (
            (importance * 30)
            + (sentimentHash["pos"] * 10)
            + (sentimentHash["neg"] * 10)
            + (sentimentHash["compound"] * 7)
        )

        # NOTE(review): only freqPos is zeroed for low-frequency sentences;
        # freqNeg is left as computed. Confirm whether that is intended.
        if importance < 0.10:
            sentimentHash["freqPos"] = 0

        newSentiment[sentence] = sentimentHash

    ranked = sorted(newSentiment, key=lambda s: newSentiment[s][sortBy], reverse=True)
    return [(sentence, newSentiment[sentence]) for sentence in ranked]



def getGoogleSearchResults(searchTerm, numPages):
    """Collect result links from the first ``numPages`` Google result pages.

    A Firefox window is opened for the duration of the call and is always
    shut down afterwards, even if scraping fails part-way.

    Args:
        searchTerm: Query text; it is URL-encoded before being sent.
        numPages: Maximum number of result pages to read. Paging stops early
            when the page has no "Next" link.

    Returns:
        list of str: for each page visited, the href of the first anchor in
        every ``div.g`` result, followed by the URL of the results page
        itself. Results without an anchor or without an href are skipped.
    """
    from urllib.parse import quote_plus

    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def collectPageLinks(driver):
        """Return the result hrefs on the current page plus the page URL."""
        pageLinks = []
        for result in driver.find_elements(By.CSS_SELECTOR, 'div.g'):
            try:
                href = result.find_element(By.TAG_NAME, "a").get_attribute("href")
            except NoSuchElementException:
                # Some result containers carry no link; one of those should
                # not abort the whole scrape.
                continue
            if href:
                pageLinks.append(href)
        pageLinks.append(driver.current_url)
        return pageLinks

    driver = webdriver.Firefox()
    try:
        # quote_plus keeps characters such as '&', '#' and spaces inside the
        # query instead of letting them terminate or split it.
        driver.get("https://www.google.com/search?q=" + quote_plus(str(searchTerm)))
        links = collectPageLinks(driver)

        for pageNum in range(1, numPages):
            try:
                driver.find_element(By.LINK_TEXT, "Next").click()
            except NoSuchElementException:
                break  # last results page reached
            links.extend(collectPageLinks(driver))

        return links
    finally:
        # Every call starts its own browser, so release it here.
        driver.quit()

def getSentiment(allText, searchTerms, allSentencesAndLocations, sortBy="frequency"):
    """Rank sentences by sentiment and summarise the overall polarity.

    Args:
        allText: Sentences, forwarded to ``allTextToSentiment``.
        searchTerms: Forwarded to ``allTextToSentiment``.
        allSentencesAndLocations: dict mapping each sentence to the URL it
            was found at.
        sortBy: Score name used to order the sentences.

    Returns:
        tuple ``(overallSentiment, allSentimentHash)``.
        ``overallSentiment`` is 100 times the mean of (rounded positive minus
        rounded negative) over the ranked sentences, or 0 when there are
        none. ``allSentimentHash`` maps each sentence plus a trailing newline
        to its score dict, which gains a ``"location"`` entry.

    Raises:
        KeyError: if a ranked sentence is missing from
            ``allSentencesAndLocations``.
    """
    ranked = allTextToSentiment(allText, searchTerms, sortBy=sortBy) # list of (sentence, score dict) tuples
    print("sentiment: ", ranked)
    print("Sorted by: ", sortBy)

    scoreNames = ("pos", "neg", "neu", "compound", "frequency", "weightedCompound")
    runningTotal = 0
    allSentimentHash = {}

    for entry in ranked:
        rawSentence = entry[0]
        scores = entry[1]
        # Every score is read (and rounded to two decimals) up front, so an
        # incomplete score dict fails before anything is recorded.
        rounded = {name: round(scores[name], 2) for name in scoreNames}
        scores["location"] = allSentencesAndLocations[rawSentence.replace("\n", "")]
        runningTotal = runningTotal + rounded["pos"] - rounded["neg"]
        allSentimentHash[rawSentence + "\n"] = scores

    if len(ranked) != 0:
        runningTotal = 100 * (float(runningTotal) / float(len(ranked)))
    return (runningTotal, allSentimentHash)

def _loadPickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # SECURITY: unpickling can execute arbitrary code. Only load cache files
    # that this notebook wrote itself.
    with open(path, "rb") as cacheFile:
        return pickle.load(cacheFile)


def _dumpPickle(value, path):
    """Pickle ``value`` to ``path``, closing the file afterwards."""
    with open(path, "wb") as cacheFile:
        pickle.dump(value, cacheFile)


def getAllTextFromGoogleSearch(searchTerms, searchDepth, maxDepth):
    """Crawl Google results for the search terms and split the pages into sentences.

    Results are cached in three pickle files in the working directory, named
    from the search terms, ``maxDepth`` and ``searchDepth``. When all three
    load, the crawl is skipped.

    Args:
        searchTerms: Iterable of query strings.
        searchDepth: Number of Google result pages to read per query.
        maxDepth: Number of crawl levels; level 0 is the search results,
            each further level follows the links found on the previous one.

    Returns:
        tuple ``(allallText, fixedSentenceUrlHash, basicSentimentList)``:
        * ``allallText``: list of every sentence found, in crawl order.
        * ``fixedSentenceUrlHash``: dict mapping sentence -> URL it came from.
        * ``basicSentimentList``: list of ``({sentence: scores}, url)`` tuples,
          one per entry of ``allallText``.
        The shape is the same whether the data was crawled or read from cache.
    """
    identifierString = "".join(str(searchTerm) for searchTerm in searchTerms)
    identifierString = identifierString + str(maxDepth) + str(searchDepth)
    for unwanted in (" ", "?", "'"):
        identifierString = identifierString.replace(unwanted, "")

    textPath = "googleSearch" + identifierString + ".p"
    locationsPath = "googleSearchallSentencesAndLocations" + identifierString + ".p"
    sentimentPath = "basicSentimentList" + identifierString + ".p"

    try:
        print("Attempting to load googleSearch"+identifierString+".p variable from previous calculation.")
        allallText = _loadPickle(textPath)
        fixedSentenceUrlHash = _loadPickle(locationsPath)
        basicSentimentList = _loadPickle(sentimentPath)
    except (OSError, EOFError, pickle.UnpicklingError):
        # Missing, truncated or corrupt cache: fall through and recompute.
        print("Load un-successful. Calculating.")
    else:
        if isinstance(allallText, str):
            # Cache files written by earlier versions stored the raw page
            # text here. Rebuild the sentence list from the per-sentence
            # sentiment cache, which has one entry per sentence in the same
            # order.
            allallText = [
                sentence
                for (sentimentBySentence, _url) in basicSentimentList
                for sentence in sentimentBySentence
            ]
        print("Load successful.")
        return (allallText, fixedSentenceUrlHash, basicSentimentList)

    links = []
    for searchTerm in searchTerms:
        links.extend(getGoogleSearchResults(searchTerm, searchDepth))
    allLinksList = list(set(links))

    # url -> list of page texts, one entry per visit of that url
    allSentencesAndLocations = {}
    count = 0
    # NOTE(review): the driver is not shut down here because
    # createSeleniumWebdriver is defined elsewhere and it is not visible
    # whether it hands out a shared instance. Confirm, then quit it.
    driver = createSeleniumWebdriver("http://www.google.com")
    for depthValue in range(0, maxDepth):
        newLinks = []
        for link in allLinksList:
            print("Processing link: ", count, " out of: ", len(allLinksList))
            count += 1
            pageText = ""
            try:
                (text, urls) = getWebpageTextAndLinksSelenium(link, driver)
                newLinks.extend(urls)
                pageText = text or ""
            except Exception:
                # Best effort: an unreachable page is recorded with empty
                # text. KeyboardInterrupt is deliberately not caught.
                print("Skipping link: ", link)
            allSentencesAndLocations.setdefault(link, []).append(pageText)
        allLinksList = list(set(newLinks))

    fixedSentenceUrlHash = {}
    basicSentimentList = []
    allallText = []
    for url, pageTexts in allSentencesAndLocations.items():
        for pageText in pageTexts:
            sentences = split_into_sentences(pageText)
            # Score a whole page per call rather than one sentence per call.
            pageSentiment = getSentencesSentiment(sentences)
            for sentence in sentences:
                fixedSentenceUrlHash[sentence] = url
                allallText.append(sentence)
                basicSentimentList.append(({sentence: dict(pageSentiment[sentence])}, url))

    # Cache exactly what is returned, so a cached call and a fresh call give
    # the caller the same types.
    print("Writing calculated googleSearch"+identifierString+".p variable to pickle file.")
    _dumpPickle(allallText, textPath)
    _dumpPickle(fixedSentenceUrlHash, locationsPath)
    _dumpPickle(basicSentimentList, sentimentPath)

    return (allallText, fixedSentenceUrlHash, basicSentimentList)
              
def printBasicSentenceSentimentToCsv(basicSentenceSentiment, csvName):
    """Write per-sentence sentiment scores to ``<csvName>.csv``.

    Despite the extension the file is tab-separated, one sentence per line,
    with the columns: sentence, url, pos, neg, neu, compound. There is no
    header row. An existing file is overwritten.

    Args:
        basicSentenceSentiment: Iterable of ``(infoHash, url)`` tuples where
            ``infoHash`` maps a sentence to its score dict.
        csvName: Output path without the ``.csv`` extension.
    """
    scoreColumns = ("pos", "neg", "neu", "compound")
    # `with` closes the file even if a row fails part-way. UTF-8 is explicit
    # because scraped text routinely holds characters the platform default
    # codec cannot encode.
    with open(csvName + ".csv", "w", encoding="utf-8") as csvFile:
        for infoHash, url in basicSentenceSentiment:
            for sentence, sentiment in infoHash.items():
                fields = [sentence, str(url)]
                fields.extend(str(sentiment[column]) for column in scoreColumns)
                csvFile.write("\t".join(fields) + "\n")