In [None]:
import pandas as pd
import numpy as np
import urllib2
from bs4 import BeautifulSoup
from textblob import TextBlob
from textstat.textstat import textstat
from datetime import datetime
from dateutil.parser import parse
import json
import re
import requests

The G2GAnalysis class holds all of the scraped data. It provides methods for scraping new data and for keeping track of what has already been scraped.

In [1]:
class G2GAnalysis():
    '''
    Scrape WikiTree G2G question pages, tag by tag, and accumulate
    per-user and per-question text statistics.

    State kept on the instance:
      Users      dict keyed by user id, holding profile counters and
                 lists of per-post measurements
      Questions  dict keyed by question number, holding the question
                 path and (once scraped) its 'stats' dict
      tagDict    tags still waiting to be scraped, mapped to their counts
    '''

    # Class-attribute patterns used to locate page elements.  They are
    # matched by BeautifulSoup against the elements' class attribute.
    _QUESTION_CLASSES = (
        re.compile(r'qa-q-view\shentry\squestion'),
        re.compile(r'qa-q-view\sqa-q-closed\shentry\squestion'),
    )
    _COMMENT_CLASS = re.compile(r'qa-c-list-item\shentry\scomment')
    _ANSWER_CLASS = re.compile(r'qa-a-list-item\shentry\sanswer')
    _BEST_ANSWER_CLASS = re.compile(
        r'qa-a-list-item\shentry\sanswer\sanswer-selected\sqa-a-list-item-selected')
    _TITLE_SUFFIX = ' - WikiTree G2G'

    def __init__(self):
        self.home='https://www.wikitree.com/g2g/tags'
        self.path=self.home[:(self.home.find('tags'))]

        self.contribURL='https://www.wikitree.com/index.php?title=Special:Contributions&who='

        self.tagPageIndex=0  #which page of tags am I on?
        self.TagList=[]
        self.tagDict={} # tags not yet scraped -> their counts
        self.Users={} #dict holding the users
        self.Questions={} # dict holding the questions
        self.flipped=0 # number of questions flipped through
        self.page_tick=0
        # Defaults so that continueScrape/newTag/importG2G can be used on a
        # fresh object without hitting an AttributeError.
        self.nQuestions=99999
        self.endList=0
        self.topQuestion='none'


    def _saveBackup(self):
        '''
        Dump the Users and Questions dicts to userData.txt and
        questionData.txt in the working directory.
        '''
        with open('userData.txt', 'w') as outfile:
            #back up our data in case of emergencies
            json.dump(self.Users, outfile)
        with open('questionData.txt','w') as outfile:
            json.dump(self.Questions, outfile)


    def runScrape(self, nQuestions=99999):
        '''
        Run through a bunch of pages of g2g questions
        Parse them for the users, the users comments
        and the user stats.
        Fill a dictionary of results.

        nQuestions: stop once this many new questions have been scraped.
        '''
        self.nQuestions=nQuestions
        self.topQuestion='none' # keep track of what question is on top of each page
                                # if it doesn't change after you try to change a page
                                # end.
        self.endList=0 #if 0, stay in the same list
                       #if 1, go to the next list

        self.initializeTags()
        print(self.page_tick)
        self._saveBackup()
        self.scrapeLoop()


    def continueScrape(self, nQuestions=99999):
        '''
        If there is already a G2G analysis object, g2g,
        with len(g2g.Users)>0 and len(g2g.Questions)>0,
        pick up where you left off.
        '''
        self.page_tick=0 #start at the beginning of a tag
        self.nQuestions=nQuestions

        if len(self.tagDict)==0:
            self.makeTagDict()
            self.endList=1
            self.newTag()  # was misspelled 'newTagl', which raised AttributeError
            self.runPage()
        print(self.home)
        self.scrapeLoop()


    def scrapeLoop(self):
        '''
        Turn the page; changing to a new tag if necessary.
        Backs up the collected data after every page.
        '''
        while self.flipped < self.nQuestions:
            self.turnPage() # go to next page of questions.
            self.newTag() # if I need to change tags, do so
            self.runPage() # scrape all the questions on a page.
            print(self.page_tick)
            self._saveBackup()


    def initializeTags(self):
        '''
        create a tag dictionary from the launching page
        '''
        self.tagDict={}
        self.tagPageIndex=0
        self.makeTagDict()
        self.endList=1
        self.newTag()
        #self.runPage()


    def turnPage(self):
        '''
        load the next page of questions within a tag
        check whether I've reached the end of that tag
        '''
        nextPage=self.page_tick*50
        pageUrl=self.home+'?start='+str(nextPage)
        self.page=requests.get(pageUrl)
        self.soup = BeautifulSoup(self.page.text, 'lxml')

        # Check to see there's still questions on this page.
        titleTag=self.soup.find('title')
        if titleTag is not None and 'No questions' in titleTag.text:
            self.endList=1
        div=self.soup.find('div', attrs={'class' : "qa-part-q-list"})
        # A page without any question list is treated like an empty one:
        # there is nothing to scrape here, so move on to the next tag.
        if div is None or 'No question' in div.text:
            #not sure which of these works consistently
            self.endList=1


    def newTag(self):
        '''
        check whether I've reached the end of a tag
        and move to the next, if necessary
        or, signal I've reached the end of all the tags.
        '''
        if self.endList!=1:
            return
        self.endList=0
        self.page_tick=0

        if len(self.tagDict)==0:
            # Nothing left to pop: push flipped past the limit so that
            # scrapeLoop and runPage stop, instead of raising KeyError.
            self.flipped=self.nQuestions+1
            return

        # move to the next tag.
        tagDets=self.tagDict.popitem()
        print(tagDets)
        self.currentTag=tagDets[0]
        self.currentTagCount=tagDets[1]
        self.home=self.path+'tag/'+self.currentTag
        self.page=requests.get(self.home)
        self.soup=BeautifulSoup(self.page.text, 'lxml')
        if len(self.tagDict)==0:
            self.makeTagDict()


    def makeTagDict(self):
        '''
        Move to the next page of tags
        read them all in, and
        repopulate the dictionary with tags.
        '''
        pageStart=str(self.tagPageIndex*100)
        # self.path already ends with '/'; strip it so the URL does not
        # contain a double slash ('.../g2g//tags').
        tagPageUrl=self.path.rstrip('/')+'/tags?start='+pageStart
        print(tagPageUrl)
        tagPage = requests.get(tagPageUrl)
        tagSoup = BeautifulSoup(tagPage.text, 'lxml')
        for tag in tagSoup.findAll('td', attrs={'class':'qa-top-tags-count'}):
            myCount=tag.text.split(' ')[0]
            myCount=int(myCount.replace(',', ''))
            tag2=tag.find_next()
            anchor=tag2.find('a') if tag2 is not None else None
            if anchor is None or not anchor.get('href'):
                continue # count cell without a tag link; nothing to record
            # drop the first six characters of the href
            # (presumably a './tag/' prefix -- TODO confirm)
            link=anchor.get('href')[6:]
            self.tagDict[link]=myCount
        if self.tagPageIndex > 150: # we've almost certainly seen everything.
            self.flipped=self.nQuestions+1 # jump to the end.
        self.tagPageIndex+=1


    def runPage(self):
        '''
        go through the list of questions on this page
        and scrape each one in turn
        (questions already present in self.Questions are skipped)
        '''
        self.page_tick+=1
        if self.flipped < self.nQuestions:
            for title in self.soup.findAll('div', attrs={'class': 'qa-q-item-title' }):
                link=title.find('a').get('href')
                qID=self.checkQuestion(link) #did I already ask it?
                if qID:
                    self.flipped+=1
                    self.scrapeQuestion(link,qID)


    def addUserAndText(self, uID,uText, uWhen):
        '''
        Update the info for a user
        Initializing a new user if needed

        uID:   href of the user link (must contain '/user/')
        uText: text of the post
        uWhen: the post's 'when' string, as understood by getDaysDiff
        Also appends the same measurements to the current self.qDict.
        '''
        days = self.getDaysDiff(uWhen)
        userID=uID.split('/user/')[1]
        if userID not in self.Users:
            self.addUser(userID)
        textVal=self.get_text_sentiment(uText)
        nSyllables=textstat.syllable_count(uText)
        user=self.Users[userID]
        user['textLens'].append(len(uText)) #another measure of their text
        user['textSent'].append(textVal)
        user['nSylls'].append(nSyllables)
        user['days'].append(days)
        user['questionIds'].append(self.qDict['question'])
        self.qDict['count']+=1
        self.qDict['tone'].append(textVal)
        self.qDict['syllables'].append(nSyllables)
        self.qDict['length'].append(len(uText))
        self.qDict['user'].append(userID)


    def addUser(self, userID):
        '''
        Add the G2G keys and values to the userDets dictionary
        and intitialize the list values for the other keys

        Values that cannot be found on the user's wiki page ('year', 'mon',
        'day', 'thanks', 'contributions') are stored as None.
        '''
        userPath=self.path+'user/'+userID
        userpage = requests.get(userPath)
        userSoup=BeautifulSoup(userpage.text, 'lxml')
        userDets={}
        g2gStats=(('nPosts', 'qa-uf-user-q-posts'),
                  ('nAnswers', 'qa-uf-user-a-posts'),
                  ('nComments', 'qa-uf-user-c-posts'),
                  ('giveUp', 'qa-uf-user-upvotes'), #gave
                  ('giveDown', 'qa-uf-user-downvotes'),
                  ('getUp', 'qa-uf-user-upvoteds'), #received
                  ('getDown', 'qa-uf-user-downvoteds'))
        for key, cssClass in g2gStats:
            userDets[key]=userSoup.find('span', attrs={'class' : cssClass}).text
        userURL='https://www.wikitree.com/wiki/'+userID

        userpage= requests.get(userURL)
        userSoup=BeautifulSoup(userpage.text, 'lxml')
        # Defaults, so a profile page lacking one of these markers no longer
        # raises NameError further down.
        contributions=None
        thanks=None
        day=month=year=None
        for div in userSoup.findAll('div', attrs={'class':'SMALL'} ):
            words=[w for w in div.text.split(' ') if w not in ('', ' ')]
            for i, x in enumerate(words):
                # i > 0: the value is the word *before* the marker; index -1
                # would silently wrap around to the last word.
                if 'contributions' in x and i > 0:
                    contributions=words[i-1]
                if 'thank' in x and thanks is None and i > 0:
                    thanks=words[i-1] # only the first match is kept
                if 'confirmed' in x and i+3 < len(words):
                    day=words[i+1]
                    month=words[i+2]
                    year=words[i+3]

        userDets['year']=int(year) if year is not None else None
        userDets['mon']=month
        userDets['day']=day
        userDets['thanks']=thanks
        userDets['contributions']=contributions
        userDets['textLens']=[]
        userDets['textSent']=[]
        userDets['nSylls']=[]
        userDets['days']=[]
        userDets['questionIds']=[]

        self.Users[userID]=userDets


    def checkQuestion(self, link):
        '''
        make sure I haven't queried this question before
        if not,
        add the question to the dictionary of questions

        link: relative href whose second '/'-separated piece is the
              question number (e.g. '../12345/some-title')
        Returns the question number, or False if it is already known.
        '''
        questNumber=link.split('/')[1]

        if questNumber in self.Questions:
            return False
        # link[3:] drops the first three characters of the relative href
        self.Questions[questNumber]={'path':self.path+link[3:]}
        return questNumber


    def _findQuestion(self, qPage):
        '''
        Locate the question block (open or closed variant) on a parsed page.
        Returns (question, body); both are None if no question block exists.
        '''
        for pattern in self._QUESTION_CLASSES:
            question=qPage.find('div', attrs={'class': pattern})
            if question is not None:
                return question, question.find('div', attrs={'class':'qa-q-view-main'})
        return None, None


    def _questionText(self, body):
        '''
        Return the text of the question's entry-content div, or '' if absent.
        '''
        if body is None:
            return ''
        content=body.find('div', attrs={'class': 'entry-content' })
        return content.text if content is not None else ''


    def parseQuestion(self,qPage,qTitle):
        '''
        Go through the question, and the comments on the question
        '''
        question, body = self._findQuestion(qPage)
        if question is None:
            return # not a question page we recognise; nothing to record
        questionText=self._questionText(body)
        try:
            questionText=qTitle+' '+questionText
            qID=body.find('a', attrs={'class': 'qa-user-link' }).get('href')
            whenQ=question.find('span', attrs={'class':re.compile('qa-q-view-when-data')}).text
            self.addUserAndText(qID,questionText,whenQ)
        except Exception:
            # best effort: skip the question text if its author or date
            # cannot be read (Exception, not bare except, so that
            # KeyboardInterrupt still stops the scrape)
            pass
        comments=question.find('div', attrs={'class':'qa-q-view-c-list'})
        if comments is None:
            return

        for comment in comments.findAll('div', attrs={'class' : self._COMMENT_CLASS}):
            try:
                self.parseComments(comment)
            except Exception:
                pass # best effort, one bad comment must not lose the rest


    def parseAnswer(self,answer):
        '''
        Record one answer (text, author, date) and then each of its comments.
        '''
        answerText=answer.find('div', attrs={'class':'qa-a-item-content'}).text
        answerID=answer.find('a', attrs={'class':'qa-user-link'}).get('href')
        whenAnswer=answer.find('span', attrs={'class':'qa-a-item-when-data'}).text
        self.addUserAndText(answerID,answerText,whenAnswer)
        comments = answer.find('div', attrs={'class':'qa-a-item-c-list'})
        if comments is None:
            return
        for comment in comments.findAll('div', attrs={'class': self._COMMENT_CLASS}):
            self.parseComments(comment)


    def extractQuestionText(self,qPage,qTitle):
        '''
        Locate the question on the page, assemble title + body text
        and print the question element.
        (Nothing is saved here; the module-level extractQuestionText
        function is the one that writes training files.)
        '''
        question, body = self._findQuestion(qPage)
        if question is None:
            return
        questionText=self._questionText(body)
        try:
            questionText=qTitle+' '+questionText
            print(question)
        except Exception:
            pass


    def extractAnswerText(self,answer):
        '''
        Same processing as parseAnswer; kept as a separate name for callers.
        '''
        self.parseAnswer(answer)


    def parseComments(self, comment):
        '''
        Record one comment (text, author, date).
        Comments without a user link are skipped.
        '''
        cText=comment.find('div', attrs={'class':'entry-content'}).text
        userLink=comment.find('a',  attrs={'class': 'qa-user-link'} )
        if userLink is None or not userLink.get('href'):
            # Previously the lookup failure was swallowed and the code went
            # on to use an undefined cID.
            return
        cID=userLink.get('href')
        cWhen=comment.find('span', attrs={'class' : 'qa-c-item-when-data'}).text
        self.addUserAndText(cID,cText,cWhen)


    def scrapeQuestion(self,link, qId):
        '''
        Go through the page
        1.0: the question (only one)
        1.1: the comments on the question (0 to many)
        2.0: the answers (0 to many)
        2.1: the comments on each answer in turn (0 to many)

        The collected statistics end up in self.Questions[qId]['stats'].
        '''
        #qDict will hold the record of the question statistics
        self.qDict={'question':qId, 'count':0,'tone':[], 'syllables':[],'length':[], 'user':[]}
        link=self.path+link[3:]
        page = requests.get(link)
        qPage=BeautifulSoup(page.text, 'lxml')
        # split() keeps the whole title when the suffix is missing, whereas
        # slicing at find()'s -1 used to drop the last character.
        qTitle=qPage.find('title').text.split(self._TITLE_SUFFIX)[0]
        #1.0
        self.parseQuestion(qPage, qTitle)
        for answer in qPage.findAll('div', attrs={'class': self._ANSWER_CLASS}):
            try:
                self.parseAnswer(answer)
            except Exception:
                pass
        # NOTE(review): the selected answer may also be matched by the loop
        # above and would then be counted twice -- confirm whether intended.
        bestAnswer= qPage.find('div', attrs={'class': self._BEST_ANSWER_CLASS})
        if bestAnswer is not None:
            try:
                self.parseAnswer(bestAnswer)
            except Exception:
                pass
        self.Questions[qId]['stats']=self.qDict


    def clean_text(self, text):
        '''
        Utility function to clean text by removing links, special characters
        using simple regex statements.
        Returns the remaining words joined by single spaces.
        '''
        cleanText=' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)",
                                  " ", text).split())
        return cleanText


    def get_text_sentiment(self, text):
        '''
        Utility function returning the polarity of the cleaned text
        using textblob's sentiment method
        '''
        text=self.clean_text(text)
        analysis = TextBlob(text)
        return analysis.sentiment.polarity

    def getDaysDiff(self, uWhen):
        '''
        Calculate how long ago the question was, in whole days.

        uWhen is one of
          'N seconds' / 'N minutes' / 'N hours'  -> 0
          'N days'                               -> N (as an int)
          'Mon D'                                -> date without a year
          'Mon D, YYYY'                          -> full date
        Raises ValueError if a date cannot be parsed.
        '''
        uWhen=uWhen.strip()
        if 'second' in uWhen or 'minute' in uWhen or 'hour' in uWhen:
            return 0
        if 'day' in uWhen:
            # Take the whole number; the first character alone would be a
            # string and would turn '12 days' into '1'.
            number=re.search(r'\d+', uWhen)
            return int(number.group()) if number else 0

        now=datetime.now()
        parts=uWhen.replace(',','').split()
        if len(parts)>=3:
            then=datetime.strptime(' '.join(parts), '%b %d %Y')
            return (now-then).days

        # No year given: use the most recent year that gives a valid date
        # not in the future (instead of a hard-coded 2018).
        for year in (now.year, now.year-1):
            try:
                then=datetime.strptime(' '.join(parts+[str(year)]), '%b %d %Y')
            except ValueError:
                continue
            if then <= now:
                return (now-then).days
        raise ValueError('cannot interpret date %r' % uWhen)


    def reloadSession(self,userpath='userData.txt', questionpath='questionData.txt', nQuestions=99999):
        '''
        initialise a new session,
        then import a bunch of saved questions and users

        userpath / questionpath: JSON files as written by the backup step.
        '''
        self.nQuestions=nQuestions
        self.initializeTags()
        with open(questionpath) as json_data:
            self.Questions = json.load(json_data)

        with open(userpath) as json_data:
            self.Users = json.load(json_data)


    def getScoredText(self):
        '''
        Placeholder; not implemented.
        '''
        pass


    def importG2G(self, otherG2G):
        '''
        copy over the dicts and things from an old G2G analysis, in order to keep going
        only works on tag analysis so far
        (the dicts are shared with otherG2G, not copied)
        '''
        self.Questions=otherG2G.Questions
        self.Users=otherG2G.Users
        self.home=otherG2G.home
        self.path=otherG2G.path
        self.tagDict=otherG2G.tagDict
        self.topQuestion=otherG2G.topQuestion
        self.currentTagCount=otherG2G.currentTagCount
        self.soup=otherG2G.soup
        self.page=otherG2G.page
        self.tagPageIndex=otherG2G.tagPageIndex
        self.endList=otherG2G.endList
    
# Build an analysis object and restore previously saved questions and users
# from disk. reloadSession first re-initialises the tag list, which fetches
# the tag pages, and only then loads the two JSON files.
g2g=G2GAnalysis()
#g2g.runScrape()
g2g.reloadSession(userpath='userData_trans.txt', questionpath='questionData.txt')

https://www.wikitree.com/g2g//tags?start=0
('irish_roots', 534)


In [86]:


import random

def extractQuestionText(qPage,qTitle, viewCount, running_threshold):
    '''
    Read the question's title + body text and its net vote count,
    then hand them to saveTraining, which files the text as a
    positive, negative or (sampled) neutral training example.

    qPage:             parsed question page (BeautifulSoup)
    qTitle:            question title; if it cannot be concatenated with
                       the body text, nothing is saved and 0 is returned
    viewCount:         number of views of the question
    running_threshold: current cut-off passed on to saveTraining
    Returns whatever saveTraining returns (1 if saved as positive, else 0).
    Raises ValueError if the page has no question block or no vote count.
    '''
    # The question div comes in an open and a closed flavour; try both.
    question = None
    for pattern in (r'qa-q-view\shentry\squestion',
                    r'qa-q-view\sqa-q-closed\shentry\squestion'):
        question = qPage.find('div', attrs={'class': re.compile(pattern)})
        if question is not None:
            break
    if question is None:
        raise ValueError('no question block found on page')

    body = question.find('div', attrs={'class':'qa-q-view-main'})
    content = body.find('div', attrs={'class': 'entry-content' }) if body is not None else None
    text = content.text if content is not None else ''
    try:
        text = qTitle+' '+text
    except TypeError:
        return 0

    voteSpan = question.find('span', attrs={'class': 'qa-netvote-count-data'})
    if voteSpan is None:
        raise ValueError('no vote count found for question')
    saved=saveTraining(voteSpan.text, viewCount, running_threshold, text)
    return saved
    
    
def extractAnswerText(answer, viewCount, running_threshold):
    '''
    Read one answer's text and net vote count and hand them to saveTraining.

    answer:            the answer's element from the parsed page
    viewCount:         number of views of the question the answer belongs to
    running_threshold: current cut-off passed on to saveTraining
    Returns whatever saveTraining returns (1 if saved as positive, else 0).
    '''
    # Pass the raw vote text through: saveTraining converts it itself.
    # Converting with int() here failed on en-dash negatives and handed
    # saveTraining a value its converter could not index.
    votes = answer.find('span', attrs={'class': 'qa-netvote-count-data'}).text
    text=answer.find('div', attrs={'class':'qa-a-item-content'}).text
    saved=saveTraining(votes, viewCount, running_threshold, text)
    return saved
    
    
def votes_to_int(votes):
    '''
    Convert a displayed vote count to an int.

    The site renders negative counts with a decorative en dash instead of
    a minus sign, so a leading en dash, true minus sign or ASCII hyphen all
    mean "negative".  Surrounding whitespace, a leading '+' and thousands
    separators are tolerated.  A value that is already an int is returned
    unchanged.  Raises ValueError if no number is left (e.g. empty string).
    '''
    if isinstance(votes, int):
        return votes
    digits = votes.strip().replace(',', '')
    negative = digits[:1] in (u'\u2013', u'\u2212', u'-')
    if negative or digits[:1] == u'+':
        digits = digits[1:]
    value = int(digits)
    return -value if negative else value

    
def saveTraining(votes, viewCount, running_threshold, text):
    '''
    File one piece of text as a training example, according to its votes.

    The score is votes / log(viewCount + 10).
      score > running_threshold -> written under ./training/pos/, returns 1
      score < 0                 -> written under ./training/neg/, returns 0
      otherwise                 -> written under ./training/neut/ for about
                                   one in ten texts that have more than 10
                                   views and more than 30 characters,
                                   returns 0
    The target directories must already exist.
    '''
    votes = votes_to_int(votes)
    vote_rat = votes/(np.log(viewCount+10))

    def write_example(label):
        # Random suffix keeps examples with equal vote counts apart.
        outname = './training/'+label+'/outie_'+str(votes)+'_'+str(random.randrange(10000000))+'.txt'
        # 'with' closes the file; previously every handle was left open.
        # Binary mode, because the text is written as UTF-8 encoded bytes.
        with open(outname, 'wb') as f:
            f.write(text.encode('utf-8').strip())

    if vote_rat > running_threshold:
        write_example('pos')
        return 1
    if vote_rat < 0:
        write_example('neg')
        return 0
    if (random.random() < 0.1) and viewCount > 10 and len(text) > 30:
        write_example('neut')
    return 0

def updateRunner(current_good, target_ratio, i, running_threshold):
    '''
    Nudge the running threshold toward the target share of 'good' texts.

    If the share seen so far (current_good / i) is above target_ratio the
    threshold is scaled up, otherwise it is scaled down; the relative step
    is 1/sqrt(i+1).  Returns the new threshold.
    '''
    share_so_far = (current_good*1.)/i
    step = 1./np.sqrt(i+1.)
    factor = (1.+step) if share_so_far > target_ratio else (1.-step)
    return running_threshold*factor

In [88]:
# Walk every known question, score the question and each of its answers,
# and adapt the threshold as we go.
i=0
running_threshold = 1. # threshold goodness
current_good = 0.1
target_ratio = 0.05 # proportion saved as 'high ranks'
for q_id, q_vals in g2g.Questions.items():
    page = requests.get(q_vals['path'])
    try:
        qPage=BeautifulSoup(page.text, 'lxml')
        # split() keeps the whole title if the suffix is missing
        qTitle=qPage.find('title').text.split(' - WikiTree G2G')[0]
        viewCount=qPage.find('span', attrs={'class': re.compile('qa-view-count-data')}).text
        viewCount=int(viewCount.replace(',',''))
    except Exception:
        # Without a parsed page and a view count nothing on this page can
        # be scored. Skip it, rather than fall through to the answer loop
        # with qPage/viewCount left over from the previous question (or
        # undefined on the first one).
        continue
    try:
        current_good+=extractQuestionText(qPage, qTitle, viewCount, running_threshold)
        i+=1
        running_threshold=updateRunner(current_good, target_ratio, i, running_threshold)
    except Exception:
        pass # best effort: the answers may still be usable
    for answer in qPage.findAll('div', attrs={'class': re.compile(r'qa-a-list-item\shentry\sanswer')}):
        try:
            current_good+=extractAnswerText(answer, viewCount, running_threshold)
            i+=1
            running_threshold=updateRunner(current_good, target_ratio, i, running_threshold)
        except Exception:
            pass # best effort: skip answers that cannot be read



KeyboardInterrupt: 

In [157]:
# Sample question text, written to a training file by hand in a later cell.
questionText="I adopted Hawkins-1533 and it is same as Hawkins-332 Please look and merge , Thanks After researching I went back on the profile of Elizabeth, Hawkins-332 and wasn\'t paying attention to them being to and adopted and they was one already with the information Hawkins-332, but I didn\'t add nothing just need them merged,  sorry and Thank\'s I don't know how to take my name off manage"

In [25]:
# Debug run of the scoring steps on one known question page.
i=0
running_threshold = 1. # threshold goodness
current_good = 0.1
target_ratio = 0.05 # proportion saved as 'high ranks'
page = requests.get('https://www.wikitree.com/g2g/598684/i-am-the-22nd-grt-nephew-of-king-john')
qPage=BeautifulSoup(page.text, 'lxml')
viewCount=None
try:
    # split() keeps the whole title if the suffix is missing
    qTitle=qPage.find('title').text.split(' - WikiTree G2G')[0]
    viewCount=qPage.find('span', attrs={'class': re.compile('qa-view-count-data')}).text
    viewCount=int(viewCount.replace(',',''))
    print(qTitle)
    print(viewCount)
    current_good+=extractQuestionText(qPage, qTitle, viewCount, running_threshold)
    i+=1
    running_threshold=updateRunner(current_good, target_ratio, i, running_threshold)
except Exception:
    pass # best effort: the answers may still be usable
# Only score answers if the view count could be read; previously a failure
# above left qPage/viewCount undefined (or stale from an earlier cell).
if isinstance(viewCount, int):
    for answer in qPage.findAll('div', attrs={'class': re.compile(r'qa-a-list-item\shentry\sanswer')}):
        try:
            current_good+=extractAnswerText(answer, viewCount, running_threshold)
            i+=1
            running_threshold=updateRunner(current_good, target_ratio, i, running_threshold)
        except Exception:
            pass


I am the 22nd grt nephew of King John
101
I am the 22nd grt nephew of King John John Plantagenet Signed the Magna Carta. He is my 22nd great Uncle
–3
here
–3
–3
here


In [16]:
# Scratch check: print a count written with an en dash
# (presumably to see how the site's negative vote counts are rendered).
print '–3'

–3


In [158]:
# Write the sample question text to a training file by hand.
outname = './training/pos/outie316901.txt'
print(outname)
print(questionText.encode('utf-8').strip())
# 'with' closes the file (the handle used to be left open); binary mode
# because the text is written as encoded bytes.
with open(outname, 'wb') as f:
    f.write(questionText.encode('utf-8').strip())



./training/pos/outie316901.txt
I adopted Hawkins-1533 and it is same as Hawkins-332 Please look and merge , Thanks After researching I went back on the profile of Elizabeth, Hawkins-332 and wasn't paying attention to them being to and adopted and they was one already with the information Hawkins-332, but I didn't add nothing just need them merged,  sorry and Thank's I don't know how to take my name off manage


In [None]:
# How many questions are currently held in the session?
len(g2g.Questions)

In [130]:
# Scratch check of a chained `and`: the first operand (1>2) is False,
# so the else branch runs and 'gibberish' is printed.
if 1>2 and 3<4 and 'apple'=='apple':
    print 'true'
else:
    print 'gibberish'

gibberish


In [None]:
import pandas as pd
import numpy as np
import urllib2
from bs4 import BeautifulSoup
from textblob import TextBlob
from textstat.textstat import textstat
from datetime import datetime
from dateutil.parser import parse
import json
import re
import requests
# Smoke test: scrape one known question with a fresh analysis object.
lunk='https://www.wikitree.com/'
# checkQuestion/scrapeQuestion expect the relative form found on tag pages:
# the second '/'-separated piece is the question number and the first three
# characters are dropped before the rest is appended to the G2G base path.
# The previous '/g2g/556814/...' form produced the id 'g2g' and a broken URL.
link='../556814/public-record-office-announces-digitizing-records-seemed'

thing=G2GAnalysis()
qId=thing.checkQuestion(link)
thing.scrapeQuestion(link, qId)

In [None]:
# Fetch the raw HTML of one question page for manual inspection.
page=requests.get('https://www.wikitree.com/g2g/556814/public-record-office-announces-digitizing-records-seemed').text
