In [1]:
import requests,re
import sys
sys.path.append('/usr/local/lib/python2.7/dist-packages/')
import textblob
import langid
import logging
import collections
import pymongo,time
from secrets import *

### Summary
Script to scrape content from Facebook API.
Collects
1. Pages matching a keyword query
2. Recent posts from these pages (limit set by API)
3. Comments on these posts (it seems that likes on comments are not available)
4. Likes on these posts  

### Errors

Needs robust error handling due to errors with API requests. More detail [here](https://developers.facebook.com/docs/graph-api/using-graph-api/v2.4#errors)

* ``Error 500 {"error":{"message":"An unexpected error has occurred. Please retry your request later.","type":"OAuthException","is_transient":true,"code":2}}``
* ``Error 500 {"error":{"code":1,"message":"An unknown error occurred"}}``
* ``ConnectionError: ('Connection aborted.', gaierror(-2, 'Name or service not known'))``

### TODO

* ~~Get likes on comments~~
* ~~Have a restart page ID, so we can pick up if scraping fails~~
* Write a script to check for new posts on pages that already exist in DB 
* Add a record of when data was pulled  

Mongo tutorial [here](http://api.mongodb.org/python/current/tutorial.html)

In [2]:
from pymongo import MongoClient
client = MongoClient()
db = client.fb
pagesCollection = db.pages
postsCollection = db.posts
commentsCollection = db.comments
likesCollection=db.likes

### Set up query to grab pages

In [3]:

ands=[]
#ands.append('أوروبا') # Europe
#ands.append('ألمانيا')
#ands.append('المانيا') # Germany (two alternate spellings)

ors=[]
ors.append('مهاجرون') # Refugees
ors.append('مهاجرين') # Refugees
ors.append('المهاجرين') # The refugees
ors.append('المهاجرون') # The refugees
ors.append('المهاجرون') # The refugees
ors.append('مهاجرون') # refugees
ors.append('هجرة') # Migration
ors.append('الهجرة') # The migration
ors.append('أوروبا') # Europe
ors.append('سوريا') # Syria

ors=[]
ors.append('مهرب') # Trafficker
ors.append('المتاجرين') # Traffickers

#QUERY='+'.join(ands)+'+'
#QUERY='|'.join(ors)
#QUERY='syria'
#QUERY='سوريا'
QUERY='أوروبا' # Europe
#QUERY=u'اوروبا'
#QUERY=u'اللجوء' # asylum
#QUERY=u'ملجأ' # asylum
# 'Europe' + 'ANY ['migration']...'
LIMIT=1000
# Page limit
postsLimit=250
# Hard limit from API=250

nSkip=5
# Hit API for query nSkip times
# before skipping

nWait=60
# Wait between API errors

postSleepTime=0.5
pageSleepTime=10
# Pause so API not thrashed

### Some boilerplate DB functions

In [4]:
def countCollections():
    print '%d pages' % pagesCollection.count()
    print '%d posts' % postsCollection.count()
    print '%d comments' % commentsCollection.count()
    print '%d likes' % likesCollection.count()

In [5]:
def clean(s):
    if s:
        s=re.sub(',|;|:|"|\'|\?|\(|\)|\n|\t|\-|\=|\+',' ',s.lower())
        return s.strip()
    else:
        return None

In [6]:
def addCommentsToDb(commentsData):
    nAdded=nAlready=0
    for comment in commentsData['data']:
        if not isCommentInDb(comment['id']):
            commentsCollection.insert_one(comment)
            nAdded+=1
        else:
            nAlready+=1
    logging.warning('%d comments added (%d already in DB)' % (nAdded,nAlready))

In [7]:
def addPageToDb(page):
    pagesCollection.insert_one(page)

In [8]:
def clearCollection(collection=None):
    
    all=False
    answer=True
    
    if not collection or collection.lower().strip()=='all':
        answer=raw_input('Clear all?')
        if answer.lower().strip() in ['y','yes']:
            all=True
            
    if all or collection=='likes':
        if not all:answer=raw_input('Clear likes?')
        if all or answer.lower().strip() in ['y','yes']:
            res=likesCollection.remove()
            print 'Cleared %d likes' % res['n']
            
    if all or collection=='pages':
        if not all:answer=raw_input('Clear pages?')
        if all or answer.lower().strip() in ['y','yes']:
            res=pagesCollection.remove()
            print 'Cleared %d pages' % res['n']

            
    if all or collection=='comments':
        if not all:answer=raw_input('Clear comments?')
        if all or answer.lower().strip() in ['y','yes']:
            res=commentsCollection.remove()
            print 'Cleared %d comments' % res['n']

            
    if all or collection=='posts':
        if not all:answer=raw_input('Clear posts?')
        if all or answer.lower().strip() in ['y','yes']:
            res=postsCollection.remove()
            print 'Cleared %d posts' % res['n']


In [9]:
countCollections()

1146 pages
101845 posts
208517 comments
122224 likes


In [None]:
clearCollection('all')

In [11]:
def addLikesToDb(likes):
    nAdded=nAlready=0
    for like in likes['data']:
        if not isLikeInDb(like['id']):
            likesCollection.insert_one(like)
            nAdded+=1
        else:
            nAlready+=1
    logging.warning('%d likes added (%d already in DB)' % (nAdded,nAlready))

In [12]:
def addPostToDb(post):
    print 'Adding posts'
    postsCollection.insert_one(post)

In [13]:
def isLikeInDb(id):
    '''
    Tests if a like is in comments collection
    Returns Bool
    '''
    nMatches=likesCollection.find({'id':id}).count()
    if nMatches==0:
        return False
    elif nMatches==1:
        return True
    else:
        logging.warning('Duplicate like %s' % id)
        return True

In [14]:
def isCommentInDb(id):
    '''
    Tests if a comment is in comments collection
    Returns Bool
    '''
    nMatches=commentsCollection.find({'id':id}).count()
    if nMatches==0:
        return False
    elif nMatches==1:
        return True
    else:
#        logging.warning('Duplicate comment %s' % id)
        return True

In [31]:
def addTimestampToPage(id):
    '''
    Adds current timestamp to page 
    '''
    pagesCollection.update({'id':id},{'$set':{'checked':[int(time.time())]}})

In [32]:
def updatePageTimestamp(id):
    '''
    Updates timestamp of page 
    '''
    pagesCollection.update({'id':id},{'$addToSet':{'checked':int(time.time())}})

In [50]:
def isPostInDb(id):
    '''
    Tests if a post is in post collection
    Returns Bool
    '''
    nMatches=postsCollection.find({'id':id}).count()
    if nMatches==0:
        return False
    elif nMatches==1:
        return True
    else:
#        logging.warning('Duplicate post %s' % id)
        return True

In [51]:
def isPageInDb(id):
    '''
    Tests if a page is in pages collection
    Returns Bool
    '''
    nMatches=pagesCollection.find({'id':id}).count()
    
    if nMatches==0:
        return False
    elif nMatches==1:
        return True
    else:
#        logging.warning('Duplicate page %s' % id)
        return True

### How to deal with API errors

In [36]:
def handleResult(statusCode,returnText):
    '''
    Parses API call result to determine if successful
    or to wait or abandon
    Returns success,skip (both Bool)
    '''
    if statusCode==200:
        # OK
        return True,False
    
    if statusCode in [102,10,463,467]:
        # Access token expired
        logging.warning('API error: %d %s' % (statusCode,returnText))
        return False,True
    elif statusCode in [2,4,17,341,500]:
        # Wait and retry
        logging.warning('API error - waiting: %d %s' % (statusCode,returnText))
        return False,False
    elif statusCode in [506,1609005]:
        # Skip
        logging.warning('API error - skipping: %d %s' % (statusCode,returnText))
        return False,True
    else:
        logging.warning('API error - unknown code %d %s' % (statusCode,returnText))
        return False, True

### Start by looping through pages

In [37]:
######################################################
success=None
nAttempts=0

temp='https://graph.facebook.com/search?q=%s&limit=%d&type=page&access_token=%s' % (QUERY, LIMIT, ACCESSTOKEN)

while not success:
    # Keep looping if unsuccessful
    r=requests.get(temp)
    success,skip=handleResult(r.status_code,r.text)
    # Try, find out if successful or should skip
    
    if skip or nAttempts==nSkip:
        # If tried nSkip times or if should skip
        r={'data':[],'paging':None}
        if nAttempts==nSkip:
            logging.warning('Skipping after %d attempts' % nAttempts)
            break
    time.sleep(nWait)
    nAttempts+=1
######################################################

In [None]:
getPostsFromPage(307091372652912)

In [None]:
getPages(r.json()['data'],r.json().get('paging'),restart=None)



0 Europe
http://fb.com/612622758878101 Continent
['category', 'about', u'talking_about_count', u'description', 'retrieved', u'engagement', u'description_en', u'likes', u'id', 'name']
1 أوروبا
Translates: 



 europe
http://fb.com/776373632471822 Community
Post 0 Photos from ‎أوروبا‎'s post 776373632471822_792796697496182




Got 2 likes after paging
Post 1 Timeline Photos 776373632471822_780756755366843




Got 6 likes after paging




Post 2 Timeline Photos 776373632471822_780753652033820




Got 3 likes after paging




Post 3 أوروبا 776373632471822_779454045497114




Got 2 likes after paging
Post 4 أوروبا 776373632471822_779454058830446




Got 2 likes after paging
Post 5 None 776373632471822_778704942238691




Got 2 likes after paging




Post 6 Timeline Photos 776373632471822_778703305572188




Got 3 likes after paging




Post 7 None 776373632471822_778397432269442




Got 2 likes after paging




Post 8 None 776373632471822_777555369020315




Got 3 likes after paging




Post 9 Dj nassim mosta cheb houssem zahri winta yatfakarn 776373632471822_777548442354341
Got 3 likes after paging




Post 10 Timeline Photos 776373632471822_777546825687836




Got 3 likes after paging




Post 11 Timeline Photos 776373632471822_777546419021210




Got 5 likes after paging




Post 12 Timeline Photos 776373632471822_777253355717183




Got 4 likes after paging
Post 13 Timeline Photos 776373632471822_777253062383879




Got 4 likes after paging




Post 14 أوروبا 776373632471822_776791169096735




Got 5 likes after paging
Post 15 Timeline Photos 776373632471822_776789995763519




Got 5 likes after paging




Post 16 Untitled Album 776373632471822_776377552471430




Got 8 likes after paging
Post 17 Timeline Photos 776373632471822_776375989138253




Got 6 likes after paging




Post 18 Timeline Photos 776373632471822_776375602471625




Got 5 likes after paging




Post 19 ‎أوروبا‎'s cover photo 776373632471822_776374755805043




Got 5 likes after paging
Post 20 أوروبا 776373632471822_776373812471804




Got 5 likes after paging
Adding posts




['category', 'about', u'talking_about_count', 'name', 'retrieved', u'engagement', u'likes', 'name_en', u'id', 'description']
2 اوروبا
Translates: 



 europe
http://fb.com/198373520174674 Computers/Technology
Post 0 None 198373520174674_429666223712068




Got 1 likes after paging




Post 1 None 198373520174674_429665903712100




['category', 'about', u'talking_about_count', 'name', 'retrieved', u'engagement', u'likes', 'name_en', u'id', 'description']
3 اوروبا
Translates: 



 europe
http://fb.com/594524630689703 Community
Post 0 None 594524630689703_598085223666977




Post 1 ‎اوروبا‎'s cover photo 594524630689703_594531834022316




Got 1 likes after paging
Post 2 اوروبا 594524630689703_594531727355660




Got 1 likes after paging
['category', u'about', u'talking_about_count', 'name', 'retrieved', u'engagement', u'likes', u'about_en', 'name_en', u'id', 'description']
4 اخبار المهاجرين في اوروبا
Translates: 



 news of migrants in europe
http://fb.com/307091372652912 Community
Post 0 None 307091372652912_1001167463245296




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)
Got next page of likes (current) (100 so far)
Got next page of likes (current) (125 so far)
Got next page of likes (current) (150 so far)
Got next page of likes (current) (175 so far)
Got next page of likes (current) (200 so far)




Got 208 likes after paging
Post 1 اخبار المهاجرين في اوروبا 307091372652912_822407197787991




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)
Got next page of likes (current) (100 so far)
Got next page of likes (current) (125 so far)
Got next page of likes (current) (150 so far)
Got next page of likes (current) (175 so far)
Got next page of likes (current) (200 so far)
Got next page of likes (current) (225 so far)
Got next page of likes (current) (250 so far)




Got 254 likes after paging
Post 2 None 307091372652912_822401507788560




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)
Got next page of likes (current) (100 so far)
Got next page of likes (current) (125 so far)
Got next page of likes (current) (150 so far)




Got 151 likes after paging
Post 3 experiencesvideoludiques.com 307091372652912_822400144455363




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)
Got next page of likes (current) (100 so far)




Got 117 likes after paging
Post 4 None 307091372652912_822395634455814




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)




Got 62 likes after paging
Post 5 www.mohager.com 307091372652912_819297228098988




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)




Got 58 likes after paging
Post 6 None 307091372652912_819296131432431




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)




Got 61 likes after paging
Post 7 None 307091372652912_818180994877278




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)




Got 64 likes after paging
Post 8 None 307091372652912_818180434877334




Got next page of likes (current) (25 so far)




Got 48 likes after paging
Post 9 None 307091372652912_818180248210686




Got next page of likes (current) (25 so far)




Got 28 likes after paging
Post 10 http://www.ostio.de/wp-content/uploads/2013/02/Schengen-Visa.jpg 307091372652912_818179801544064




Got next page of likes (current) (25 so far)




Got 42 likes after paging
Post 11 None 307091372652912_818168661545178




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)
Got next page of likes (current) (100 so far)




Got 102 likes after paging
Post 12 http://www.hibapress.com/upload/2742014-1c06e.jpg 307091372652912_818165441545500




Got 25 likes after paging




Post 13 http://upload-alkompis.s3-eu-west-1.amazonaws.com/iblock/6dc/6dcbfc3753cb7e16484d8f2b4e5715d4/b9f531e3d305b8ad0033aa1e439ef2be.jpg 307091372652912_818163758212335




Got next page of likes (current) (25 so far)




Got 37 likes after paging
Post 14 http://www.oxfordnador.com/wp-content/uploads/2014/02/francais.jpg 307091372652912_818162498212461




Got next page of likes (current) (25 so far)




Got 34 likes after paging
Post 15 http://www.aawsat.com/2010/02/26/images/hassad1.558761.jpg 307091372652912_818161094879268




Got next page of likes (current) (25 so far)




Got 39 likes after paging
Post 16 Freestyle royal Drifting Maroc 307091372652912_671049906257055




Got 20 likes after paging




Post 17 السلطات الفرنسية تغرم يهوديا رفض تشغيل المغاربة في محله 307091372652912_642736432421736




Got next page of likes (current) (25 so far)




Got 34 likes after paging
Post 18 None 307091372652912_642604492434930




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)
Got next page of likes (current) (100 so far)




Got 109 likes after paging
Post 19 None 307091372652912_642601905768522




Got next page of likes (current) (25 so far)




Got 42 likes after paging
Post 20 None 307091372652912_642600225768690




Got 22 likes after paging




Post 21 None 307091372652912_642588135769899




Got 22 likes after paging




Post 22 None 307091372652912_642128479149198




Got 18 likes after paging




Post 23 None 307091372652912_642127705815942




Got 17 likes after paging




Post 24 None 307091372652912_642125172482862




Got 15 likes after paging




Post 25 None 307091372652912_642116755817037




Got next page of likes (current) (25 so far)
Got next page of likes (current) (50 so far)
Got next page of likes (current) (75 so far)




Got 98 likes after paging
Post 26 None 307091372652912_497370936958287




Got 2 likes after paging




Post 27 None 307091372652912_497362880292426




Got 10 likes after paging




Post 28 None 307091372652912_495232237172157




Got 9 likes after paging




Post 29 None 307091372652912_495231273838920




Got 7 likes after paging




Post 30 None 307091372652912_495230630505651




Got 2 likes after paging




Post 31 None 307091372652912_495230050505709




Post 32 None 307091372652912_495229080505806




Got 1 likes after paging




Post 33 None 307091372652912_495228327172548




Got 1 likes after paging




Post 34 None 307091372652912_495227800505934




Got 1 likes after paging




Post 35 None 307091372652912_495227287172652




Got 4 likes after paging




Post 36 None 307091372652912_495225437172837




Got 1 likes after paging




Post 37 Al Italiya 307091372652912_515323645148580
Got 1 likes after paging




Post 38 None 307091372652912_493654217329959




Post 39 None 307091372652912_493652903996757




Got 2 likes after paging




Post 40 None 307091372652912_492949837400397




Got 16 likes after paging




Post 41 None 307091372652912_492948637400517




Post 42 Timeline Photos 307091372652912_492945297400851




Got next page of likes (current) (25 so far)




Got 28 likes after paging
Post 43 None 307091372652912_492944230734291




Got 2 likes after paging




Post 44 None 307091372652912_492943814067666




Got 6 likes after paging




Post 45 None 307091372652912_492943544067693




Got 7 likes after paging




Post 46 None 307091372652912_492943230734391




Got 8 likes after paging




Post 47 http://img.youm7.com/images/NewsPics/large/s8201218214536.jpg 307091372652912_276265985811305




Got 4 likes after paging




Post 48 None 307091372652912_491315274230520




Got 4 likes after paging




Post 49 http://a5.sphotos.ak.fbcdn.net/hphotos-ak-ash3/539285_463421240355781_1706079713_n.jpg 307091372652912_396028703797944




Got 1 likes after paging




Post 50 http://www.elakhbaronline.com/dz/media/k2/items/cache/1b9499731f0cfbb7e7f5f5b826330424_XL.jpg?t=-621 307091372652912_426165610768174




Got 2 likes after paging




Post 51 None 307091372652912_490485894313458




Got 3 likes after paging




Post 52 http://www.rnw.nl/data/files/imagecache/must_carry/images/lead/article/2012/08/anti-muslim-650.jpg 307091372652912_480552175297767




Got 2 likes after paging




Post 53 None 307091372652912_490069167688464




Got 6 likes after paging




Post 54 None 307091372652912_490067311021983




Got 8 likes after paging




Post 55 None 307091372652912_490065707688810




Got 5 likes after paging




Post 56 None 307091372652912_490057364356311




Got 5 likes after paging




Post 57 None 307091372652912_489730067722374




Got 2 likes after paging




Post 58 None 307091372652912_489729277722453




Got 1 likes after paging




Post 59 http://a6.sphotos.ak.fbcdn.net/hphotos-ak-ash4/314775_10150267705766875_206386401874_7898754_1930340 307091372652912_409966005707320




Got 8 likes after paging




Post 60 None 307091372652912_488278541200860




Got 1 likes after paging




Post 61 إيطاليا: ثورة في "بريشا" اعتراضًا على المركز الإسلامي - المسلمون في إيطاليا - موقع المسلمون في العال 307091372652912_258970777553307
Got 1 likes after paging




Post 62 None 307091372652912_487894304572617




Got 4 likes after paging




Post 63 None 307091372652912_487893004572747




Got 3 likes after paging




Post 64 None 307091372652912_487891384572909




Got 3 likes after paging




Post 65 None 307091372652912_487065141322200




Post 66 Harraga : Espagne la fin des soins gratuits pour les clandestins أسبانيا 307091372652912_271812176257115
Post 67 None 307091372652912_486957291332985




Got 1 likes after paging




Post 68 None 307091372652912_486956944666353




Got 1 likes after paging




Post 69 None 307091372652912_486956641333050




Got 1 likes after paging




Post 70 None 307091372652912_486956124666435




Got 1 likes after paging




Post 71 None 307091372652912_486580878037293




Got 1 likes after paging




Post 72 None 307091372652912_486580391370675




Post 73 None 307091372652912_486580118037369




Got 1 likes after paging




Post 74 None 307091372652912_486578584704189




Got 3 likes after paging




Post 75 None 307091372652912_486577358037645




Post 76 None 307091372652912_486577028037678




Got 1 likes after paging




Post 77 None 307091372652912_486576474704400




Got 2 likes after paging




Post 78 None 307091372652912_486576001371114




Post 79 None 307091372652912_486178768077504




Got 4 likes after paging




Post 80 None 307091372652912_486178484744199




Got 1 likes after paging




Post 81 None 307091372652912_486177824744265




Got 1 likes after paging




Post 82 None 307091372652912_486177621410952




Got 3 likes after paging




Post 83 None 307091372652912_486177328077648




Got 2 likes after paging




Post 84 None 307091372652912_486176888077692




Got 3 likes after paging




Post 85 None 307091372652912_486171648078216




Got 5 likes after paging




Post 86 None 307091372652912_480023085359739




Post 87 None 307091372652912_480022975359750




Got 1 likes after paging




Post 88 None 307091372652912_480022708693110




Post 89 None 307091372652912_480022368693144




Got 1 likes after paging




Post 90 http://i.imgur.com/OVwzk.png 307091372652912_426431040728966




Got 1 likes after paging




Post 91 None 307091372652912_478802475481800




Post 92 None 307091372652912_478251742203540




Got 5 likes after paging




Post 93 None 307091372652912_478247382203976




Got 3 likes after paging




Post 94 None 307091372652912_478242268871154




Post 95 None 307091372652912_478242125537835




Post 96 None 307091372652912_478242008871180




Post 97 None 307091372652912_478241628871218




Got 1 likes after paging




Post 98 None 307091372652912_477848605577187




Got 2 likes after paging




Post 99 None 307091372652912_477847558910625




Got 1 likes after paging




Post 100 None 307091372652912_477847222243992




Got 1 likes after paging




Post 101 None 307091372652912_477846775577370




Got 1 likes after paging




Post 102 None 307091372652912_477846548910726




Post 103 None 307091372652912_477845808910800




Got 3 likes after paging




Post 104 Regolarizzazione. Le domande si presentano a settembre 307091372652912_285191288254363




Got 3 likes after paging




Post 105 None 307091372652912_477462428949138




Got 1 likes after paging




Post 106 None 307091372652912_477462098949171




Post 107 None 307091372652912_477461468949234




Got 1 likes after paging




Post 108 None 307091372652912_477461015615946




Post 109 None 307091372652912_477460525615995




Post 110 None 307091372652912_477460105616037




Post 111 None 307091372652912_477225102306204



In [38]:
countCollections()

1145 pages
101760 posts
208335 comments
77796 likes


In [39]:
def getPages(data,paging,restart=None):
    
    if paging:
        if paging.get('next'):
            logging.warning('Paging needed')
    
    for n,d in enumerate(data):
        
        if (restart and d['id']==str(restart)) or (not restart):
            # If restart id defined then wait until we find it
            # if not add straight in
            restart=None
        
            time.sleep(pageSleepTime)

            print n,d['name']

            if not langid.classify(d['name'])[0]=='en':
                try:
                    enName=textblob.TextBlob(d['name']).translate().string
                    enName=clean(enName)
                    print 'Translates: ',enName
                except:
                    logging.warning('Translation failed')
                    enName=None
            else:
                enName=None

            print 'http://fb.com/'+d['id'],d.get('category')
            res=getPageInfo(d['id'],raw=True)

            res['name']=clean(d['name'])
            res['about']=clean(d.get('about'))
            res['description']=clean(d.get('description'))

            ######################################################################
            posts,comments,likes=getPostsFromPage(d['id'],limit=postsLimit,raw=False)
            # Get posts,comments,likes from that page
            nAdded=nAlready=0
            for post in posts:
                if not isPostInDb(post['id']):
                    addPostToDb(post)
                    nAdded+=1
                else:
                    #logging.warning('Post %s already in DB' % post['id'])
                    nAlready+=1
            logging.warning('%d posts added (%d already in DB)' % (nAdded,nAlready))
    #        addCommentsToDb(comments)

    #        addLikesToDb(likes)
            ######################################################################

            if enName:
                res['name_en']=enName

            category=d.get('category')
            if category:
                res['category']=category

            print res.keys()
            if not isPageInDb(d['id']):
                addPageToDb(res)
            else:
                logging.info('Page %s already in DB' % d['id'])
            addTimestampToPage(d['id'])

        else:
            logging.warning('Skipping page %s. Waiting for %s to restart' % (d['id'],restart))

In [40]:
def getPostsFromPage(pageId,raw=False,limit=100):
    '''
    Requests list of posts, list of comments and 
    list of likes from a page
    Returns a list of JSON objects
    or if raw=True, a string description of posts
    '''
    
    logging.info('Getting posts,comments,likes for page %s' % pageId)
    
    tempUrl='https://graph.facebook.com/%s/posts?&limit=%d&access_token=%s' % (pageId,postsLimit,ACCESSTOKEN)
    
    out=[]
    outFull=[]
    
    comments=None
    likes=None
    
    r=requests.get(tempUrl)
    ######################################################
    success=None
    nAttempts=0

    while not success:
        # Keep looping if unsuccessful
        r=requests.get(tempUrl)
        success,skip=handleResult(r.status_code,r.text)
        # Try, find out if successful or should skip

        if skip or nAttempts==nSkip:
            # If tried nSkip times or if should skip
            r={'data':[],'paging':None}
            if nAttempts==nSkip:
                logging.warning('Skipping posts after %d attempts' % nAttempts)
                return ([],[],[])
        time.sleep(nWait)
        nAttempts+=1
    ######################################################
    
    for n,d in enumerate(r.json()['data']):
        
        time.sleep(postSleepTime)

        
        name=d.get('name')
        id=d.get('id')
        print 'Post %d %s %s' % (n,name,id)
        
        message=d.get('message')
        if message:
            message=clean(message)
        else:
            logging.warning('No message for post %s' % d['id'])
        
        description=d.get('description')
        if description:
            description=clean(description)
        else:
            logging.warning('No description for post %s' % d['id'])
        
        caption=d.get('caption')
        if caption:
            caption=clean(caption)
        else:
            logging.warning('No caption for post %s' % d['id'])
        
        d['page_id']=pageId
        d['retrieved']=time.time()
        
        if d.get('icon'):del d['icon']
        if d.get('picture'):del d['picture']
        if d.get('privacy'):del d['privacy']
        # Don't need these
        
        try:
            shareCount=d['shares']['count']
            d['shares']=shareCount
        except:
            pass
        # Simplify this
        
        if message:
#            print message
            out.append(message)
            if not langid.classify(message)[0]=='en':
                try:
                    enMessage=textblob.TextBlob(message).translate().string
                    enMessage=clean(enMessage)
                    out.append('==>'+enMessage+'---------')
                    d['en_message']=enMessage
                except:
                    logging.warning('Translation failed')
                    enMessage=None
        if description:
            out.append(description)
            if not langid.classify(description)[0]=='en':
                try:
                    enDescription=textblob.TextBlob(description).translate().string
                    enDescription=clean(enDescription)
                    out.append('==>'+enDescription+'---------')
                    d['en_description']=enDescription
                except:
                    logging.warning('Translation failed')
                    enDescription=None
        if caption:
            out.append(caption)
            if not langid.classify(caption)[0]=='en':
                try:
                    enCaption=textblob.TextBlob(caption).translate().string
                    enCaption=clean(enCaption)
                    out.append('==>'+enCaption+'---------')
                    d['en_caption']=enCaption
                except:
                    enCaption=None
                    
        try:
            comments=d['comments']
            del d['comments']
        except:
            comments=None
        
        if comments:
            logging.info('Getting comments...')
            commentData=getComments(comments,pageId)
            # This does all the paging
#            for c in commentData['data']:
#                print 'Comments data:',c.keys()
#                print 'Likes:',c[u'user_likes'],c.get('likes')
            addCommentsToDb(commentData)
            
            # TODO get comment likes
            
        try:
            likes=d['likes']
            del d['likes']
        except:
            likes=None
        
        if likes:
            logging.info('Getting likes...')
            likeData=getLikes(likes,pageId,id)
            # This does all the paging
            addLikesToDb(likeData)

        
        outFull.append(d)       
    
    if raw:
        '\n'.join(out)
        pass # return string
    else:
        return outFull,comments,likes

In [41]:
def getLikes(likes,pageId,id):
    '''
    Takes a dictionary of like data from API with keys
    [paging,data], pageId and post id. If paging information is present, keep 
    requesting pages
    '''
    
    
    if likes.get('paging'):
        current=likes
        
        while current['paging'].get('next'):
            logging.info('Paging likes... %s' % current['paging']['next'])
            current=getNextLikes(current['paging']['next'])
            if current:
                print 'Got next page of likes (current) (%d so far)' % len(likes['data'])
                likes['data'].extend(current['data'])
            else:
                break
            # TODO better error handling
                
        print 'Got %d likes after paging' % len(likes['data'])
#    print 'Likes type %s' % type(likes)
#    print likes.keys()
    
    for like in likes['data']:
        like['id']='%s_%s' % (id,like['id'])
        # Make a unique like ID made up of post id_likeid
        like['parent_id']=id
        # Keep parent id of comment/post for getting most liked content
    
#    print 'Likes',likes.keys()
#    print likes.get('paging')
#    print likes['data'][0]
    return likes

In [42]:
def getComments(comments,pageId):
    '''
    Takes a dictionary of comment data from API with keys
    [paging,data]. If paging information is present, keep
    requesting pages
    '''
    
    if comments.get('paging'):
        
        current=comments
        
        while current['paging'].get('next'):
            logging.info('Paging comments...')
            current=getNextComments(current['paging']['next'])
            comments['data'].extend(current['data'])
            
            if not current.get('paging'):
                break
    return comments

In [43]:
def getNextComments(nextToken):
    res=requests.get(nextToken)
    
    if not res.status_code==200:
        logging.warning('Error with next comments data %d' % res.status_code)
        return None
    else:
        return res.json()

In [44]:
def getNextLikes(nextToken):
#    logging.warning('Getting next likes: %s' % nextToken)
    res=requests.get(nextToken)
    
    if not res.status_code==200:
        logging.warning('Error with next likes data %d %s' % (res.status_code,res.text))
        return None
    else:
        return res.json()

In [45]:
trash=getPostsFromPage('657095547710700',raw=True,limit=10)



KeyboardInterrupt: 

In [48]:
def getPageInfo(pageId,raw=False):
    '''
    Requests info for a page by ID
    Returns the info either as a JSON object
    or if raw=True as a string to be printed
    '''
    tempUrl='https://graph.facebook.com/v2.4/'+pageId+'?fields=about,description,location,phone,talking_about_count,\
    engagement,start_info,likes,website&access_token='+ACCESSTOKEN

    res=requests.get(tempUrl)
    
    
    if not res.status_code==200:
        logging.warning('Request failed: %d %s' % (res.status_code,res.text))
    
    res=res.json()
    res['retrieved']=time.time()
    
    description=res.get('description')
    if description:
        description=clean(description)
    else:
        logging.warning('No description for page %s' % pageId)
        logging.warning('Keys %s' % res.keys())
    
    engagement=res.get('engagement')
    if engagement:
        if engagement.get(u'count'):
            res['engagement']=str(res['engagement']['count'])
    
    start_info=res.get('start_info')
    if start_info:
        date=start_info.get(u'date')
        if date:
            res['start_info_clean']=str(res['start_info']['date']['year'])
            if res['start_info']['date'].get('month'):
                res['start_info_clean']+='/'+str(res['start_info']['date']['month'])
                if res['start_info']['date'].get('day'):
                    res['start_info_clean']+='/'+str(res['start_info']['date']['day'])

        else:
            del res['start_info']
    
    for k,v in res.items():
        if type(v) in [unicode,str]:

            if not langid.classify(v)[0]=='en':

                try:
                    res[k+'_en']=textblob.TextBlob(v).translate().string
                except:
                    logging.warning('Translation failed')
                # Create a new dictionary entry with the translation
    if raw:    
        return res
    else:
#        print res.json().items()
        return '\n'.join([k+'\t\t'+unicode(v) for k,v in res.items()])+'\n================'