Clean code and Experimentation

In [1]:
# Imports
import requests
from lxml import html
import datetime
from itertools import chain
import pandas as pd
from collections import OrderedDict
import re
import time

## Parameters

In [2]:
# base URL
base_url = 'http://www.nairaland.com'

## Utility Functions

In [3]:
# Flatten List of Lists
def flattenLL(listOfLists):
    "Flatten one level of nesting"
    return list(chain.from_iterable(listOfLists))

# Get next Word (string) after a target string 
def nextWord(target, source):
    source = source.split()
    for i, w in enumerate(source):
        if w == target:
            return source[i+1]

# Get next two words after a target string
def next2Words(target, source):
    source = source.split()
    for i, w in enumerate(source):
        if w == target:
            nxt2 = print(source[i+1],source[i+2])
            return(nxt2)

# Get word (string) before a target string        
def beforeWord(target, source):
    source = source.split()
    for i, w in enumerate(source):
        if w == target:
            return source[i-1]

### 1. Start: Load the Base page

In [4]:
# Get a htmlElement of a particular link
def getPagehtmlElems(link):
    response = requests.get(link) #get page data from server, block redirects
    sourceCode = response.content #get string of source code from response
    htmlElem = html.document_fromstring(sourceCode) #make HTML element object
    return(htmlElem)

### 2. Get Sections Links

In [5]:
# Get sections Links given the base page htmlContent
def getSectionsLinks(base_url,base_htmlContent):
    section_panel =base_htmlContent.cssselect('[class="boards"]') #panel-board sections of all section element
    nlist_sectns_links = [sp.xpath('@href') for sp in section_panel[0].cssselect('a')] # list of list of hyperlinks
    # Extract links
    list_sub_links = flattenLL(nlist_sectns_links)
    #format the link relative to the base URL
    sections_links = []
    for ref in list_sub_links:
        sections_links.append(base_url+ref)
    return(sections_links)

### 3. Iterate the Section by pages and Generate links

#### Get Number of pages in a section: given section front page html_content

In [6]:
## Get Number of pages in a section: given section front page html_content
def getNumberOfPages(sctn_frontPage):
    s_pgCount = sctn_frontPage.xpath('/html/body/div[1]/p[4]/b[2]')
    if s_pgCount != []:
        return(s_pgCount[0].text)
    else:
        s_pgCount = sctn_frontPage.xpath('/html/body/div[1]/p[3]/b[2]')
        return(int(s_pgCount[0].text))

##### Construct Section pages URL

In [7]:
# Construct Section pages URL: Given Section Base Link and Section page count
# NB last pages are often skipped (0:length(section_pgsCount)-1)
def constructPagesLinks(sctn_baseLink,sctn_pgsCount):
    sctns_pagesLinks = [sctn_baseLink+'/'+str(i) for i in range(int(sctn_pgsCount))]
    return(sctns_pagesLinks)

### 4. Iterate  Pages for Post Panel Info

#### Get Post Panels from Post Table on the page

In [8]:
# Get Post Panels
def getPostsPanel(pageLink):
    # 1. Getting page
    page = getPagehtmlElems(pageLink)
    # 2. Get post table
    post_table = page.xpath('/html/body/div[1]/table[3]')[0] # Target the post table on the page
    # 3. Get post Panels
    post_panelsList = post_table.cssselect('td')  # Extract post panels
    
    if len(post_panelsList) > 1:
        return(post_panelsList)
    elif len(post_panelsList) == 1:
        post_table2 = page.xpath('/html/body/div[1]/table[2]')[0] # Target the post table on the page
        # 3. Get post Panels
        post_panelsList2 = post_table2.cssselect('td')  # Extract post panels
        return(post_panelsList2)

### 5. Extract Info from Post panels

#### Extraction Functions

In [9]:
# Get Post title form post panel
# breaking the panel elements into section using the css-tag 'b' (-bold)
def getPostTitle(postPanel):
    post_title = [panelSctns.text_content() for panelSctns in postPanel.cssselect('b')][0]
    return(post_title.strip())

# Get Post Author
def getAuthor(postPanel):
    ppTextContent = postPanel.text_content()
    author = nextWord('by',ppTextContent)
    return(author.replace('.',''))

# Get Number of post view
def getViewCount(postPanel):
    ppTextContent = postPanel.text_content()
    vcount = beforeWord('views.',ppTextContent)
    if vcount == None:
        return(0)
    else:
        return(vcount)
    

# Get Post Count
def getPostCount(postPanel):
    ppTextContent = postPanel.text_content()
    pscount = beforeWord('posts',ppTextContent) 
    if pscount == None:
        return(0)
    else:
        return(pscount)

# Get Post time
# Using Regular Expression
def getTime(postPanel):
    ppTextContent = postPanel.text_content() 
    matcher = re.compile(r".* (\d{1,2}:\d{2}(pm|am))")
    m = matcher.match(ppTextContent)
    if m == None:
        return('NA')
    else:
        return(m.group(1))

# Get Post Date
# Using utility function as
def getDate(postPanel,currentDay,currentMonth,currentYear):
    ppTextContent = postPanel.text_content()
    dateMatcher = re.compile(r".* (\d{1,2}:\d{2}(pm|am))")
    dateContent = dateMatcher.split(ppTextContent)[3]
    split = dateContent.split('On')
    nsplit = len(split) # Check if postPanel text contain 'On'
    if nsplit == 1:  # Contain no Date
        noDate = (currentMonth,str(currentDay)+',',currentYear)
        return(' '.join(noDate))
    elif nsplit == 2: # Contain Date
        split2 = split[1].split() #split the part containing date element
        nsplit2 = len(split2) # Check if Year is included
        if nsplit2 == 3:
            date_noYear = (split2[0],split2[1]+',',currentYear)
            return(' '.join(date_noYear))
        elif nsplit2 == 4:
            date_wtYear = (split2[0],split2[1],split2[2])
            return(' '.join(date_wtYear))
        else:
            return('NA')
    else:
        return('NA')

##### Extract Given Page Post Panels and populate a local database

In [10]:
# Extract Given Page Post Panels and populate a local database
def extractPagePostPanels(post_panelsList,currDay,currMnt,currYr):
    pgDB_postTitles = []
    pgDB_authors = []
    pgDB_postCounts = []
    pgDB_viewCounts= []
    pgDB_times = []
    pgDB_dates = []
    # Exploring each post panel
    if post_panelsList != []:
        for i in range(len(post_panelsList)):
            postPanel = post_panelsList[i]
            if postPanel != []:
                pgDB_postTitles.append(getPostTitle(postPanel))
                pgDB_authors.append(getAuthor(postPanel))
                pgDB_postCounts.append(getPostCount(postPanel))
                pgDB_viewCounts.append(getViewCount(postPanel))
                pgDB_times.append(getTime(postPanel))
                pgDB_dates.append(getDate(postPanel,currDay,currMnt,currYr))
    return(pgDB_postTitles,pgDB_authors,pgDB_postCounts,pgDB_viewCounts,pgDB_times,pgDB_dates)

### 6. Compact: Iteration over pages

In [11]:
### Generalizing
def getSectionDB(sectionPagesLinks,currDay,currMnt,currYr):
    # 1. Initializing Database 
    DB_post_title = []
    DB_author = []
    DB_post_cnt =[]
    DB_view_cnt = []
    DB_time =[]
    DB_date = []
    
    colNames = ['Post Title', 'Author', 'Post Count', 'View Count', 'Time', 'Date']
    
    # 2. Iterate over pages
    pgsCount = len(sectionPagesLinks) # Total Number of pages 
    for pc in range(pgsCount):
        if pc>0 and pc%200 == 0:
            print("pausing for 5sec")
            time.sleep(5)
            #Print progress per page
            print('getting page ',pc+1 ,'/',pgsCount,'\t')
            # A. Iterate over pages
            pageLink = sectionPagesLinks[pc]
            # B. Get posts panel List
            pgPostPanelList = getPostsPanel(pageLink) 
            # Info about number of post topic on the page (Optional)
            #print('Extracting ',len(pgPostPanelList),' topic-postes')
    
            # C. Extract Given Page Post Panels (get a local DB)
            infoExtrct = []
            infoExtrct = extractPagePostPanels(pgPostPanelList,
                                           currDay,currMnt,currYr)
        
            # D.  Extend Database per page
            DB_post_title.extend(infoExtrct[0])
            DB_author.extend(infoExtrct[1])
            DB_post_cnt.extend(infoExtrct[2])
            DB_view_cnt.extend(infoExtrct[3])
            DB_time.extend(infoExtrct[4])
            DB_date.extend(infoExtrct[5])
            #Reset Extractor Tuple
            infoExtrct =[]
        else:
            #Print progress per page
            print('getting page ',pc+1 ,'/',pgsCount,'\t')
    
            # A. Iterate over pages
            pageLink = sectionPagesLinks[pc]
    
            # B. Get posts panel List
            pgPostPanelList = getPostsPanel(pageLink) 
            # Info about number of post topic on the page (Optional)
            #print('Extracting ',len(pgPostPanelList),' topic-postes')
    
            # C. Extract Given Page Post Panels (get a local DB)
            infoExtrct = []
            infoExtrct = extractPagePostPanels(pgPostPanelList,
                                           currDay,currMnt,currYr)
        
            # D.  Extend Database per page
            DB_post_title.extend(infoExtrct[0])
            DB_author.extend(infoExtrct[1])
            DB_post_cnt.extend(infoExtrct[2])
            DB_view_cnt.extend(infoExtrct[3])
            DB_time.extend(infoExtrct[4])
            DB_date.extend(infoExtrct[5])
            #Reset Extractor Tuple
            infoExtrct =[]
            
        # 3. Prepare Final Data Frame
        DB = pd.DataFrame({ 'Post Title' : DB_post_title,
                                'Author' : DB_author,
                                'Post Count' : DB_post_cnt,
                                'View Count' : DB_view_cnt,
                                'Time' : DB_time,
                                'Date' : DB_date
                              },  columns = colNames)
            
    return(DB)

In [12]:
# Get Specific Section Pages URLs
def getSpecSectionPagesLinks(sctn_baseLink):
    # Get Front Page 
    sctn_frontPage = getPagehtmlElems(sctn_baseLink)
    # Get the total number of pages in this section
    sctn_pgsCount = getNumberOfPages(sctn_frontPage)
    sctn_pgsCount
    ## Construct Section pages URL
    sctn_pgsLinks = constructPagesLinks(sctn_baseLink,sctn_pgsCount)
    return(sctn_pgsLinks) 

In [13]:
def ScrapperInterface(id,sections_links,currDay, currMnt, currYr):
    sctn_baseLink = sections_links[id]
    sctn_pgsLinks = getSpecSectionPagesLinks(sctn_baseLink)
    sctn_DB = getSectionDB(sctn_pgsLinks, currDay, currMnt, currYr)
    return(sctn_DB)
    

In [14]:
### Generalizing
def getSectionDB2(sectionPagesLinks,currDay,currMnt,currYr):
    # 1. Initializing Database 
    DB_post_title = []
    DB_author = []
    DB_post_cnt =[]
    DB_view_cnt = []
    DB_time =[]
    DB_date = []
    
    colNames = ['Post Title', 'Author', 'Post Count', 'View Count', 'Time', 'Date']
    
    # 2. Iterate over pages
    pgsCount = len(sectionPagesLinks) # Total Number of pages 
    for pc in range(pgsCount):
        if pc>0 and pc%200 == 0:
            print("pausing for 5sec")
            time.sleep(5)
            #Print progress per page
            print('getting page ',pc+1 ,'/',pgsCount,'\t')
            # A. Iterate over pages
            try:
                pageLink = sectionPagesLinks[pc]
            except:
                print('something is wrong at, '+str(pc))
                time.sleep(5)
                continue
            # B. Get posts panel List
            try:
                pgPostPanelList = getPostsPanel(pageLink) 
            except:
                print('something is wrong at, '+str(pc))
                time.sleep(5)
                continue
            # Info about number of post topic on the page (Optional)
            #print('Extracting ',len(pgPostPanelList),' topic-postes')
    
            # C. Extract Given Page Post Panels (get a local DB)
            try:
                infoExtrct = []
                infoExtrct = extractPagePostPanels(pgPostPanelList,
                                           currDay,currMnt,currYr)
            except:
                print('something is wrong at, '+str(pc))
                time.sleep(5)
                continue
            # D.  Extend Database per page
            DB_post_title.extend(infoExtrct[0])
            DB_author.extend(infoExtrct[1])
            DB_post_cnt.extend(infoExtrct[2])
            DB_view_cnt.extend(infoExtrct[3])
            DB_time.extend(infoExtrct[4])
            DB_date.extend(infoExtrct[5])
            #Reset Extractor Tuple
            infoExtrct =[]
        else:
            #Print progress per page
            print('getting page ',pc+1 ,'/',pgsCount,'\t')
    
            # A. Iterate over pages
            try:
                pageLink = sectionPagesLinks[pc]
            except:
                print('something is wrong at, '+str(pc))
                time.sleep(5)
                continue
    
            # B. Get posts panel List
            try:
                pgPostPanelList = getPostsPanel(pageLink) 
            except:
                print('something is wrong at, '+str(pc))
                time.sleep(5)
                continue
            # Info about number of post topic on the page (Optional)
            #print('Extracting ',len(pgPostPanelList),' topic-postes')
    
            # C. Extract Given Page Post Panels (get a local DB)
            try:
                infoExtrct = []
                infoExtrct = extractPagePostPanels(pgPostPanelList,
                                           currDay,currMnt,currYr)
            except:
                print('something is wrong at, '+str(pc))
                time.sleep(5)
                continue
        
            # D.  Extend Database per page
            DB_post_title.extend(infoExtrct[0])
            DB_author.extend(infoExtrct[1])
            DB_post_cnt.extend(infoExtrct[2])
            DB_view_cnt.extend(infoExtrct[3])
            DB_time.extend(infoExtrct[4])
            DB_date.extend(infoExtrct[5])
            #Reset Extractor Tuple
            infoExtrct =[]
            
        # 3. Prepare Final Data Frame
        DB = pd.DataFrame({ 'Post Title' : DB_post_title,
                                'Author' : DB_author,
                                'Post Count' : DB_post_cnt,
                                'View Count' : DB_view_cnt,
                                'Time' : DB_time,
                                'Date' : DB_date
                              },  columns = colNames)
            
    return(DB)

In [15]:
# More Control
def ScrapperInterface2(id,sections_links,currDay, currMnt, currYr,start = None, stop=None):
    sctn_baseLink = sections_links[id]
    sctn_pgsLinks = getSpecSectionPagesLinks(sctn_baseLink)
    if start == None:
        sctn_DB = getSectionDB2(sctn_pgsLinks, currDay, currMnt, currYr)
        return(sctn_DB)
    else:
        if stop == None:
            #spIndx = len(sctn_pgsLinks)
            sctn_DB = getSectionDB2(sctn_pgsLinks[start:len(sctn_pgsLinks)], currDay, currMnt, currYr)
            return(sctn_DB)
        else:
            #stpIndx = stop
            sctn_DB = getSectionDB2(sctn_pgsLinks[start:stop], currDay, currMnt, currYr)
            return(sctn_DB)
        
    

## Final: Interface

In [16]:
# base URL
base_url = 'http://www.nairaland.com'
# load Base Page
base_page = getPagehtmlElems(base_url)
#get all sections URLs 
sections_links = getSectionsLinks(base_url,base_page)

In [18]:
# Specify Section link
sections_links[6]

'http://www.nairaland.com/business'

In [19]:
pgF = getPagehtmlElems(sections_links[6])
getNumberOfPages(pgF)

'2552'

In [None]:
# Get Specified Section DB
currDay = 13
currMnt = 'Apr'
currYr = '2018'

business_DBP1 = ScrapperInterface2(6,sections_links,
                                 currDay, currMnt, currYr,start = 0,stop=3717)

getting page  1 / 2552 	
getting page  2 / 2552 	
getting page  3 / 2552 	
getting page  4 / 2552 	
getting page  5 / 2552 	
getting page  6 / 2552 	
getting page  7 / 2552 	
getting page  8 / 2552 	
getting page  9 / 2552 	
getting page  10 / 2552 	
getting page  11 / 2552 	
getting page  12 / 2552 	
getting page  13 / 2552 	
getting page  14 / 2552 	
getting page  15 / 2552 	
getting page  16 / 2552 	
getting page  17 / 2552 	
getting page  18 / 2552 	
getting page  19 / 2552 	
getting page  20 / 2552 	
getting page  21 / 2552 	
getting page  22 / 2552 	
getting page  23 / 2552 	
getting page  24 / 2552 	
getting page  25 / 2552 	
getting page  26 / 2552 	
getting page  27 / 2552 	
getting page  28 / 2552 	
getting page  29 / 2552 	
getting page  30 / 2552 	
getting page  31 / 2552 	
getting page  32 / 2552 	
getting page  33 / 2552 	
getting page  34 / 2552 	
getting page  35 / 2552 	
getting page  36 / 2552 	
getting page  37 / 2552 	
getting page  38 / 2552 	
getting page  39 / 25

In [59]:
Business_DB = pd.concat([business_DBP1])

In [60]:
Jobs_DB

Unnamed: 0,Post Title,Author,Post Count,View Count,Time,Date
0,Jobs/vacancies Section Chatroom,davide470,46075,1621956,12:54pm,"Apr 13, 2018"
1,How To Identify A Scam Interview Invitation,LaurelP,3963,681612,8:08am,"Apr 13, 2018"
2,Jobs/Vacancies Section Directory!,davide470,0,125628,7:01am,"Jan 04, 2016"
3,"This Tuesday, April 10th At 8pm, Please Join A...",npowerng,23,657,1:22pm,"Apr 13, 2018"
4,Post Abuja Jobs Here,MsSteph,19221,1054710,1:21pm,"Apr 13, 2018"
5,Hospitality Jobs @ It's Finest,wynerz1,4,74,1:20pm,"Apr 13, 2018"
6,GNLD Happened To Me Today For The First Time,Earthbound,126,11452,1:19pm,"Apr 13, 2018"
7,Urgent Vacancies,Ntukidem,2,105,1:18pm,"Apr 13, 2018"
8,Job Opportunity: Earn Over 450k,SteadyIncome11,11,673,1:11pm,"Apr 13, 2018"
9,2018 Pre-employment Internship Scheme At UAC O...,admart,0,13,1:08pm,"Apr 13, 2018"


In [61]:
# DB to CSV
Jobs_DB.to_csv('../Data/Jobs DB.csv')




#Career_DB.to_csv('../Data/CareerDB.csv')
#Politics_DB.to_csv('../Data/PoliticsDB.csv')
#artGV_DB.to_csv('../Data/ArtGraphicsVideoDB.csv')

In [51]:
#



#INVESTIGATION





#

In [None]:
#sctn_baseLink = sections_links[2]
# Get Front Page 
#sctn_frontPage = getPagehtmlElems(sctn_baseLink)
# Get the total number of pages in this section
#sctn_pgsCount = getNumberOfPages(sctn_frontPage)
#sctn_pgsCount
## Construct Section pages URL
#sctn_pgsLinks = constructPagesLinks(sctn_baseLink,sctn_pgsCount)

#sctn_DB = getSectionDB(sctn_pgsLinks, currDay, currMnt, currYr)

In [None]:
#plink = sctn_pgsLinks[571]
#pg = getPagehtmlElems(plink)

In [None]:
#tbl = pg.xpath('/html/body/div[1]/table[3]')[0]
#pnls = tbl.cssselect('td')
p_panelsList = getPostsPanel(plink)
len(p_panelsList)

In [None]:
#[ getTime(pnl) for pnl in p_panelsList[32:35]]

#getTime(p_panelsList[34])

p_panelsList[34].text
#extractPagePostPanels(p_panelsList, currDay, currMnt, currYr)

In [None]:
p_panelsList[0].text_content()

In [None]:
#/html/body/div[1]/table[2]

    

In [None]:
p = 10
if p>0 and p%2 == 0:
    print('Yrs')
else:
    print('NO')