# Scraping data from multiple web pages with the same structure (example)
We will build the code that allow us to use wiki pages to retrieve information about a list of people.
- first we will use the [code that retrieves data about a single person](WebScraping_SinglePage-NoInstructions.ipynb) to build a functions that allows to pass a person name and that returns a list of information about that person
- second we will call the function for each one of Trump's friends

See also: 
- [more information about python functions](http://www.tutorialspoint.com/python/python_functions.htm)
- [more details about the code we use to define the function getPersonData](WebScraping_SinglePage.ipynb)

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Define a functions that takes a person name and returns a list of information about that person

In [8]:
def getPersonData( name ):
    url = 'https://en.wikipedia.org/wiki/'+name
    r  = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data)

    # create a list to store all the data
    personInfo = []

    
    # *** name ***
    # look for a div tag with a class named nickname
    # check that there is a name
    nameSection = soup.find("div", class_="nickname")
    if nameSection is not None :
        name = nameSection.get_text()

        # add the person name to the list
        personInfo.append(name)
        
    else : personInfo.append('none')

    # *** name ***
    # look for a span tag with a class named bday
    bdaySection=soup.find("span", class_="bday")
    if bdaySection is not None :
        bday=bdaySection.get_text()

        # add the person bday to the list
        personInfo.append(bday)
    
    else : personInfo.append('none')

    # *** education ***

    # find the th tag that has a text 'Education'
    edu_heading = soup.find("th", string="Education")
    if edu_heading is not None :

        # go one level up to the table row level
        edu_row = edu_heading.parent

        # get all links (this is how education info is stored)
        edu_list_with_tags = edu_row.find_all('a')

        # transform it into a list to be able to scroll through each element
        tagsList = list(edu_list_with_tags)

        # collect all [university, degree] pairs
        edu_list=[]
        for x in range(0,len(tagsList)-1, 2):
            edu_list.append([tagsList[x].string, tagsList[x+1].string])

        # add the person education to the list (converting it to a string)
        personInfo.append(str(edu_list))

    else : personInfo.append('none')


    # *** children ***

    # find the th tag that has a text 'Children'
    children_heading = soup.find("th", string="Children")
    if children_heading is not None :

        # retreive the number of children by getting the text of the next siblin
        numbChildren = children_heading.next_sibling.string

        # add the person children to the list
        personInfo.append(numbChildren)

    else : personInfo.append('none')

    return personInfo

# the line below tests the function
getPersonData('Reince_Priebus')

['Reinhold Richard Priebus',
 '1972-03-18',
 "[['University of Wisconsin–Whitewater', 'BA'], ['University of Miami', 'JD']]",
 '2']

In [None]:
# shows what the getPersonData returns for a non-person
getPersonData('DRUDGE REPORT')

## Call the function for each one of Trump's friends

In [19]:
# read the file with the geocoded twitter data and build the dataFrame
df = pd.read_csv('geocodedTwitterData.csv', sep=',')
df.head()

Unnamed: 0.1,Unnamed: 0,screen_name,name,followers_count,location,lat,lon
0,0,TuckerCarlson,Tucker Carlson,1064787,"Washington, DC",38.895009,-77.036563
1,3,WhiteHouse,The White House,15327283,"Washington, DC",38.895009,-77.036563
2,5,KellyannePolls,Kellyanne Conway,1664393,"Washington, DC",38.895009,-77.036563
3,6,Reince,Reince Priebus,941927,"Kenosha, WI",42.584677,-87.821226
4,10,RealRomaDowney,Roma Downey,192126,Malibu,34.035591,-118.689423


In [20]:
# call the getPersonData function for all names in the column 'name' of the dataFrame
allInfo = []
for name in df['name']:
    #store the data returned by the fuction in the list allInfo
    allInfo.append(getPersonData(name))
allInfo

[['Tucker McNear Carlson[1]', '1969-05-16', '[]', '4'],
 ['none', '1792-10-13', 'none', 'none'],
 ['Kellyanne Elizabeth Fitzpatrick',
  '1967-01-20',
  "[['Trinity Washington University', 'BA'], ['George Washington University', 'JD']]",
  '4'],
 ['Reinhold Richard Priebus',
  '1972-03-18',
  "[['University of Wisconsin–Whitewater', 'BA'], ['University of Miami', 'JD']]",
  '2'],
 ['none', '1960-05-06', 'none', '1 daughter, 2 stepsons'],
 ['none', '1923', 'none', 'none'],
 ['Laura Anne Ingraham',
  '1963-06-19',
  "[['Dartmouth College', 'BA'], ['University of Virginia', 'JD']]",
  '3'],
 ['none', 'none', 'none', 'none'],
 ['none', 'none', 'none', 'none'],
 ['Lara Lea Yunaska', '1982-10-12', 'none', '1'],
 ['Sean Patrick Hannity[1]', '1961-12-30', '[]', '2'],
 ['none', '1973-09-18', "[[None, 'BA'], ['American University', 'MA']]", '4'],
 ['Ann Hart Coulter',
  '1961-12-08',
  "[['Cornell University', 'BA'], ['University of Michigan', 'JD']]",
  'none'],
 ['none', 'none', 'none', 'none']

## Add the information found to the original DataFrame

In [23]:
# create a new dataFrame from the allInfo list
df_new = pd.DataFrame(allInfo)

# assign column names
df_new=df_new.rename(columns = {0:'nickName', 1:'bday', 2:'education', 3:'children'})

# add the columns to the original dataFrame
df['nickName']=df_new['nickName']
df['bday']=df_new['bday']
df['education']=df_new['education']
df['children']=df_new['children']

In [24]:
df

Unnamed: 0.1,Unnamed: 0,screen_name,name,followers_count,location,lat,lon,nickName,bday,education,children
0,0,TuckerCarlson,Tucker Carlson,1064787,"Washington, DC",38.895009,-77.036563,Tucker McNear Carlson[1],1969-05-16,[],4
1,3,WhiteHouse,The White House,15327283,"Washington, DC",38.895009,-77.036563,none,1792-10-13,none,none
2,5,KellyannePolls,Kellyanne Conway,1664393,"Washington, DC",38.895009,-77.036563,Kellyanne Elizabeth Fitzpatrick,1967-01-20,"[['Trinity Washington University', 'BA'], ['Ge...",4
3,6,Reince,Reince Priebus,941927,"Kenosha, WI",42.584677,-87.821226,Reinhold Richard Priebus,1972-03-18,"[['University of Wisconsin–Whitewater', 'BA'],...",2
4,10,RealRomaDowney,Roma Downey,192126,Malibu,34.035591,-118.689423,none,1960-05-06,none,"1 daughter, 2 stepsons"
5,11,Trump,Trump Organization,254702,"New York, NY",40.730862,-73.987156,none,1923,none,none
6,14,IngrahamAngle,Laura Ingraham,1664852,"Washington, DC",38.895009,-77.036563,Laura Anne Ingraham,1963-06-19,"[['Dartmouth College', 'BA'], ['University of ...",3
7,16,TeamTrump,Official Team Trump,742514,USA,39.783730,-100.445882,none,none,none,none
8,17,DRUDGE_REPORT,DRUDGE REPORT,1319753,USA,39.783730,-100.445882,none,none,none,none
9,19,LaraLeaTrump,Lara Trump,301998,"New York, NY",40.730862,-73.987156,Lara Lea Yunaska,1982-10-12,none,1


In [29]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df

Unnamed: 0,screen_name,name,followers_count,location,lat,lon,nickName,bday,education,children
0,TuckerCarlson,Tucker Carlson,1064787,"Washington, DC",38.895009,-77.036563,Tucker McNear Carlson[1],1969-05-16,[],4
1,WhiteHouse,The White House,15327283,"Washington, DC",38.895009,-77.036563,none,1792-10-13,none,none
2,KellyannePolls,Kellyanne Conway,1664393,"Washington, DC",38.895009,-77.036563,Kellyanne Elizabeth Fitzpatrick,1967-01-20,"[['Trinity Washington University', 'BA'], ['Ge...",4
3,Reince,Reince Priebus,941927,"Kenosha, WI",42.584677,-87.821226,Reinhold Richard Priebus,1972-03-18,"[['University of Wisconsin–Whitewater', 'BA'],...",2
4,RealRomaDowney,Roma Downey,192126,Malibu,34.035591,-118.689423,none,1960-05-06,none,"1 daughter, 2 stepsons"
5,Trump,Trump Organization,254702,"New York, NY",40.730862,-73.987156,none,1923,none,none
6,IngrahamAngle,Laura Ingraham,1664852,"Washington, DC",38.895009,-77.036563,Laura Anne Ingraham,1963-06-19,"[['Dartmouth College', 'BA'], ['University of ...",3
7,TeamTrump,Official Team Trump,742514,USA,39.783730,-100.445882,none,none,none,none
8,DRUDGE_REPORT,DRUDGE REPORT,1319753,USA,39.783730,-100.445882,none,none,none,none
9,LaraLeaTrump,Lara Trump,301998,"New York, NY",40.730862,-73.987156,Lara Lea Yunaska,1982-10-12,none,1


In [30]:
df.to_csv('geocodedAndCompletedTwittedData.csv')