In [1]:
import urllib
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime, date
import random

# Link Page

<b>Scraping the page where the list of Nepalese politicians is populated.</b>

In [2]:
# Download the Wikipedia page that lists Nepalese politicians.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of parsing an error page later.
main_page = requests.get('https://en.wikipedia.org/wiki/List_of_Nepalese_politicians', timeout=30)
main_page.raise_for_status()

In [3]:
# Parse the downloaded page body into a BeautifulSoup tree using the "html.parser" backend.
main_soup = BeautifulSoup(main_page.content,"html.parser")

In [4]:
# Uncomment the line below to see html of the page
# print(main_soup.prettify())

In [5]:
# Gather the href value of every <a> tag on the list page that has one.
wiki_links = [anchor['href'] for anchor in main_soup.find_all('a', href=True)]

In [6]:
# Spot-check ten randomly chosen hrefs. No random seed is set in the visible cells,
# so the sample shown differs from run to run.
random.sample(wiki_links, 10)

['/wiki/Bishal_Khadka',
 '/wiki/Balbir_Prasad_Chaudhary',
 '/wiki/Chandra_Prakash_Mainali',
 '#cite_ref-81',
 '/wiki/Dama_Kumari_Sharma',
 'https://web.archive.org/web/20070927023111/http://www.kantipuronline.com/kolnews.php?&nid=106447',
 '#cite_ref-election-commission4_13-3',
 '#cite_ref-election-commission1_1-11',
 '#cite_ref-123',
 '#cite_ref-myrepublica10_49-1']

In [7]:
# Keep only hrefs that start with '/wiki/' (internal Wikipedia pages).
# This still includes non-politician pages such as 'Category:' and 'Wikipedia:' links.
poli_links = [href for href in wiki_links if href.startswith('/wiki/')]

In [8]:
# Spot-check ten of the '/wiki/' links that passed the filter (unseeded, so output varies).
random.sample(poli_links, 10)

['/wiki/Wayback_Machine',
 '/wiki/Category:Articles_with_dead_external_links_from_September_2017',
 '/wiki/Gobinda_Bahadur_Shah',
 '/wiki/Basanta_Kumar_Nemwang',
 '/wiki/Kailash_Nath_Kasudhan',
 '/wiki/Til_Kumar_Menyangbo_Limbu',
 '/wiki/Chandra_Lal_Meche',
 '/wiki/Wikipedia:General_disclaimer',
 '/wiki/Krishna_Kumar_Chaudhari',
 '/wiki/Golchhe_Sarki']

<b>We now have all internal links from the page (general Wikipedia links such as About and Home, as well as the politician links). We can select the politician links by slicing the list.</b>

# Scraping - POC for single link 

<b> Getting Name and Date of birth of a single Politician. This is done as a proof of concept before writing the final function. </b>

In [9]:
# Download a single politician's page for the proof of concept.
# The timeout avoids an indefinite hang; raise_for_status() surfaces HTTP errors immediately.
page = requests.get('https://en.wikipedia.org/wiki/Girija_Prasad_Koirala', timeout=30)
page.raise_for_status()

In [10]:
# Parse the single politician page into a BeautifulSoup tree.
soup_single= BeautifulSoup(page.content,'html.parser')

In [11]:
# Uncomment the line below to see html of the page
# print(soup_single.prettify())

<b>Since the date of birth is stored in the infobox on the right-hand side of each page, we select that element.</b>

In [12]:
# Look up <table> elements using the class filter "infobox vcard".
# NOTE(review): box_detail is not referenced by any later visible cell.
box_detail = soup_single.find_all('table',class_=["infobox vcard"])

In [13]:
# Take the string of the first <span class="bday"> on the page.
# Indexing with [0] raises IndexError when the page has no such span.
bday = soup_single.find_all('span',class_=["bday"])[0].string

In [14]:
# Show the extracted birth date string.
bday

'1924-07-04'

In [15]:
# Find the "Died" label and print the first child of the <td> that follows it.
# `string=` and `find_next` are the current names for the deprecated `text=` and `findNext`.
print(soup_single.find(string="Died").find_next('td').contents[0])

20 March 2010


# Function

In [88]:
def scrapebirth(links):
    """Scrape the page title, birth date and death date for each Wikipedia link.

    Parameters
    ----------
    links : iterable of str
        Site-relative paths such as '/wiki/Some_Page'. Each one is appended to
        'https://en.wikipedia.org' and fetched with one HTTP request.

    Returns
    -------
    pandas.DataFrame
        One row per link, in input order, with columns:
        'Name'  - string of the page's first <title> tag,
        'Bday'  - string of the first <span class="bday">, or "" if there is none,
        'Death' - first child of the <td> following the "Died" label, or "" if
                  the label or the cell content is missing.

    Raises
    ------
    IndexError
        If a fetched page has no <title> tag.
    requests.exceptions.RequestException
        If a request fails or exceeds the 30 second timeout.
    """
    # Collect plain dicts and build the frame once at the end: appending to a
    # DataFrame inside the loop copies it every iteration, and DataFrame.append
    # no longer exists in pandas 2.x.
    records = []
    for path in links:
        page = requests.get("https://en.wikipedia.org" + path, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")

        name = soup.find_all('title')[0].string
        try:
            bday = soup.find_all('span', class_=["bday"])[0].string
        except IndexError:
            # No bday span on this page.
            bday = ""
        try:
            death = soup.find(string="Died").find_next('td').contents[0]
        except (AttributeError, IndexError):
            # AttributeError: no "Died" label or no following <td>;
            # IndexError: the <td> exists but is empty.
            death = ""
        records.append({'Name': name, 'Bday': bday, 'Death': death})
    return pd.DataFrame(records, columns=['Name', 'Bday', 'Death'])

In [89]:
# Scrape every '/wiki/' link collected earlier; scrapebirth issues one HTTP request per link.
my_df = scrapebirth(poli_links)

In [90]:
# Inspect ten random rows of the scraped data (unseeded, so the rows shown vary).
my_df.sample(10)

Unnamed: 0,Name,Bday,Death
21,Salim Miya Ansari - Wikipedia,,
29,Sudarshan Baral - Wikipedia,,
56,Sabitri Bogati (Pathak) - Wikipedia,,
296,Sukra Raj Sonyok (Songyokpa) - Wikipedia,,
197,Ishwar Dayal Mishra - Wikipedia,,
15,Rabindra Prasad Adhikari - Wikipedia,,"February 27, 2019"
274,Binod Kumar Shah - Wikipedia,,
236,Ram Chandra Pokhrel - Wikipedia,,
53,Nara Bahadur Bista - Wikipedia,,
199,Muhammad Okil Musalman - Wikipedia,,


## Clean dataframe

In [81]:
# Keep only rows whose 'Bday' is truthy; scrapebirth stores "" when no birth date was found.
my_df = my_df[my_df['Bday'].astype(bool)]

In [82]:
# Drop the last 18 rows.
# NOTE(review): 18 is an unexplained magic number, presumably the trailing non-politician
# pages; confirm. Because my_df is rebound to a slice of itself, every re-run of this cell
# removes another 18 rows.
my_df = my_df[:-18]

In [83]:
# Continue on an independent copy so the following edits do not modify my_df.
df_poli = my_df.copy()

In [84]:
# Remove the " - Wikipedia" suffix that the scraped page titles end with.
# Only an actual suffix is removed (rather than always cutting 12 characters), so
# running this cell again leaves already-cleaned names untouched.
df_poli['Name'] = df_poli['Name'].str.replace(r' - Wikipedia$', '', regex=True)

In [85]:
# Make 'Name' the index, modifying df_poli in place.
# NOTE(review): afterwards 'Name' is no longer a column, so re-running this cell on the
# same df_poli raises KeyError; re-create df_poli from my_df first.
df_poli.set_index('Name',inplace=True)

<h3>Filter politicians who are dead</h3>

In [86]:
# A row counts as dead when its 'Death' value, rendered as text, is not the empty string.
is_dead = df_poli['Death'].astype(str) != ''
dead_poli = df_poli.loc[is_dead]
dead_poli

Unnamed: 0_level_0,Bday,Death,Namen
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Post Ba,1953-07-18,15 September 2014,Post Bahadur Bogati
Bhagw,1940-01-01,15 November 1998,Bhagwan Das Gupta


In [87]:
# Living politicians are the rows whose 'Death' value, rendered as text, is empty.
# Each row is judged by its own 'Death' field rather than by whether its index label
# appears among the dead, so a living person who shares a name label with a dead
# person is kept.
alive_poli = df_poli[df_poli['Death'].astype(str) == '']
alive_poli.head()

Unnamed: 0_level_0,Bday,Death,Namen
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,1959-04-27,,Bhim Acharya
Ma,1954-08-06,,Mahesh Acharya
Nara,1953-09-27,,Narahari Acharya
Dilendra,1954-01-09,,Dilendra Prasad Badu
Khad,1928-04-17,,Khadgajeet Baral


In [60]:
def calculate_age(born, today=None):
    """Return the age in completed years for a birth date string.

    Parameters
    ----------
    born : str
        Birth date formatted as 'YYYY-MM-DD'.
    today : datetime.date, optional
        Date to measure the age against. Defaults to ``date.today()``; pass a
        fixed date to get results that do not change from day to day.

    Returns
    -------
    int
        Number of full years between `born` and `today`.

    Raises
    ------
    ValueError
        If `born` does not match the '%Y-%m-%d' format.
    """
    birth = datetime.strptime(born, '%Y-%m-%d').date()
    if today is None:
        today = date.today()
    # The tuple comparison is True (counted as 1) when this year's birthday has not
    # happened yet, which takes one year off the plain year difference.
    birthday_pending = (today.month, today.day) < (birth.month, birth.day)
    return today.year - birth.year - birthday_pending

In [61]:
# Build a new frame from the living politicians with an 'Age' column computed from each 'Bday'.
final_df = alive_poli.assign(Age=alive_poli['Bday'].apply(calculate_age))

In [69]:
# (rows, columns) of the final table.
final_df.shape

(54, 5)

In [62]:
# Display the table of living politicians with their computed ages.
final_df

Unnamed: 0,Name,Bday,Death,Namen,Age
4,Bhim Acharya,1959-04-27,,Bhim Acharya,61
8,Mahesh Acharya,1954-08-06,,Mahesh Acharya,66
9,Narahari Acharya,1953-09-27,,Narahari Acharya,67
25,Dilendra Prasad Badu,1954-01-09,,Dilendra Prasad Badu,66
28,Khadgajeet Baral,1928-04-17,,Khadgajeet Baral,92
34,Shakti Bahadur Basnet,1971-04-14,,Shakti Bahadur Basnet,49
40,Damodar Bhandari,1973-11-05,,Damodar Bhandari,46
45,Dev Raj Bhar,1952-01-29,,Dev Raj Bhar,68
47,Lekh Raj Bhatta,1960-03-24,,Lekh Raj Bhatta,60
48,Baburam Bhattarai,1954-06-18,,Baburam Bhattarai,66


In [63]:
# Median age of the living politicians that have a scraped birth date.
final_df['Age'].median()

62.5

In [64]:
# Mean age over the same rows.
final_df['Age'].mean()

62.870370370370374

In [65]:
# Number of non-missing 'Age' values the statistics above are based on.
final_df['Age'].count()

54