In [1]:
import urllib
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime, date
import random

# Link Page

<b>Scraping the page where the list of Nepalese politicians is populated.</b>

In [2]:
# Download the list page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() makes a 4xx/5xx response fail here instead of
# letting the following cells silently parse an error page.
main_page = requests.get('https://en.wikipedia.org/wiki/List_of_Nepalese_politicians', timeout=30)
main_page.raise_for_status()

In [3]:
main_soup = BeautifulSoup(main_page.content,"html.parser")

In [4]:
# Uncomment the line below to see html of the page
# print(main_soup.prettify())

In [5]:
# Gather the href of every anchor on the list page that actually has one.
wiki_links = [anchor['href'] for anchor in main_soup.find_all('a', href=True)]

In [6]:
random.sample(wiki_links, 10)

['/wiki/Ram_Krishna_Tamrakar',
 '/wiki/Narayan_Singh_Pun',
 'http://www.myrepublica.com/election/dr.php?dr=2',
 '#cite_ref-Tuladhar_115-0',
 '#I',
 '#cite_ref-82',
 '/w/index.php?title=Special:UserLogin&returnto=List+of+Nepalese+politicians',
 '#cite_ref-election1_17-37',
 '#cite_ref-76',
 '/wiki/Tek_Bahadur_Gurung']

In [7]:
# Keep only site-internal article links, i.e. hrefs beginning with "/wiki/".
# A plain prefix test selects exactly what the former regex '\A/wiki/' selected, without
# relying on a non-raw string containing the invalid escape '\A' (a SyntaxWarning on
# recent Python versions) and without re-importing `re` in the middle of the notebook.
poli_links = [href for href in wiki_links if href.startswith('/wiki/')]

In [8]:
random.sample(poli_links, 10)

['/wiki/Hari_Lal_Joshi',
 '/wiki/Urmila_Aryal',
 '/wiki/Sunil_Babu_Pant',
 '/wiki/Ranju_Darshana',
 '/wiki/Ishwar_Dayal_Mishra',
 '/wiki/Sudarshan_Baral',
 '/wiki/Dama_Kumari_Sharma',
 '/wiki/Kathmandu',
 '/wiki/Mohan_Prasad_Pandey',
 '/wiki/Santa_Kumar_Tharu']

<b>We now have all links from the page (Wikipedia links like About, Home and politician links). We can select the politician links by slicing the list.</b>

# Scraping - POC for single link 

<b> Getting Name and Date of birth of a single Politician. This is done as a proof of concept before writing the final function. </b>

In [9]:
# Fetch a single politician page for the proof of concept. The timeout avoids an
# indefinite hang and raise_for_status() surfaces HTTP errors immediately.
page = requests.get('https://en.wikipedia.org/wiki/Girija_Prasad_Koirala', timeout=30)
page.raise_for_status()

In [10]:
soup_single= BeautifulSoup(page.content,'html.parser')

In [11]:
# Uncomment the line below to see html of the page
# print(soup_single.prettify())

<b>Since the date of birth is stored in the right hand side box of each page, we select that element. </b>

In [12]:
box_detail = soup_single.find_all('table',class_=["infobox vcard"])

In [13]:
bday = soup_single.find_all('span',class_=["bday"])[0].string

In [14]:
bday

'1924-07-04'

In [15]:
# Locate the text node "Died" and print the first child of the <td> that follows it.
# `string=` and `find_next` are the current names of the deprecated `text=` / `findNext`.
print(soup_single.find(string="Died").find_next('td').contents[0])

20 March 2010


# Function

In [16]:
def _as_text(node):
    """Return ``node`` as a plain ``str`` (``""`` for ``None``).

    Converting with ``str()`` detaches the value from the BeautifulSoup parse tree,
    so the resulting DataFrame does not keep every downloaded page alive in memory.
    """
    return "" if node is None else str(node)


def scrapebirth(links, timeout=30):
    """Scrape page title, birth date and death date for each Wikipedia path.

    Parameters
    ----------
    links : iterable of str
        Site-relative paths such as ``'/wiki/Girija_Prasad_Koirala'``; each one is
        appended to ``https://en.wikipedia.org`` and downloaded.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.

    Returns
    -------
    pandas.DataFrame
        One row per link, in input order, with columns:
        ``Name`` - text of the page's ``<title>`` element,
        ``DOB``  - text of the first ``<span class="bday">``,
        ``DOD``  - first child of the ``<td>`` that follows the text node "Died".
        A field that cannot be found is stored as an empty string.
    """
    # Collect plain dicts and build the frame once at the end: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = []
    for path in links:
        page = requests.get("https://en.wikipedia.org" + path, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")

        title_tag = soup.find('title')
        name = _as_text(title_tag.string) if title_tag is not None else ""

        bday_tag = soup.find('span', class_='bday')
        bday = _as_text(bday_tag.string) if bday_tag is not None else ""

        # "Died" is the label cell of the infobox row; the value sits in the next <td>.
        died_label = soup.find(string="Died")
        death_cell = died_label.find_next('td') if died_label is not None else None
        # An empty <td> has no children, so guard before indexing into contents.
        if death_cell is not None and death_cell.contents:
            death = _as_text(death_cell.contents[0])
        else:
            death = ""

        records.append({'Name': name, 'DOB': bday, 'DOD': death})

    return pd.DataFrame(records, columns=['Name', 'DOB', 'DOD'])

<h3>The following line scrapes all links. Computation may take a few minutes.</h3>

In [17]:
my_df = scrapebirth(poli_links)

In [18]:
my_df.sample(10)

Unnamed: 0,Name,DOB,DOD
256,Nagendra Kumar Ray - Wikipedia,,
252,Purna Prasad Rajbansi - Wikipedia,,
150,Kailash Nath Kasudhan - Wikipedia,,
61,Khem Bahadur Bum - Wikipedia,,
66,Renu Chand (Bhatt) - Wikipedia,,
10,Haribhakta Adhikari - Wikipedia,,
399,Talk:List of Nepalese politicians - Wikipedia,,
193,Farmulha Mansur - Wikipedia,,
51,Gokarna Bista - Wikipedia,,
111,Bhagwan Das Gupta - Wikipedia,1940-01-01,15 November 1998


## Clean dataframe

In [19]:
my_df = my_df[my_df['DOB'].astype(bool)]

In [20]:
# Drop the last 18 rows by position.
# NOTE(review): the count 18 is hard-coded and the notebook does not say which rows it
# is meant to remove - confirm it still matches the scraped page. Re-running this cell
# removes a further 18 rows each time.
my_df = my_df.iloc[:-18]

In [21]:
df_poli = my_df.copy()

In [22]:
# Remove the trailing " - Wikipedia" from each page title, but only when it is present.
# Blindly cutting the last 12 characters would mangle any title without the suffix and
# would chop real name characters if this cell were run a second time.
df_poli['Name'] = df_poli['Name'].apply(
    lambda title: title[:-len(' - Wikipedia')] if title.endswith(' - Wikipedia') else title
)

In [23]:
# Use the politician's name as the row label.
df_poli = df_poli.set_index('Name')

<h3>Filter politicians who are dead</h3>

In [24]:
# A row counts as deceased when its DOD value, rendered as text, is non-empty.
dead_poli = df_poli.loc[df_poli['DOD'].astype(str).ne('')]
dead_poli

Unnamed: 0_level_0,DOB,DOD
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Khadgajeet Baral,1928-04-17,"May 19, 2021"
Post Bahadur Bogati,1953-07-18,15 September 2014
Nar Bahadur Chand,1962-08-29,"06 Simayal, Baitadi"
Bhagwan Das Gupta,1940-01-01,15 November 1998
Nabindra Raj Joshi,1964-01-08,26 March 2021
Sushil Koirala,1939-08-12,9 February 2016
Dil Bahadur Lama,1930-03-21,25 March 2014
Bidhyanath Pokhrel,1918-06-09,25 August 1994
Sahana Pradhan,1927-06-17,22 September 2014
Ganesh Prasad Rijal,1920-05-11,4 April 1998


In [25]:
# Living politicians are the rows whose DOD is empty. Selecting on the column itself
# (rather than excluding the names found in dead_poli) keeps a living politician who
# happens to share an index label with a deceased one.
alive_poli = df_poli[df_poli['DOD'].astype(str) == '']
alive_poli.head()

Unnamed: 0_level_0,DOB,DOD
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bhim Acharya,1959-04-27,
Mahesh Acharya,1954-08-06,
Mahadev Bajgai,1978-02-23,
Narahari Acharya,1953-09-27,
Dilendra Prasad Badu,1954-01-09,


In [26]:
def calculate_age(born, today=None):
    """Return the age in completed years for a ``YYYY-MM-DD`` birth date.

    Parameters
    ----------
    born : str
        Birth date formatted as ``%Y-%m-%d`` (e.g. ``'1924-07-04'``).
    today : datetime.date, optional
        Date on which the age is evaluated. Defaults to ``date.today()``; pass a
        fixed date to make the result reproducible.

    Returns
    -------
    int
        Whole years elapsed between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the ``%Y-%m-%d`` format.
    """
    birth = datetime.strptime(born, '%Y-%m-%d')
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) when this year's birthday is still ahead.
    birthday_pending = (today.month, today.day) < (birth.month, birth.day)
    return today.year - birth.year - birthday_pending

In [27]:
# Build a new frame with an Age column derived from each DOB string;
# alive_poli itself is left unchanged.
final_df = alive_poli.assign(Age=alive_poli['DOB'].apply(calculate_age))

In [28]:
final_df.shape

(61, 3)

In [29]:
final_df

Unnamed: 0_level_0,DOB,DOD,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bhim Acharya,1959-04-27,,62
Mahesh Acharya,1954-08-06,,67
Mahadev Bajgai,1978-02-23,,43
Narahari Acharya,1953-09-27,,68
Dilendra Prasad Badu,1954-01-09,,67
...,...,...,...
Dal Bahadur Sunar,1959-10-01,,62
Sher Bahadur Tamang,1969-03-28,,52
Surya Man Dong Tamang,1970-04-20,,51
Gagan Thapa,1976-07-16,,45


In [30]:
final_df['Age'].median()

64.0

In [31]:
final_df['Age'].mean()

62.9344262295082

In [32]:
final_df['Age'].count()

61

In [33]:
df_poli.replace('','NA').to_csv('politician.csv')