In [1]:
import urllib
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime, date
import random

# Link Page

<b>Scraping the page where the list of Nepalese politicians is populated.</b>

In [2]:
# Download the Wikipedia list page. The timeout keeps this cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces an HTTP
# error here instead of letting an error page be parsed by later cells.
main_page = requests.get(
    'https://en.wikipedia.org/wiki/List_of_Nepalese_politicians',
    timeout=30,
)
main_page.raise_for_status()

In [3]:
# Parse the downloaded HTML with Python's built-in parser.
main_soup = BeautifulSoup(main_page.content, features="html.parser")

In [4]:
# Uncomment the line below to see html of the page
# print(main_soup.prettify())

In [5]:
# Gather the href of every anchor on the page that actually has one.
wiki_links = [anchor['href'] for anchor in main_soup.find_all('a', href=True)]

In [6]:
# Peek at ten randomly chosen links to see what kinds of hrefs were collected.
random.sample(wiki_links, k=10)

['#cite_note-election1-16',
 '/wiki/Hasina_Miya',
 'https://web.archive.org/web/20061012034141/http://www.election-commission.org.np/toptwo.php',
 '/wiki/Ghan_Shyam_Yadav_Ahir',
 '#cite_ref-election-commission4_13-8',
 '/wiki/Wayback_Machine',
 '#cite_ref-119',
 '/wiki/Sebaki_Devi_Das_Tatma',
 '/wiki/Khadga_Prasad_Oli',
 '#cite_note-election1-16']

In [7]:
# Keep only site-relative article links, i.e. hrefs that begin with '/wiki/'.
# A plain prefix test replaces the earlier regex search: its pattern was a
# non-raw string containing the escape \A, which newer Python versions warn
# about. The set of selected links is unchanged.
poli_links = [href for href in wiki_links if href.startswith('/wiki/')]

In [8]:
# Peek at ten randomly chosen '/wiki/' links.
random.sample(poli_links, k=10)

['/wiki/Bal_Chandra_Poudel',
 '/wiki/Purna_Prasad_Rajbansi',
 '/wiki/Chitra_Bahadur_K.C.',
 '/wiki/Bam_Dev_Gautam',
 '/wiki/Tek_Bahadur_Chokhyal',
 '/wiki/Sushila_Swar',
 '/wiki/Ek_Nath_Dhakal',
 '/wiki/The_Caravan',
 '/wiki/Bishwodip_Lingden_Limbu',
 '/wiki/Dev_Raj_Bhar']

<b>We now have all links from the page: general Wikipedia links (such as About and Home) as well as politician links. We can select the politician links by slicing the list.</b>

# Scraping - POC for single link 

<b> Getting Name and Date of birth of a single Politician. This is done as a proof of concept before writing the final function. </b>

In [9]:
# Download one politician's article for the proof of concept. The timeout
# prevents an indefinite hang, and raise_for_status() reports an HTTP error
# here rather than letting an error page reach the parser.
page = requests.get(
    'https://en.wikipedia.org/wiki/Girija_Prasad_Koirala',
    timeout=30,
)
page.raise_for_status()

In [10]:
# Parse the single article's HTML with Python's built-in parser.
soup_single = BeautifulSoup(page.content, features='html.parser')

In [11]:
# Uncomment the line below to see html of the page
# print(soup_single.prettify())

<b>Since the date of birth is stored in the right hand side box of each page, we select that element. </b>

In [12]:
# Collect every <table> element matched by the class filter "infobox vcard".
# NOTE(review): box_detail is not referenced again in the visible cells; the
# lookups that follow search soup_single directly.
box_detail = soup_single.find_all('table',class_=["infobox vcard"])

In [13]:
# Take the text of the first <span class="bday"> found on the page.
bday_spans = soup_single.find_all('span', class_=["bday"])
bday = bday_spans[0].string

In [14]:
# Show the extracted date-of-birth string.
bday

'1924-07-04'

In [15]:
# Find the "Died" label and print the first child of the table cell that
# follows it. Uses the current Beautiful Soup spellings: `string=` (formerly
# `text=`) and `find_next` (formerly `findNext`).
print(soup_single.find(string="Died").find_next('td').contents[0])

20 March 2010


# Function

In [16]:
def scrapebirth(links, timeout=30):
    """Scrape name, date of birth and date of death from Wikipedia articles.

    Parameters
    ----------
    links : iterable of str
        Site-relative article paths such as '/wiki/Girija_Prasad_Koirala';
        each one is appended to 'https://en.wikipedia.org'.
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per link, with columns:
        'Name' - text of the page's <title> element;
        'DOB'  - text of the first <span class="bday">, or '' if none exists;
        'DOD'  - first child of the <td> following the "Died" label, or ''
                 if the label or the cell content is missing.

    Raises
    ------
    IndexError
        If a fetched page has no <title> element.
    """
    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every iteration.
    records = []
    for path in links:
        page = requests.get("https://en.wikipedia.org" + path, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")

        name = soup.find_all('title')[0].string
        try:
            bday = soup.find_all('span', class_=["bday"])[0].string
        except IndexError:
            # No bday span on this page.
            bday = ""
        try:
            death = soup.find(string="Died").find_next('td').contents[0]
        except (AttributeError, IndexError):
            # AttributeError: no "Died" label (find returned None) or no
            # following cell. IndexError: the cell exists but is empty.
            death = ""
        records.append({'Name': name, 'DOB': bday, 'DOD': death})
    # Passing `columns` keeps the three headers even when `links` is empty.
    return pd.DataFrame(records, columns=['Name', 'DOB', 'DOD'])

<h3>The following line scrapes all links. Computation may take a few minutes.</h3>

In [17]:
# Run the scraper over every '/wiki/' link; this issues one HTTP request per link.
my_df = scrapebirth(poli_links)

In [18]:
# Show ten random rows of the scraped table.
my_df.sample(10)

Unnamed: 0,Name,DOB,DOD
91,Sher Bahadur Deuba - Wikipedia,1946-09-12,
94,Ramnath Dhakal - Wikipedia,,
141,Chitra Bahadur K.C. - Wikipedia,,
289,Ganesh Man Singh - Wikipedia,,"September 18, 1997"
168,Dan Bahadur Kurmi - Wikipedia,,
18,Ghan Shyam Yadav Ahir - Wikipedia,,
190,Sapana Pradhan Malla - Wikipedia,,
134,Govinda Raj Joshi - Wikipedia,1949-08-12,
65,Binayadhoj Chand - Wikipedia,,
152,Rajendra Kumar KC - Wikipedia,,


## Clean dataframe

In [19]:
# Keep only the rows whose DOB value is truthy, i.e. a birth date was found.
has_dob = my_df['DOB'].astype(bool)
my_df = my_df[has_dob]

In [20]:
# Drop the last 18 rows.
# NOTE(review): presumably these are trailing non-politician pages picked up
# from the list page - confirm. The count is hard-coded, and because my_df is
# reassigned from itself, re-running this cell removes another 18 rows.
my_df = my_df[:-18]

In [21]:
# Continue on an independent copy so the steps below do not modify my_df.
df_poli = my_df.copy()

In [22]:
# Strip the trailing " - Wikipedia" that each page title carries. Matching the
# suffix itself (rather than cutting a fixed 12 characters) leaves a name
# untouched when the suffix is absent, so re-running this cell is harmless.
df_poli['Name'] = df_poli['Name'].str.replace(r' - Wikipedia$', '', regex=True)

In [23]:
# Use the politician's name as the row label.
df_poli = df_poli.set_index('Name')

<h3>Filter politicians who have died</h3>

In [24]:
# A row belongs to the "dead" subset when its DOD field, as text, is non-empty.
has_death_date = df_poli['DOD'].astype(str) != ''
dead_poli = df_poli[has_death_date]
dead_poli

Unnamed: 0_level_0,DOB,DOD
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Post Bahadur Bogati,1953-07-18,15 September 2014
Bhagwan Das Gupta,1940-01-01,15 November 1998
Sushil Koirala,1939-08-12,9 February 2016
Dil Bahadur Lama,1930-03-21,25 March 2014
Bidhyanath Pokhrel,1918-06-09,25 August 1994
Sahana Pradhan,1927-06-17,22 September 2014
Ganesh Prasad Rijal,1920-05-11,4 April 1998
Surya Bahadur Thapa,1928-03-21,15 April 2015


In [25]:
# Everyone whose name does not appear in dead_poli's index is treated as alive.
listed_as_dead = df_poli.index.isin(dead_poli.index)
alive_poli = df_poli[~listed_as_dead]
alive_poli.head()

Unnamed: 0_level_0,DOB,DOD
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bhim Acharya,1959-04-27,
Mahesh Acharya,1954-08-06,
Narahari Acharya,1953-09-27,
Dilendra Prasad Badu,1954-01-09,
Khadgajeet Baral,1928-04-17,


In [26]:
def calculate_age(born, today=None):
    """Return the age in whole years for an ISO-formatted birth date.

    Parameters
    ----------
    born : str
        Date of birth in 'YYYY-MM-DD' format.
    today : datetime.date, optional
        Reference date the age is measured against. When omitted, the
        current local date is used; pass an explicit date to get results
        that do not change from one run to the next.

    Returns
    -------
    int
        Number of full years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the 'YYYY-MM-DD' format.
    """
    birth = datetime.strptime(born, '%Y-%m-%d')
    if today is None:
        today = date.today()
    # Subtract one year when this year's birthday has not happened yet; the
    # tuple comparison yields a bool, which counts as 0 or 1 in arithmetic.
    birthday_pending = (today.month, today.day) < (birth.month, birth.day)
    return today.year - birth.year - birthday_pending

In [27]:
# Build a new frame from the living politicians with an Age column derived
# from each DOB string.
final_df = alive_poli.assign(Age=alive_poli['DOB'].apply(calculate_age))

In [28]:
# Show (rows, columns) of the living-politician table.
final_df.shape

(54, 3)

In [29]:
# Display the living politicians together with their computed ages.
final_df

Unnamed: 0_level_0,DOB,DOD,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bhim Acharya,1959-04-27,,61
Mahesh Acharya,1954-08-06,,66
Narahari Acharya,1953-09-27,,67
Dilendra Prasad Badu,1954-01-09,,66
Khadgajeet Baral,1928-04-17,,92
Shakti Bahadur Basnet,1971-04-14,,49
Damodar Bhandari,1973-11-05,,46
Dev Raj Bhar,1952-01-29,,68
Lekh Raj Bhatta,1960-03-24,,60
Baburam Bhattarai,1954-06-18,,66


In [30]:
# Median age of the living politicians.
final_df['Age'].median()

62.5

In [31]:
# Mean age of the living politicians.
final_df['Age'].mean()

62.870370370370374

In [32]:
# Number of non-missing Age values.
final_df['Age'].count()

54

In [33]:
# Write the cleaned table to disk, with empty-string fields spelled out as 'NA'.
export_df = df_poli.replace('', 'NA')
export_df.to_csv('politician.csv')