## Web Scraping: 2019 list of Earthquakes in Nepal


In [1]:
import requests #https://requests.readthedocs.io/en/master/
from pyquery import PyQuery as pq #for XPath and Css Selectors: https://pythonhosted.org/pyquery/

Importing the packages

In [2]:
url='http://seismonepal.gov.np/earthquakes/2020' #http://seismonepal.gov.np/earthquakes/2019

In [3]:
def read_url(url):
    '''using pyquery'''
    pageSource = requests.get(url).text  #receiving 'view source'
    #return pq(pageSource)
    return pageSource
    


In [4]:
page = read_url(url)

In [5]:
title=pq(page).find('title').text() #check here if page is an object of pyquery or not
print(title)

National Earthquake Monitoring & Research Center


In [6]:
data = pq(page).find('table.table-striped thead').html() #confirming the content we have choosen!
print(data[28:505])


                      <th scope="col">Date</th>
                        <th scope="col">Time</th>
                        <th scope="col">Latitude</th>
                        <th scope="col">Longitude</th>
                        <th scope="col">Magnitude(ML)</th>
                        <th scope="col">Remarks</th>
                        <th scope="col">Epicenter</th>
                        <th scope="col">Form</th>
                    </tr>
                    


In [7]:
headText = pq(page).find('table.table-striped thead').text() #load <thead> texts from table
print(headText)

Date
Time
Latitude
Longitude
Magnitude(ML)
Remarks
Epicenter
Form


In [8]:
body = pq(page).find('table.table-striped tbody').html() #load <tbody> HTML from table
print(body[200:700])

">
                                    <small>B.S:</small>2077-9-15<br/>
                                    <small>A.D:</small>2020-12-29</td>
                                <td>
                                    <small>Local:</small>01:49<br/>
                                    <small>UTC:</small>20:04</td>
                                <td>27.66</td>
                                <td>86.34</td>
                                <td>4.6</td>
                                <td>NERMC</td>


In [9]:
firstRow = pq(page).find('table.table-striped tbody tr:first').text() 
print(firstRow)

B.S:2077-9-15
A.D:2020-12-29
Local:01:49
UTC:20:04
27.66
86.34
4.6
NERMC
Ramechhap
Did You feel it ?


In [10]:
firstRowNewAgain = pq(page).find('table.table-striped tbody tr:first').remove('small')
#print(firstRowNewAgain)

print(str(firstRowNewAgain).strip()) 

<tr>
                                <td scope="row">
                                     2077-9-15<br/>
                                     2020-12-29</td>
                                <td>
                                     01:49<br/>
                                     20:04</td>
                                <td>27.66</td>
                                <td>86.34</td>
                                <td>4.6</td>
                                <td>NERMC</td>
                                <td><a href="http://seismonepal.gov.np/earthquake-epicenter-google-map/1039" title="Click to see Google Map point of this" style="text-decoration: underline;">Ramechhap</a></td>
                                <td><a href="http://seismonepal.gov.np/did-you-feel-it-" class="btn btn-primary">Did You feel it ?</a></td>
                            </tr>


First row sampling

In [11]:
newRowAgain = pq(firstRowNewAgain).text()
print(newRowAgain)
print('----'*20)
print(newRowAgain.split("\n"))

2077-9-15
2020-12-29
01:49
20:04
27.66
86.34
4.6
NERMC
Ramechhap
Did You feel it ?
--------------------------------------------------------------------------------
['2077-9-15', '2020-12-29', '01:49', '20:04', '27.66', '86.34', '4.6', 'NERMC', 'Ramechhap', 'Did You feel it ?']


In [12]:
#cleaned content from Seismonepal!

allRowMaterial = pq(page).find('table.table-striped tbody').remove('small')
print(len(pq(allRowMaterial).find('tr')))


25


# loop through the data stored in table

In [13]:
finalList=list([['Date','Time','Latitude','Longitude','Magnitude','Epicenter']])


for row in pq(allRowMaterial).find('tr').items(): #each and every selected element traverse/travel!
    col=row.find('td')
    
    
    edate = col[0].text.strip()
    etime = col[1].text.strip() 
    elatitude= col[2].text.strip() 
    elongitude= col[3].text.strip() 
    emagnitude= col[4].text.strip() 
    #epicentre= col[5].text.strip()
    epicentre = col[6].find('a').text.strip()

    if edate:
        finalList.append([edate, etime, elatitude, elongitude, emagnitude, epicentre])
        
print(finalList)
        
    

[['Date', 'Time', 'Latitude', 'Longitude', 'Magnitude', 'Epicenter'], ['2077-9-15', '01:49', '27.66', '86.34', '4.6', 'Ramechhap'], ['2077-8-23', '19:08', '27.77', '87.86', '4.2', 'Taplejung'], ['2077-8-17', '13:42', '27.75', '85.87', '4.2', 'Sindhupalchok'], ['2077-7-24', '03:29', '27.73', '87.87', '4.9', 'Taplejung'], ['2077-7-13', '00:19', '29.58', '80.81', '4.2', 'Bajhang'], ['2077-7-8', '16:17', '27.62', '86.24', '4.0', 'Dolakha'], ['2077-5-31', '05:19', '27.77', '85.88', '6.0', 'Sindhupalchowk'], ['2077-5-26', '09:30', '26.76', '88.21', '5.0', 'Nepal India  Border'], ['2077-5-24', '15:48', '27.92', '84.94', '4.0', 'Dhading'], ['2077-5-19', '07:20', '27.58', '87.19', '4.0', 'Sankhuwasabha'], ['2077-5-9', '21:09', '29.59', '80.89', '4.7', 'Bajhang'], ['2077-4-31', '16:07', '27.8', '87.62', '4.2', 'Sankhuwasabha'], ['2077-3-2', '09:53', '27.88', '86.20', '4.1', 'Dolakha'], ['2077-2-17', '21:36', '27.85', '84.93', '4.8', 'Dhading'], ['2077-2-13', '16:00', '29.73', '82.01', '4.2', 'Hu

In [14]:
print("Total Records found: ",len(finalList))
print(finalList[0])
print(finalList[-1])

Total Records found:  26
['Date', 'Time', 'Latitude', 'Longitude', 'Magnitude', 'Epicenter']
['2076-9-27', '20:19', '29.58', '81.71', '4.5', 'Bajura']


In [15]:
import pandas as pd


Converting the 

In [16]:
df=pd.DataFrame(finalList)

In [17]:
df.to_csv('Earthquake2020.csv',header=False,index=False)