# Web Scraping Land Price Data



*   Import libraries
*   Create ETL functions
*   Scrape the Data
*   Create a CSV file





## 1.) Import Libraries

In [None]:
import requests
from bs4 import BeautifulSoup

# Inspecting the URL

In [None]:
# Fetch the first page of land-plot listings.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on a stalled connection, which would freeze the notebook.
response = requests.get('https://www.jumia.cm/en/land-plots', timeout=30)
response

<Response [200]>

In [None]:
# Parse the HTML body of the response using the 'html.parser' backend
soup = BeautifulSoup(response.text,'html.parser')

In [None]:
# Collect every <article> element on the page and display the resulting list
list_urls = soup.find_all('article')
list_urls

[<article class="post-holder product-click " data-event='{"id":"12344516","title":"Terrain titr\u00e9","price":"400000.00","category":"Real Estate\/Land &amp; Plots"}' data-position="1" data-state="approved">
 <div class="post">
 <div class="alignleft ">
 <img alt="Terrain titré - Cameroon" itemprop="image" src="https://www.jumia.cm/deals/images/placeholders/re/mini/land-plots.png" title="Terrain titré - Cameroon"/> </div>
 <div class="text-area">
 <span class="icon-pic no-pic" data-nb-pics="0"></span><!--
 --><div class="announcement-container">
 <div class="announcement-infos">
 <a class="post-link post-vip" href="/en/terrain-titr--pid12344516" title="Terrain titré"><span>Terrain titré</span></a>
 <span class="address">
                                                     Land &amp; Plots,
                                                                             Bastos                                            </span>
 </div><!--
 --><div class="price-date">
 <span class="price">

In [None]:
# Print the raw markup of each <article> for inspection
for partial_url in list_urls:
  print(partial_url)

<article class="post-holder product-click " data-event='{"id":"12344516","title":"Terrain titr\u00e9","price":"400000.00","category":"Real Estate\/Land &amp; Plots"}' data-position="1" data-state="approved">
<div class="post">
<div class="alignleft ">
<img alt="Terrain titré - Cameroon" itemprop="image" src="https://www.jumia.cm/deals/images/placeholders/re/mini/land-plots.png" title="Terrain titré - Cameroon"/> </div>
<div class="text-area">
<span class="icon-pic no-pic" data-nb-pics="0"></span><!--
--><div class="announcement-container">
<div class="announcement-infos">
<a class="post-link post-vip" href="/en/terrain-titr--pid12344516" title="Terrain titré"><span>Terrain titré</span></a>
<span class="address">
                                                    Land &amp; Plots,
                                                                            Bastos                                            </span>
</div><!--
--><div class="price-date">
<span class="price">
              

In [None]:
# Print the href of the first <a> inside each <article>
# (the output shows these are site-relative paths such as /en/...-pid12344516)
for partial_url in list_urls:
    print(partial_url.find('a')['href'])

/en/terrain-titr--pid12344516
/en/terrain-titr-a-vendre-au-centre-d-elat-pid12239087
/en/terrain-yassa-bwang-bakoko-de-300m2-bon-prix--pid12343668
/en/terrain-vendre-odza-pid12343251
/en/terrain-vendre-omnisports-pid12342926
/en/terrain-vendre-afanoyoa-pid12342919
/en/terrain-vendre-odza-pid12342897
/en/2-lots-de-terrain-vendre-mballa-2-pid12342872


In [None]:
# Turn the relative link of every <article> on the page into an absolute URL
base_url = 'https://www.jumia.cm'
list_urls = soup.find_all('article')
for partial_url in list_urls:
    relative_path = partial_url.find('a')['href']
    absolute_url = base_url + relative_path
    print(absolute_url)

## 2.) Creating ETL functions

In [None]:
# Module-level accumulators filled by the ETL functions:
# url_list collects listing URLs (appended to by get_page_urls),
# items_list collects one dict per scraped listing (appended to by extract_transform).
url_list=[]
items_list=[]

In [None]:
def get_page_urls(page):
  """Collect the listing URLs found on one page of the land-plots results.

  Each URL is appended to the module-level ``url_list``. URLs that are already
  in ``url_list`` are skipped, so calling this function more than once for the
  same page does not create duplicate entries.

  Args:
    page: Page number substituted into the ``?page=`` query parameter.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status code.
  """
  base_url = 'https://www.jumia.cm'
  # timeout: requests waits forever by default, which would hang the notebook
  response = requests.get(f'https://www.jumia.cm/en/land-plots?page={page}', timeout=30)
  # fail loudly instead of silently parsing an error page that has no listings
  response.raise_for_status()
  soup = BeautifulSoup(response.text,'html.parser')
  for article in soup.find_all('article'):
    link = article.find('a', href=True)
    if link is None:
      # an <article> without a link cannot be followed; skip it
      continue
    new_url = base_url + link['href']
    if new_url not in url_list:
      url_list.append(new_url)

In [None]:
#define function to get location,area,price
def extract_transform(url):
  """Scrape one listing page and append its details to ``items_list``.

  The appended record is a dict with the keys ``'Location'``, ``'Area'`` and
  ``'Price'``. Values are whitespace-trimmed strings; a field whose element is
  not present on the page is stored as ``None`` instead of raising.

  Args:
    url: Absolute URL of a single listing page.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status code.
  """
  # timeout: requests waits forever by default, which would hang the notebook
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.text,'html.parser')

  location_tag = soup.find('span',{'itemprop':'addressLocality'})
  location = location_tag.get_text().strip() if location_tag is not None else None

  # the area is read from the second <h3>; its text has 'Area' and ' m2' removed
  headings = soup.find_all('h3')
  if len(headings) > 1:
    area = headings[1].get_text().replace('Area','').replace(' m2','').strip()
  else:
    area = None

  price_tag = soup.find('span',{'itemprop':'price'})
  # drop thousands separators so the value is a plain digit string
  price = price_tag.get_text().replace(',','').strip() if price_tag is not None else None

  items = {'Location':location,'Area':area,'Price':price}
  items_list.append(items)

In [None]:
# Try the URL collector on page 1; the results land in url_list
get_page_urls(1)


In [None]:
# Keep the first collected URL in `url` (used by the inspection cells below),
# then display everything collected so far
url=url_list[0]
url_list

['https://www.jumia.cm/en/terrain-pid11862981',
 'https://www.jumia.cm/en/terrain-de-250-m-titr-loti-vendre-nkolafamba-pid11686785',
 'https://www.jumia.cm/en/terrain-titr-commercial-vendre-1001-m2-carrefour-coron-pid11863906',
 'https://www.jumia.cm/en/terrain-titr-vendre-japoma-pid11863617',
 'https://www.jumia.cm/en/terrain-titr-de-500-m-lotis-un-signataire--pid11862974',
 'https://www.jumia.cm/en/terrains-titr-s-en-vente-ngodi-akwa--pid11861849',
 'https://www.jumia.cm/en/terrains-titr-s-en-vente-logbessou--pid11861783',
 'https://www.jumia.cm/en/terrains-en-vente-logbaba--pid11861764',
 'https://www.jumia.cm/en/terrain-titr-kribi-vente-en-hectares--pid11861657']

In [None]:
# Try the extractor on a single listing and show what it appended.
# NOTE: the record stays in items_list, so a later run over all of url_list
# adds this same listing again unless items_list is emptied first.
extract_transform(url_list[0])
items_list

[{'Area': '500', 'Location': 'Odza', 'Price': '25000000'}]

In [None]:
# Fetch the single listing page stored in `url`.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on a stalled connection.
response = requests.get(url, timeout=30)
response

<Response [200]>

In [None]:
#inspecting location
# The location is the text of the <span> whose itemprop attribute is 'addressLocality'
soup = BeautifulSoup(response.text,'html.parser')
location = soup.find('span',{'itemprop':'addressLocality'}).get_text()
location

'Odza'

In [None]:
#inspecting land area
# The area is taken from the second <h3> on the page; the 'Area' label and
# the ' m2' unit are removed so only the number remains (still a string)
soup = BeautifulSoup(response.text,'html.parser')
area = soup.find_all('h3')[1].get_text().replace('Area','').replace(' m2','')
area

'500'

In [None]:
#inspecting price
# The price is the text of the <span> whose itemprop attribute is 'price';
# commas are removed so the value is a plain digit string
soup = BeautifulSoup(response.text,'html.parser')
price = soup.find('span',{'itemprop':'price'}).get_text().replace(',','')
price

'25000000'

## 3.) Scraping the Data

In [None]:
# Start from an empty list: url_list may already hold page-1 URLs from the
# earlier trial call (or from a previous run of this cell), which would
# otherwise leave every URL in the list twice.
url_list.clear()
# range(1,2) covers page 1 only; raise the stop value to scrape more pages
for page in range(1,2):
  get_page_urls(page)
url_list


['https://www.jumia.cm/en/terrain-pid11862981',
 'https://www.jumia.cm/en/terrain-de-250-m-titr-loti-vendre-nkolafamba-pid11686785',
 'https://www.jumia.cm/en/terrain-titr-commercial-vendre-1001-m2-carrefour-coron-pid11863906',
 'https://www.jumia.cm/en/terrain-titr-vendre-japoma-pid11863617',
 'https://www.jumia.cm/en/terrain-titr-de-500-m-lotis-un-signataire--pid11862974',
 'https://www.jumia.cm/en/terrains-titr-s-en-vente-ngodi-akwa--pid11861849',
 'https://www.jumia.cm/en/terrains-titr-s-en-vente-logbessou--pid11861783',
 'https://www.jumia.cm/en/terrains-en-vente-logbaba--pid11861764',
 'https://www.jumia.cm/en/terrain-titr-kribi-vente-en-hectares--pid11861657',
 'https://www.jumia.cm/en/terrain-pid11862981',
 'https://www.jumia.cm/en/terrain-de-250-m-titr-loti-vendre-nkolafamba-pid11686785',
 'https://www.jumia.cm/en/terrain-titr-commercial-vendre-1001-m2-carrefour-coron-pid11863906',
 'https://www.jumia.cm/en/terrain-titr-vendre-japoma-pid11863617',
 'https://www.jumia.cm/en/ter

In [None]:
# Start from an empty list: items_list may already hold the record from the
# earlier single-listing trial (or from a previous run of this cell), which
# would otherwise appear twice in the final table.
items_list.clear()
for url in url_list:
  extract_transform(url)
items_list

[{'Area': '500', 'Location': 'Odza', 'Price': '25000000'},
 {'Area': '500', 'Location': 'Odza', 'Price': '25000000'},
 {'Area': '250', 'Location': 'Yaoundé', 'Price': '2150'},
 {'Area': '1001', 'Location': 'Mvog Mbi', 'Price': '250000'},
 {'Area': '5000', 'Location': 'Japoma', 'Price': '25000'},
 {'Area': '500', 'Location': 'Odza', 'Price': '23000000'},
 {'Area': '600', 'Location': 'Ngodi', 'Price': '250000'},
 {'Area': '400', 'Location': 'Logbessou', 'Price': '30000'},
 {'Area': '200', 'Location': 'Logbaba', 'Price': '40000'},
 {'Area': '10000', 'Location': 'Kribi', 'Price': '35000000'},
 {'Area': '500', 'Location': 'Odza', 'Price': '25000000'},
 {'Area': '250', 'Location': 'Yaoundé', 'Price': '2150'},
 {'Area': '1001', 'Location': 'Mvog Mbi', 'Price': '250000'},
 {'Area': '5000', 'Location': 'Japoma', 'Price': '25000'},
 {'Area': '500', 'Location': 'Odza', 'Price': '23000000'},
 {'Area': '600', 'Location': 'Ngodi', 'Price': '250000'},
 {'Area': '400', 'Location': 'Logbessou', 'Price'

## 4.) Convert to CSV

In [None]:
import pandas as pd
# Build a table with one row per scraped record; columns come from the dict keys.
# NOTE(review): Area and Price are stored as strings by extract_transform —
# cast them to numbers before doing any arithmetic on them.
data = pd.DataFrame(items_list)
data

Unnamed: 0,Location,Area,Price
0,Odza,500,25000000
1,Odza,500,25000000
2,Yaoundé,250,2150
3,Mvog Mbi,1001,250000
4,Japoma,5000,25000
5,Odza,500,23000000
6,Ngodi,600,250000
7,Logbessou,400,30000
8,Logbaba,200,40000
9,Kribi,10000,35000000


In [None]:
# Write the table to land_price_data.csv; index=False leaves out the row-index column
data.to_csv('land_price_data.csv',index=False)