# Land price web scraping project
The data to be scraped is at https://www.jumia.cm/en/land-plots. It contains the location, surface area (in square metres) and price per square metre of land plots in various neighborhoods of the Littoral region, Cameroon.


## Steps involved
- Import libraries
- Create ETL functions
- Scrape the data
- Create CSV file of the data

### 1.) Import libraries

In [13]:
import requests
from bs4 import BeautifulSoup

### 2.) Create ETL functions

In [122]:
# list to append urls
urls_list = []

# define first function
def get_page_urls(page):
    """ Get URLs on the page and concatenate the base URL to each

    Every URL found is appended to the module-level ``urls_list``; the URLs
    found on this page are also returned so the function can be used on its
    own.

    Arg:
        page (int): the page number

    Returns:
        list: list of URLs found on this page

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    from urllib.parse import urljoin

    base_url = 'https://www.jumia.cm'
    # Access the web page; the timeout keeps a stalled connection from
    # blocking the notebook indefinitely
    response = requests.get(f'{base_url}/en/land-plots?page={page}', timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page
    response.raise_for_status()
    # Get the text from the web page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find urls of all articles on the web page
    page_urls = []
    for article in soup.find_all('article'):
        # Skip articles without a link rather than crashing the whole page
        link = article.find('a', href=True)
        if link is None:
            continue
        # urljoin copes with both relative and absolute hrefs
        page_urls.append(urljoin(base_url, link['href']))

    # Append to urls_list
    urls_list.extend(page_urls)
    return page_urls

In [123]:
#get_page_urls(1)

In [124]:
#urls_list

In [133]:
# list to append items
items_list = []

# define the second function
def extract_transform(url):
    """ Extract items from the URL and transform or clean them

    The resulting record is appended to the module-level ``items_list`` and
    also returned.

    Arg:
        url (str): URL of the web page

    Returns:
        dict: record with the keys 'Location' (str), 'Area' (float),
            'Price' (int) and 'Seller' (str)

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
        AttributeError, IndexError: if an expected element is missing
            from the page
        ValueError: if the area or price text is not numeric after cleaning
    """
    # Access the web page; the timeout keeps a stalled connection from
    # blocking the notebook indefinitely
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page
    response.raise_for_status()
    # Get the text from the web page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract items (surrounding whitespace is stripped from text fields)
    location = soup.find('span', {'itemprop': 'addressLocality'}).get_text().strip()
    # The second <h3> holds the area, e.g. 'Area500 m2' -> 500.0
    area_text = soup.find_all('h3')[1].get_text()
    area = float(area_text.replace('Area', '').replace(' m2', ''))
    # Drop the thousands separators before converting the price
    price_text = soup.find('span', {'itemprop': 'price'}).get_text()
    price = int(price_text.replace(',', ''))
    seller = soup.find_all('dd')[0].get_text().strip()

    # Create a dictionary to store items
    items = {
        'Location': location,
        'Area': area,
        'Price': price,
        'Seller': seller,
    }
    # Append items to items_list
    items_list.append(items)
    return items

In [126]:
#extract_transform(urls_list[0])

In [127]:
#items_list

### 3.) Scrape the data

In [128]:
# Collect listing URLs from results page 1 only: range's stop value (2) is
# exclusive. Raise the stop value to scrape more pages.
for page in range(1,2):
    get_page_urls(page)

In [129]:
urls_list

['https://www.jumia.cm/en/terrain-titre-en-or-pid11720026',
 'https://www.jumia.cm/en/terrain-tr-s-bien-plac-vendre-a-lendi-quartier-g-n-ral--pid11754625',
 'https://www.jumia.cm/en/vente-terrain-titr-de-300m2-logpom-pid11754459',
 'https://www.jumia.cm/en/a-vendre-terrain-pid11754165',
 'https://www.jumia.cm/en/terrain-vendre--pid11754131',
 'https://www.jumia.cm/en/a-vendre-terrain-pid11754083',
 'https://www.jumia.cm/en/a-vendre-terrain-pid11754082',
 'https://www.jumia.cm/en/a-vendre-terrain-pid11754064',
 'https://www.jumia.cm/en/a-vendre-terrain-pid11754031']

In [134]:
# Visit every collected listing URL; each call appends one record to items_list.
# NOTE: re-running this cell appends the same records again.
for url in urls_list:
    extract_transform(url)

In [135]:
items_list

[{'Location': 'Ngaoundéré',
  'Area': 500.0,
  'Price': 5000000,
  'Seller': 'DJUNTU MICHAEL'},
 {'Location': 'Lendi',
  'Area': 500.0,
  'Price': 30000,
  'Seller': 'Orlande  ngalibassa'},
 {'Location': 'Logpom',
  'Area': 300.0,
  'Price': 35000,
  'Seller': 'Serge global service'},
 {'Location': 'Kotto', 'Area': 650.0, 'Price': 60000, 'Seller': 'Edgard'},
 {'Location': 'Kotto', 'Area': 515.0, 'Price': 38625000, 'Seller': 'Edgard'},
 {'Location': 'Kotto', 'Area': 400.0, 'Price': 24000000, 'Seller': 'Edgard'},
 {'Location': 'Kotto', 'Area': 400.0, 'Price': 24000000, 'Seller': 'Edgard'},
 {'Location': 'Makepe', 'Area': 220.0, 'Price': 28600000, 'Seller': 'Edgard'},
 {'Location': 'Logbessou',
  'Area': 500.0,
  'Price': 20000000,
  'Seller': 'Edgard'}]