# Web Scraping

- Web scraping adalah proses ekstraksi data dari sebuah website.

# Case 1 iPhone - Erafone (Single Page)

## Load Library

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Specify the Main URL

In [2]:
# Catalog search-results URL for the query "iphone" (q=iphone), first page (p=1).
# NOTE(review): cat=40 is presumably a category id — confirm against the site.
url = 'https://eraspace.com/erafone/catalogsearch/result/?cat=40&p=1&q=iphone'

## Connect to the URL

In [4]:
# Fetch the page; a displayed status of 200 means the request succeeded.
# The timeout keeps the call from blocking indefinitely if the server never answers.
response = requests.get(url, timeout=30)
response

<Response [200]>

## Parsing Main HTML

In [5]:
# Parse the raw response body into a navigable tree with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

## Find Specific Elements

In [7]:
# Collect every <li> whose class attribute matches "item product product-item".
# find_all returns the matches as a list-like collection, printed here for inspection.
produk_hp = soup.find_all(name='li', class_='item product product-item')
print(produk_hp)

[<li class="item product product-item"> <div class="product-item-info" data-container="product-grid">
<!-- product badge -->
<!-- 
/**
 * Copyright © 2019 PT Kemana Teknologi Solusi. All rights reserved.
 * http://www.kemana.com
 */

/**
 * @category Kemana
 * @package  Kemana_Catalog
 * @license  http://opensource.org/licenses/osl-3.0.php  Open Software License (OSL 3.0)
 *
 *
 * @author   Imam Kusuma <ikusuma@kemana.com>
 */
-->
<!-- product badge -->
<a class="product photo product-item-photo" href="https://eraspace.com/erafone/apple-iphone-xs-64gb" tabindex="-1">
<img alt="Apple iPhone Xs 64GB" class="photo image" height="150" src="https://eraspace.com/pub/media/catalog/product/cache/d03018dd3d9615b5c9bd12c6ee948951/i/p/iphone-xs-gold_1_1_1_3.jpg" width="150">
</img></a>
<div class="product details product-item-details">
<strong class="product name product-item-name">
<a class="product-item-link" href="https://eraspace.com/erafone/apple-iphone-xs-64gb">
                            

## Get the Information

In [8]:
# Walk over the product cards and collect the name and price text of each one
# into two parallel lists (same index = same product).
nama_produk, harga_produk = [], []
for item in produk_hp:
    name_text = item.find('a', class_='product-item-link').get_text()
    price_text = item.find('span', class_='price').get_text()
    # Append only after both lookups succeeded so the lists stay the same length.
    nama_produk.append(name_text)
    harga_produk.append(price_text)

## Organizing the Information

In [9]:
# Map each future column name to its list of scraped values.
produk_dict = dict(nama=nama_produk, harga=harga_produk)

In [10]:
# Build the table from the column mapping, fixing the column order explicitly.
df = pd.DataFrame(data=produk_dict, columns=['nama', 'harga'])

In [12]:
# Show the first rows of the raw scraped table (names still carry surrounding whitespace).
df.head()

Unnamed: 0,nama,harga
0,\n Apple iPhone...,Rp. 19.599.000
1,\n Apple iPhone...,Rp. 19.399.000
2,\n Apple iPhone...,Rp. 26.599.000
3,\n Apple iPhone...,Rp. 12.999.000
4,\n Apple iPhone...,Rp. 22.999.000


## Data Cleaning

In [17]:
# Preview the cleaned product names; the result is displayed, df is not modified.
# The pattern is anchored so only leading/trailing whitespace is removed. Deleting
# whitespace in pairs would leave a stray character when the padding length is odd
# and would glue words together wherever a name contains two consecutive spaces.
df['nama'].replace(r'^\s+|\s+$', '', regex=True)

0           Apple iPhone Xs 64GB
1           Apple iPhone X 256GB
2          Apple iPhone Xs 512GB
3           Apple iPhone Xr 64GB
4      Apple iPhone Xs Max 256GB
5          Apple iPhone Xs 256GB
6          Apple iPhone Xr 256GB
7      Apple iPhone Xs Max 512GB
8      Apple iPhone 8 Plus 256GB
9           Apple iPhone 8 256GB
10      Apple iPhone Xs Max 64GB
11         Apple iPhone Xr 128GB
Name: nama, dtype: object

In [18]:
# Strip leading/trailing whitespace from every product name and store the result.
# The pattern is anchored to the ends of the string so spacing inside a name is
# kept intact and no odd leftover whitespace character survives.
df['nama'] = df['nama'].replace(r'^\s+|\s+$', '', regex=True)
df.head()

Unnamed: 0,nama,harga
0,Apple iPhone Xs 64GB,Rp. 19.599.000
1,Apple iPhone X 256GB,Rp. 19.399.000
2,Apple iPhone Xs 512GB,Rp. 26.599.000
3,Apple iPhone Xr 64GB,Rp. 12.999.000
4,Apple iPhone Xs Max 256GB,Rp. 22.999.000


In [23]:
# Preview the prices reduced to their digits; the result is displayed, df is not modified.
# \D removes every non-digit character (currency prefix, dots, and the space after
# the prefix), so no leading whitespace is left behind.
df['harga'].replace(r'\D', '', regex=True)

0      19599000
1      19399000
2      26599000
3      12999000
4      22999000
5      22599000
6      15999000
7      26999000
8      15499000
9      13499000
10     19999000
11     13999000
Name: harga, dtype: object

In [24]:
# Reduce each price to its digits (dropping the currency prefix, thousands dots and
# the space after the prefix), then convert to a numeric column so prices can be
# sorted and aggregated as numbers instead of compared as text.
df['harga'] = pd.to_numeric(df['harga'].replace(r'\D', '', regex=True))
df.head()

Unnamed: 0,nama,harga
0,Apple iPhone Xs 64GB,19599000
1,Apple iPhone X 256GB,19399000
2,Apple iPhone Xs 512GB,26599000
3,Apple iPhone Xr 64GB,12999000
4,Apple iPhone Xs Max 256GB,22999000


In [25]:
# Display the full cleaned table.
df

Unnamed: 0,nama,harga
0,Apple iPhone Xs 64GB,19599000
1,Apple iPhone X 256GB,19399000
2,Apple iPhone Xs 512GB,26599000
3,Apple iPhone Xr 64GB,12999000
4,Apple iPhone Xs Max 256GB,22999000
5,Apple iPhone Xs 256GB,22599000
6,Apple iPhone Xr 256GB,15999000
7,Apple iPhone Xs Max 512GB,26999000
8,Apple iPhone 8 Plus 256GB,15499000
9,Apple iPhone 8 256GB,13499000


# Case 2 Samsung - Erafone (Multiple Pages)

In [32]:
# Scrape the "samsung" search results from result pages 1-3 and build a
# DataFrame with one row per product (columns: nama, harga).
p_range = range(1, 4)
nama_produk = []
harga_produk = []
for p in p_range:
    url = f'https://eraspace.com/erafone/catalogsearch/result/?cat=40&p={p}&q=samsung'
    # The timeout keeps the loop from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    produk = soup.find_all('li', 'item product product-item')
    for x in produk:
        nama_tag = x.find('a', 'product-item-link')
        harga_tag = x.find('span', 'price')
        # find() returns None when an element is missing. Skip just that item
        # instead of abandoning the rest of the page or hiding unrelated errors.
        if nama_tag is None or harga_tag is None:
            continue
        nama = nama_tag.get_text()
        # Drop the currency prefix and thousands dots; strip() removes the space
        # that followed the prefix.
        harga = harga_tag.get_text().replace('Rp', '').replace('.', '').strip()
        nama_produk.append(nama)
        harga_produk.append(harga)
produk_dict = {'nama': nama_produk, 'harga': harga_produk}
df = pd.DataFrame(produk_dict, columns=['nama', 'harga'])
# Assign the cleaned column back explicitly: an inplace replace on a column
# selection is chained assignment and does not update df under Copy-on-Write.
# The anchored pattern trims only leading/trailing whitespace of each name.
df['nama'] = df['nama'].replace(r'^\s+|\s+$', '', regex=True)

In [34]:
# Show the first rows of the Samsung results table.
df.head()

Unnamed: 0,nama,harga
0,Samsung Galaxy Tab A 10.1 T515N 2019,4799000
1,Samsung Galaxy S10e 128GB,10499000
2,Samsung Galaxy Note10 256GB,13999000
3,Samsung Galaxy A50 (6GB/128GB),4349000
4,Samsung Galaxy Note9 (6GB/128GB) FREE Bluetoo...,12649000


# Case 3 Jakarta Notebook - iPhone Acc (Multiple Pages)

In [35]:
# Scrape the "iphone" search results from result pages 1-22 and build a
# DataFrame with one row per product (columns: nama, harga, link).
p_range = range(1, 23)
nama_produk = []
harga_produk = []
links = []
for p in p_range:
    url = f'https://www.jakartanotebook.com/search?key=iphone&page={p}'
    # The timeout keeps the loop from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    produk = soup.find_all('div', 'product-list')
    for x in produk:
        nama_tag = x.find('a', 'product-list__title')
        harga_tag = x.find('span', 'product-list__price')
        img_tag = x.find('img')
        # find() returns None when an element is missing. Skip just that item
        # instead of abandoning the rest of the page or hiding unrelated errors.
        if nama_tag is None or harga_tag is None or img_tag is None:
            continue
        nama = nama_tag.get_text()
        harga = harga_tag.get_text()
        # The "link" column holds the src attribute of the product image.
        link = img_tag.get('src')
        nama_produk.append(nama)
        harga_produk.append(harga)
        links.append(link)
produk_dict = {'nama': nama_produk, 'harga': harga_produk, 'link': links}
df = pd.DataFrame(produk_dict, columns=['nama', 'harga', 'link'])

In [37]:
# Show the first rows of the accessories table.
df.head()

Unnamed: 0,nama,harga,link
0,Anti Crack TPU Silicone Softcase for iPhone 7 ...,Rp. 10.800,https://www.jakartanotebook.com/images/product...
1,Sensitive Thermal Hardcase for iPhone 7/8 Plus...,Rp. 12.900,https://www.jakartanotebook.com/images/product...
2,Anti Crack TPU Silicone Softcase for iPhone X ...,Rp. 8.800,https://www.jakartanotebook.com/images/product...
3,Anti Crack TPU Silicone Softcase for iPhone 7 ...,Rp. 10.800,https://www.jakartanotebook.com/images/product...
4,Anti Crack TPU Silicone Softcase for Xiaomi Mi...,Rp. 3.900,https://www.jakartanotebook.com/images/product...


# Case 4 IndoXX1 - Movie (Single Page)

In [38]:
# Fetch the landing page, parse it, and collect every <div> matching class "ml-item".
url = 'https://indoxx1.kim'
# The timeout keeps the call from blocking indefinitely if the server never answers.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')
produk = soup.find_all('div', 'ml-item')

In [39]:
# Collect the info text, duration and rating of each item into three parallel
# lists (same index = same item).
nama_produk, durasi, ratings = [], [], []
for item in produk:
    info_text = item.find('div', class_='mli-info').get_text()
    duration_text = item.find('span', class_='mli-durasi').get_text()
    rating_text = item.find('span', class_='mli-rating').get_text()
    # Append only after all three lookups succeeded so the lists stay aligned.
    nama_produk.append(info_text)
    durasi.append(duration_text)
    ratings.append(rating_text)

In [40]:
# Map each column name to its scraped values, then build the table with an
# explicit column order.
produk_dict = dict(nama=nama_produk, durasi=durasi, rating=ratings)
df = pd.DataFrame(data=produk_dict, columns=['nama', 'durasi', 'rating'])

In [42]:
# Display the full table.
df

Unnamed: 0,nama,durasi,rating
0,Toy Story 4 (2019),99,8.7
1,Spider-Man: Far from Home (2019),128,8.2
2,The Battle: Roar to Victory (2019),134,6.7
3,Child's Play (2019),89,6.5
4,Anna (2019),119,6.8
5,Yesterday (2019),116,7.1
6,Paradise Hills (2019),93,7.5
7,The Divine Fury (2019),129,6.7
8,It Chapter Two (2019),160,7.2
9,Avengers: Endgame (2019),181,8.6
