# Import Modules

For an introduction to Selenium, see [**this guide**](https://www.scrapingbee.com/blog/selenium-python/).

In [110]:
import os

import pandas as pd
from selenium import webdriver

# Web Scraping

Read how to download the ChromeDriver binary for Chrome [**here**](https://chromedriver.chromium.org/downloads).

In [78]:
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this notebook on another machine; the original local path
# is kept as the fallback so existing setups keep working unchanged.
DRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/home/audhi/github/Web-Scraping-Using-Python-and-R/Python/knapsackfamily/chromedriver',
)

# Start a Chrome session and open the Metabolite Activity page on knapsackfamily.com.
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get('http://www.knapsackfamily.com/MetaboliteActivity/top.php')

**Input targeted species name**

In [79]:
# Get the web element for the target species text field (name="targetsp").
# find_element_by_name returns the first match, consistent with the other
# lookups in this notebook, and raises NoSuchElementException when the field
# is missing instead of an opaque IndexError.
element_species = driver.find_element_by_name('targetsp')

In [80]:
# Send the species name to the text input. The field is cleared first so the
# cell is idempotent: re-running it would otherwise append the name a second
# time to whatever is already typed.
element_species.clear()
element_species.send_keys('allium cepa')

**Click on target species checkbox**

In [81]:
# Locate the form control named "sname4" (the target-species checkbox, per the heading above).
species_checkbox = driver.find_element_by_name('sname4')

In [82]:
# Tick the checkbox only when it is not already selected, so re-running this
# cell does not toggle it back off.
if not species_checkbox.is_selected():
    species_checkbox.click()

**Click on the list button**

In [83]:
# Locate the form control named "search" (the list button, per the heading above).
list_button = driver.find_element_by_name('search')

In [84]:
# Submit the search. The next step switches to a second window handle, so the
# results presumably open in a new tab — confirm against the live page.
list_button.click()

**Switch to active tab - RESULT**

In [85]:
# Point the driver at the second window handle (index 1).
# NOTE(review): this assumes the search opened exactly one new tab and that it
# is already listed in window_handles when this cell runs — confirm, or wait
# for the handle explicitly if the cell fails with IndexError.
driver.switch_to.window(driver.window_handles[1])

# Scraping the Data

In [86]:
# Grab the <tbody> of the second <table> directly under <body>. The absolute
# XPath is tied to the page's current layout and will break if it changes.
table = driver.find_element_by_xpath('/html/body/table[2]/tbody')

**Get column header**

In [159]:
# file_data accumulates one ';'-joined string per table row, header first.
# This cell resets it, so re-run the row-collection cell after re-running this one.
file_data = []

# The first <tr> of the table carries the <th> header cells.
head_line = table.find_element_by_tag_name('tr')
headers = head_line.find_elements_by_tag_name('th')

# WebElement.text is already a str, so the former utf-8 encode/decode
# round trip was a no-op and has been dropped.
file_header = [header.text for header in headers]
file_data.append(';'.join(file_header))

In [160]:
# Display the collected column labels.
file_header

['C_ID',
 'Metabolite Name',
 'Activity Category',
 'Biological Activity (Function)',
 'Target Species',
 'Reference']

**Get rows content**

In [161]:
# Append every <tr> of the table to file_data as one ';'-joined string of its
# <td> texts. Rows without any <td> cells (e.g. header rows) yield an empty
# string, which a later cell removes.
body_rows = table.find_elements_by_tag_name('tr')
for row in body_rows:
    cells = row.find_elements_by_tag_name('td')
    # .text is already a str; the utf-8 encode/decode round trip was a no-op.
    file_data.append(';'.join(cell.text for cell in cells))

In [162]:
# Inspect the raw rows; empty strings come from rows that had no <td> cells.
file_data

['C_ID;Metabolite Name;Activity Category;Biological Activity (Function);Target Species;Reference',
 '',
 '',
 'C00000092;N6-Benzyladenine;Enhance stem growth;exhibit tuberization process;Allium cepa;Thomas,Hortc.Res.,12,(1972),77-79',
 'C00000100;Indole-3-acetic acid\nIAA;Enhance root growth;promote root elongation;Allium cepa;Diez,Planta,97,(1971),364-366',
 'C00000100;Indole-3-acetic acid\nIAA;Enhance root growth;inhibit root elongation;Allium cepa;Diez,Planta,97,(1971),364-366',
 'C00001389;S-[(E)-Prop-1-enyl]-L-cysteine S-oxide;Irritant;the main lachrymatory principle, which is produced by the action of alliinase when cut or bruised;Allium cepa;Harborne,Phytochemical Dictionary Second Edition,Taylor and Francis,(1999),Chapter10',
 'C00002359;Porritoxin;Phytotoxic;phytotoxin, a fungus causing black spot disease;Allium cepa;Harborne,Phytochemical Dictionary Second Edition,Taylor and Francis,(1999),Chapter31']

In [163]:
# Drop the empty entries produced by rows without <td> cells (the header rows).
# Filtering by content instead of deleting positions 1:3 keeps this cell
# idempotent: re-running it no longer removes real data rows.
file_data = [entry for entry in file_data if entry]

In [164]:
# Build the DataFrame: the first entry supplies the column names, and every
# remaining entry is split on ';' into one record.
column_names = file_data[0].split(';')
records = [line.split(';') for line in file_data[1:]]
data_knapsck = pd.DataFrame(data=records, columns=column_names)

In [165]:
# Display the scraped table as a DataFrame.
data_knapsck

Unnamed: 0,C_ID,Metabolite Name,Activity Category,Biological Activity (Function),Target Species,Reference
0,C00000092,N6-Benzyladenine,Enhance stem growth,exhibit tuberization process,Allium cepa,"Thomas,Hortc.Res.,12,(1972),77-79"
1,C00000100,Indole-3-acetic acid\nIAA,Enhance root growth,promote root elongation,Allium cepa,"Diez,Planta,97,(1971),364-366"
2,C00000100,Indole-3-acetic acid\nIAA,Enhance root growth,inhibit root elongation,Allium cepa,"Diez,Planta,97,(1971),364-366"
3,C00001389,S-[(E)-Prop-1-enyl]-L-cysteine S-oxide,Irritant,"the main lachrymatory principle, which is prod...",Allium cepa,"Harborne,Phytochemical Dictionary Second Editi..."
4,C00002359,Porritoxin,Phytotoxic,"phytotoxin, a fungus causing black spot disease",Allium cepa,"Harborne,Phytochemical Dictionary Second Editi..."


**Get link to inspect**

In [166]:
# Gather the href of every anchor inside the table, then echo each one.
elems = table.find_elements_by_tag_name('a')
link_data = [anchor.get_attribute('href') for anchor in elems]
for link in link_data:
    print(link)

http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00000092
http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00000100
http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00000100
http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00001389
http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00002359


In [167]:
# Display the collected links as a list.
link_data

['http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00000092',
 'http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00000100',
 'http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00000100',
 'http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00001389',
 'http://www.knapsackfamily.com/knapsack_core/information.php?sname=C_ID&word=C00002359']

**Get the whole data**

In [169]:
# Attach the collected URLs as a 'Link' column; concat aligns the Series with
# the table rows on their shared integer index.
link_column = pd.Series(data=link_data, name='Link')
data_knapsck_url = pd.concat([data_knapsck, link_column], axis=1)

In [170]:
# Display the final table, including the Link column.
data_knapsck_url

Unnamed: 0,C_ID,Metabolite Name,Activity Category,Biological Activity (Function),Target Species,Reference,Link
0,C00000092,N6-Benzyladenine,Enhance stem growth,exhibit tuberization process,Allium cepa,"Thomas,Hortc.Res.,12,(1972),77-79",http://www.knapsackfamily.com/knapsack_core/in...
1,C00000100,Indole-3-acetic acid\nIAA,Enhance root growth,promote root elongation,Allium cepa,"Diez,Planta,97,(1971),364-366",http://www.knapsackfamily.com/knapsack_core/in...
2,C00000100,Indole-3-acetic acid\nIAA,Enhance root growth,inhibit root elongation,Allium cepa,"Diez,Planta,97,(1971),364-366",http://www.knapsackfamily.com/knapsack_core/in...
3,C00001389,S-[(E)-Prop-1-enyl]-L-cysteine S-oxide,Irritant,"the main lachrymatory principle, which is prod...",Allium cepa,"Harborne,Phytochemical Dictionary Second Editi...",http://www.knapsackfamily.com/knapsack_core/in...
4,C00002359,Porritoxin,Phytotoxic,"phytotoxin, a fungus causing black spot disease",Allium cepa,"Harborne,Phytochemical Dictionary Second Editi...",http://www.knapsackfamily.com/knapsack_core/in...
