In [20]:
import numpy as np 
import pandas as pd 
import urllib.request
from urllib.request import urlretrieve, Request, urlopen
import json
import time
import os
import fnmatch
import re
from bs4 import BeautifulSoup
import requests
import csv

##### URL structure

URL starting page:

Starting page - https://www.helpmefind.com/rose/plants.php?grp=A&t=2&qn=0&qc=0

Second page - https://www.helpmefind.com/rose/plants.php?grp=A&t=2&qn=1&qc=0

Third page - https://www.helpmefind.com/rose/plants.php?grp=A&t=2&qn=2&qc=0

To navigate to the next page, increase the value of the "qn=" parameter in the URL by one.

Base of the url for rose pages:

https://www.helpmefind.com/rose/

Parameters:

* 'grp' is the first letter of the rose name. grp=A returns names that start with 'A'.

* 't' determines which search tab we use on the website. In our case we are interested in tab two, which is the alphabetic lists.

* 'qn' determines the page number (starting at 0) within the letter specified by the parameter 'grp'.

* 'qc' does not make a difference in our case. Will be left as default zero.  

In [19]:
# Quick review: fetch the first index page for letter 'A' and list every
# href value found in its HTML, to see what kinds of links the page contains.

for i in range(0, 1):
    url = "https://www.helpmefind.com/rose/plants.php?grp=A&t=2&qn=" + str(i) + "&qc=0"
    print(url)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        page_html = response.read().decode("utf-8")
    # Capture the quoted value of each href attribute (single or double quotes).
    urllist = re.findall("href=[\"\'](.*?)[\"\']", page_html)
    print(urllist)
    

https://www.helpmefind.com/rose/plants.php?grp=A&t=2&qn=0&qc=0
['/css/minX/hmf.e963512e2160d6eb8f2f470607749660.minX.css', '/css/minX/cluetip.7432ee7f44458f58ee235146b03718d3.minX.css', '/css/minX/jquery-ui.5c7a0b384ab167ab29b0064c4346fef8.minX.css', '/css/minX/jquery-ui-overide.1512a14de489f2215b7525048f7b5fd6.minX.css', 'guest.php', '/gardening/donations.php', '/gardening/miscFrm.php?f=40', '/gardening/qcsFrm.php?qcCategoryID=33&qcTopicID=179&qcTyp=3', '/gardening/qcsFrm.php?qcCategoryID=33&qcTopicID=200&qcTyp=3', 'https://www.helpmefind.com/rose/index.php', 'https://www.helpmefind.com/gardening/membership.php', 'https://www.helpmefind.com/gardening/sponsorship.php', 'https://www.helpmefind.com/gardening/donations.php', 'https://www.helpmefind.com/gardening/recent.php', 'https://www.helpmefind.com/rose/plants.php', 'https://www.helpmefind.com/rose/plants.php?tab=5', 'https://www.helpmefind.com/rose/cuttings.php', 'https://www.helpmefind.com/rose/plants.php?tab=15', 'https://www.helpm

In [14]:
# Collect the relative links that point to individual rose pages from every
# index page for letter 'A'.

# Number of index pages (qn = 0..66) to walk for letter 'A'.
# NOTE(review): hard-coded page count — confirm it still matches the site.
N_INDEX_PAGES_A = 67
# Rose detail links start with this prefix, e.g. '/rose/l.php?l=2.65689'.
ROSE_LINK_PREFIX = '/rose/l.php?l'

list_of_rose_indexes_a = []
for i in range(N_INDEX_PAGES_A):
    url = "https://www.helpmefind.com/rose/plants.php?grp=A&t=2&qn=" + str(i) + "&qc=0"
    # Close each HTTP response once its body has been read.
    with urllib.request.urlopen(url) as response:
        page_html = response.read().decode("utf-8")
    urllist = re.findall("href=[\"\'](.*?)[\"\']", page_html)
    # A prefix test is used rather than fnmatch because '?' is a
    # single-character wildcard in fnmatch patterns, not a literal '?'.
    list_of_rose_indexes_a.extend(
        file_link for file_link in urllist if file_link.startswith(ROSE_LINK_PREFIX)
    )


In [15]:
# Number of rose-page links collected into list_of_rose_indexes_a by the cell above
len(list_of_rose_indexes_a)

3323

In [16]:
# Preview the first five relative links in list_of_rose_indexes_a
list_of_rose_indexes_a[0:5]

['/rose/l.php?l=2.65689',
 '/rose/l.php?l=2.35077.7',
 '/rose/l.php?l=2.58955',
 '/rose/l.php?l=2.69785',
 '/rose/l.php?l=2.35206']

In [17]:
# Turn each site-relative rose link into an absolute URL by prefixing the
# site's scheme and host.

base_link = 'https://www.helpmefind.com'
list_of_pages_to_scrape = [
    base_link + relative_link for relative_link in list_of_rose_indexes_a
]

In [18]:
# Number of absolute rose URLs; one is built per entry of list_of_rose_indexes_a
len(list_of_pages_to_scrape)

3323

Now we have all rose urls for rose names that start with letter 'A'. 

The next challenge is to identify the structure of the rose pages and scrape the data from them into the CSV file.

We will be scraping data from the html file since this is the only format that the website provides. Looks like it is time to use BeautifulSoup. 

In [21]:
# Create the output CSV and write its header row.
# Mode 'w' truncates an existing file, so re-running this cell resets the output.

filename = "Helpmefind_roses_a.csv"

# Column names, space-separated so that .split() yields the header row.
headers = 'Rose_name URL Synonyms ARS Origin Class Bloom Habit Growing Parentage Notes'

# newline='' leaves line-ending handling to the csv module.
with open(filename, 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(headers.split())


In [45]:
# First 10 rose URLs only — the sample that the scraping loop over text_list_roses iterates
text_list_roses = list_of_pages_to_scrape[0:10]

In [None]:
# Headings of the labelled sections on a rose page, in the order they are printed.
SECTION_HEADINGS = ["Origin", "Synonyms", "ARS", "Class", "Growing",
                    "Habit", "Parentage", "Notes", "Bloom"]


def _section_text(soup, heading):
    """Return the text of the section labelled `heading`, or "" if it is absent.

    Selects the `.grp` element that immediately follows a `div.hdg` element
    whose text contains "<heading>:".
    """
    # NOTE(review): recent soupsieve releases deprecate ':contains' in favour of
    # ':-soup-contains' — switch once the installed version is confirmed.
    section = soup.select_one('div.hdg:contains("{}:") + .grp'.format(heading))
    return section.text if section is not None else ""


for rose_link in text_list_roses:
    # A timeout keeps one unresponsive page from hanging the whole loop.
    source = requests.get(rose_link, timeout=30).text

    # Parsing in BeautifulSoup
    soup = BeautifulSoup(source, 'lxml')

    # Search the parsed document directly for the small-caps title span(s).
    rose_title = soup.find_all("span", style="font-variant:small-caps;")
    print(rose_title)

    # One entry per labelled section; missing sections become "".
    rose_fields = {heading: _section_text(soup, heading) for heading in SECTION_HEADINGS}
    for heading in SECTION_HEADINGS:
        print(rose_fields[heading])
    

    



In [None]:
# Getting extra data from scraped url and adding it to dictionary
#
# NOTE(review): this cell cannot run on its own as written:
#   * every statement is indented one level although no enclosing block is
#     visible in this cell — presumably it was cut from a loop body; confirm.
#   * `rose2`, `name`, `category`, `price` and `color` are not defined in any
#     cell visible in this notebook.
#   * `w` is the csv writer created inside the `with open(...)` block of the
#     CSV-creation cell, so its file is already closed once that block ends.
#   * the values written below (category, price, color, family, ...) do not
#     correspond to the CSV header names (Synonyms, ARS, Origin, ...) — verify.

    # Maps each characteristic label (<h4> text) to its value (<p> text).
    results = {}
    # Walk every <li> inside the first element with class 'characteristics-wrapper'.
    for item in rose2.find_all(class_='characteristics-wrapper')[0].find_all("li"):
        try:
            characteristic = item.h4.text
        except Exception as e:
            # Any failure reading the <h4> text falls back to the string 'None'.
            characteristic = 'None'
        try:
            # NOTE: the name `type` shadows the built-in of the same name.
            type = item.p.text
        except Exception as e:
            # Any failure reading the <p> text falls back to the string 'None'.
            type = 'None'
        results[characteristic] = type
        print('Characteristic : {}'.format(characteristic), 'Type : {}'.format(type))
        # Prints the whole dictionary accumulated so far after every item.
        print(results)

# Printing data to csv file
# Had to change encoding of name as it was not in utf-8
    # dict.get returns None for any label that was not found on the page.
    family = results.get('Family:')
    print(family)
    fragrance = results.get('Fragrance Strength:')
    print(fragrance)
    flowering = results.get('Flowering:')
    print(flowering)
    notes = results.get('Fragrance Notes:')
    print(notes)
    color2 = results.get('Colour:')
    print(color2)
    height = results.get('Height:')
    print(height)

    # `name` and `price` have all non-ASCII characters dropped before writing.
    w.writerow([(name.encode('ascii','ignore')).decode('utf-8'),url,category,(price.encode('ascii','ignore')).decode('utf-8'),color,family,fragrance,flowering,notes,color2,height])

