# **Preply Web Scrape and Upload to Firebase**

### Task Description
<p>I need to write a python script which will scrape records from a website (each record has about ten text fields and two images which need to be downloaded to a local folder) and save all the text information to a pandas data frame along with the image file names. Each record on the website has a unique id in the url string which can be used as the index/primary key. I think using Selenium with ChromeDriver will be the easiest way to do this because I will have to login through the browser window and navigate to the main page and pick some options before starting the scraping loop.

The second part of the project will be to push the scraped information and images to my firebase project. Since each record has a unique identifier it should be pretty straight forward to use that as the primary key to save the text to firestore. Also the images will need to be uploaded to firebase storage and links to the images saved as a field in firestore for each record.</p>

In [1]:
from selenium.common.exceptions import ElementClickInterceptedException
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.webdriver.common.by import By
from time import sleep
from bs4 import BeautifulSoup
import re

In [2]:
CURRENCY_BUTTON = "#__next > section > div._15uGWh._3rnoor.X4eHUe._1o4EVj.hUFwKI._1-5zEl._2x1aeM._1WH1_Q._2JlASK.GWHMDM._1kf68Q._3nnn2i > div.styles_HeaderDropdownWrapper__uIOg5 > button"
URL = "https://preply.com/en/online/english-tutors?page={0}"
n_of_pages_selector = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > div > div > ul > li:nth-child(5) > a"
def safe_execute(error_message, exception, function, *args):
    try:
        return function(*args)
    except exception as e:
        print(e)
        return error_message
to_int = lambda raw_string: int(''.join(re.findall(r'\d+', raw_string)))

class Selectors:
    #__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > div:nth-child(15) > aside > div > div:nth-child(3)
    #__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > div:nth-child(15) > aside > div > div:nth-child(3)
    tutor = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0})"
    tutor_basic_info = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(2) > div.styles_SearchCardHeading__fL9u3"
    tutor_country = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(2) > div.styles_SearchCardHeading__fL9u3 > span > img"
    is_newly_joined = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(2) > div.styles_CoreIndicatorsBlockWrapper__JrZw6 > div > div > div:nth-child(1) > div > span"
    tutor_rating = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(2) > div.styles_CoreIndicatorsBlockWrapper__JrZw6 > div > div > div:nth-child(1) > div"
    thumbnail_img = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div.styles_SearchCardAvatarWrapper__u_zNw > div > a > picture > img"
    price = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(2) > div.styles_CoreIndicatorsBlockWrapper__JrZw6 > div > div > div:nth-child(2) > div > div.styles_PriceIndicatorPrice__vz9WR > div.styles_PriceIndicatorValue__ndpfb.styles_PriceIndicatorValueSecond__Py5XD"
    lesson_duration = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(2) > div.styles_CoreIndicatorsBlockWrapper__JrZw6 > div > div > div:nth-child(2) > div > div.styles_PriceIndicatorDuration__GKnmh"
    lessons_and_students = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(3) > div.styles_InfoWrap__CBt_k > div._15uGWh._3rnoor.X4eHUe._1o4EVj.qh7on_._22lwVW._2x1aeM._1WH1_Q._2JlASK.GWHMDM"
    tutor_languages = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(3) > div.styles_InfoWrap__CBt_k > div.styles_SpeaksWrapper__ssRz1"
    tutor_languages_more = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(3) > div.styles_InfoWrap__CBt_k > div.styles_SpeaksWrapper__ssRz1 > ul > span.styles_MoreSubjects__eJRX9"
    tutor_description = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(3) > div.styles_InfoWrap__CBt_k > div.styles_TutorDescriptionBlock__Th9E_ > div > div"
    tutor_description_more = "#__next > div.styles_PageWrap__lIYEc > main > div.styles_ResultsWrapper__kAXts > div.styles_MainContentWrapper__h3r02 > ul > li:nth-child({0}) > section > div > div:nth-child(3) > div.styles_InfoWrap__CBt_k > div.styles_TutorDescriptionBlock__Th9E_ > div > span > span"

In [14]:
# Initialize Chrome Driver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("disable-infobars")
chrome_driver_path="C:\Program Files\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(options=options)
driver.get(URL.format(1))
sleep(2)
print("Driver successfully initialized.")

Driver successfully initialized.


In [4]:
### Selecting Currency 
from selenium.webdriver.support.ui import Select

def refresh_currency():
    driver.find_element(By.CSS_SELECTOR,CURRENCY_BUTTON).click()
    select = Select(driver.find_element(By.NAME, 'currency'))
    sleep(1)
    select.select_by_visible_text("EUR")
    sleep(1)
    driver.find_element(By.CSS_SELECTOR,CURRENCY_BUTTON).click()
    select = Select(driver.find_element(By.NAME, 'currency'))
    select.select_by_visible_text("USD")

try:
    refresh_currency()
    print("Currency successfully refreshed.")
except Exception as e:
    print(f"Could not refresh currency, try again. {e}")

Currency successfully refreshed.


In [16]:
try:
    refresh_currency()
    print("Currency successfully refreshed.")
except Exception as e:
    print(f"Could not refresh currency, try again. {e}")
    
n_of_pages = int(driver.find_element(By.CSS_SELECTOR, n_of_pages_selector).text) + 1
test_n_of_pages = 11
tutors_dicttionary_list = []

for page_index in range(1,test_n_of_pages):
    driver.get(URL.format(page_index))
    sleep(3)
    for tutor_index in range(1,20):
        ## Validating Collected Selectors
        try:
            tutor_basic_info = driver.find_element(By.CSS_SELECTOR,Selectors.tutor_basic_info.format(tutor_index))
            tutor_name = tutor_basic_info.text.split('\n')[0]
            tutor_teaches = tutor_basic_info.text.split('\n')[1]
        except NoSuchElementException:
            continue
        try:
            driver.find_element(By.CSS_SELECTOR,Selectors.tutor_description_more.format(tutor_index)).click()
            sleep(0.5)
        except:
            print("Description is already fully visible - Nothing to click.")

        try:
            driver.find_element(By.CSS_SELECTOR,Selectors.tutor_languages_more.format(tutor_index)).click()
            sleep(0.5)
        except:
            print("No additional Languages - Nothing to click.")


        try:
            price = driver.find_element(By.CSS_SELECTOR,Selectors.price.format(tutor_index)).text
        except NoSuchElementException:
            pass

        try:
            lesson_duration = driver.find_element(By.CSS_SELECTOR,Selectors.lesson_duration.format(tutor_index)).text
        except NoSuchElementException:
            pass

        try:
            is_newly_joined = driver.find_element(By.CSS_SELECTOR,Selectors.is_newly_joined.format(14)).text
            is_newly_joined = True
        except NoSuchElementException:
            is_newly_joined = False

        try:
            tutor_rating = driver.find_element(By.CSS_SELECTOR,Selectors.tutor_rating.format(tutor_index)).text
            rating = tutor_rating.split('\n')[0]
            try:
                n_of_reviews = to_int(tutor_rating.split('\n')[1])
            except:
                message = "Not rated yet"
        except NoSuchElementException:
            message = "Not rated yet"

        try:
            thumbnail_img = driver.find_element(By.CSS_SELECTOR,Selectors.thumbnail_img.format(tutor_index)).get_attribute('src')
        except NoSuchElementException:
            pass

        try:
            tutor_country =  driver.find_element(By.CSS_SELECTOR,Selectors.tutor_country.format(tutor_index)).get_attribute('alt')
        except NoSuchElementException:
            pass

        try:
            lessons_and_students = driver.find_element(By.CSS_SELECTOR,Selectors.lessons_and_students.format(tutor_index)).text
            lessons = to_int(lessons_and_students.split("•")[0])
            try:
                students = to_int(lessons_and_students.split("•")[1])
            except:
                print("No Students yet.")
        except NoSuchElementException:
            pass

        try:
            languages = driver.find_element(By.CSS_SELECTOR,Selectors.tutor_languages.format(tutor_index)).text
        except NoSuchElementException:
            pass

        try:
            description = driver.find_element(By.CSS_SELECTOR,Selectors.tutor_description.format(tutor_index)).text
        except NoSuchElementException:
            pass

        tutor_dict = \
        { 
        "tutor_name":tutor_name
        ,"tutor_teaches":tutor_teaches
        ,"thumbnail_img":thumbnail_img
        ,"tutor_country":tutor_country
        ,"price":price
        ,"lesson_duration":lesson_duration
        ,"is_newly_joined":is_newly_joined
        ,"rating":rating
        ,"n_of_reviews":n_of_reviews
        ,"lessons":lessons
        ,"students":students
        ,"languages":languages
        ,"description":description}

        tutors_dicttionary_list.append(tutor_dict)
        print(tutor_dict)

Currency successfully refreshed.
No Students yet.
{'tutor_name': 'Brianna C.', 'tutor_teaches': 'English', 'thumbnail_img': 'https://avatars.preply.com/i/logos/i/logos/avatar_mm6njhw04lb.jpg?d=160x160&f=jpeg', 'tutor_country': 'United States of America', 'price': '30', 'lesson_duration': '50-min lesson', 'is_newly_joined': False, 'rating': 'Newly joined', 'n_of_reviews': 59, 'lessons': 2, 'students': 5498, 'languages': 'Speaks:\nEnglishNative Chinese (Mandarin)Beginner SpanishUpper-Intermediate', 'description': "Licensed English teacher with 7 years of experience with both kids and adults Hello, my name is Teacher Bri and I am from the USA. I enjoy traveling and learning new languages, so I can't wait to help you learn English!\nSome other things I enjoy are reading, cooking, yoga, and learning about new cultures. Do we have any of the same interests?"}
No additional Languages - Nothing to click.
No Students yet.
{'tutor_name': 'Siriporn K.', 'tutor_teaches': 'English', 'thumbnail_img'

In [19]:
tutors_dicttionary_list[-1]

{'tutor_name': 'Mike Q.',
 'tutor_teaches': 'English',
 'thumbnail_img': 'https://avatars.preply.com/i/logos/i/logos/avatar_l5nln7l83yn.jpg?d=160x160&f=jpeg',
 'tutor_country': 'United States of America',
 'price': '38',
 'lesson_duration': '50-min lesson',
 'is_newly_joined': False,
 'rating': '5',
 'n_of_reviews': 24,
 'lessons': 68,
 'students': 2989,
 'languages': 'Speaks:\nEnglishNative SpanishAdvanced',
 'description': '🔹Unlock Your Potential: Learn English with an Experienced Teacher 🔹Native Speaker California🔹English for Business, Academic & Foreign Language (EFL)🔹Pronunciation, Accent Reduction🔹Job Coach/Interview/Prep🔹University/HS ELA🔹Masters Degree 🔹NOTE: MR MIKE NOW OFFERS COMPREHENSIVE PRONUNCIATION, PHRASAL VERBS, & BUSINESS ENGLISH (using audio) COURSES TO SHARE WITH HIS DEAR STUDENTS🔹\nGreetings, I\'m Mr. Mike and I\'m from California.\nI\'m so pleased to share with you a little about me. I have a lot of general knowledge, so I think you\'ll find I can help you with wh

In [21]:
import pandas as pd

In [22]:
pd.DataFrame(data=tutors_dicttionary_list)

Unnamed: 0,tutor_name,tutor_teaches,thumbnail_img,tutor_country,price,lesson_duration,is_newly_joined,rating,n_of_reviews,lessons,students,languages,description
0,Brianna C.,English,https://avatars.preply.com/i/logos/i/logos/ava...,United States of America,30,50-min lesson,False,Newly joined,59,2,5498,Speaks:\nEnglishNative Chinese (Mandarin)Begin...,Licensed English teacher with 7 years of exper...
1,Siriporn K.,English,https://avatars.preply.com/i/logos/i/logos/ava...,Thailand,13,50-min lesson,False,Newly joined,59,2,5498,Speaks:\nEnglishProficient ThaiNative,An English teacher with 15 years of experience...
2,Tomasz Ż.,English,https://avatars.preply.com/i/logos/i/logos/888...,Poland,15,50-min lesson,False,Newly joined,59,2,5498,Speaks:\nEnglishProficient PolishNative German...,Certified teacher with 5 years of experience H...
3,Dave N.,English,https://avatars.preply.com/i/logos/i/logos/ava...,United Kingdom,15,50-min lesson,False,Newly joined,59,2,5498,Speaks:\nEnglishNative RussianUpper-Intermediate,English Speaking Trainer with 10 years experie...
4,Shannon F.,English,https://avatars.preply.com/i/logos/i/logos/ava...,South Africa,9,50-min lesson,False,Newly joined,59,2,5498,Speaks:\nEnglishNative,English tutor with 7 years of training and coa...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Wayne P.,English,https://avatars.preply.com/i/logos/i/logos/ava...,Australia,18,50-min lesson,False,5,5,20,164,Speaks:\nEnglishNative,English for Business Hello. My name is Wayne a...
96,Gareth P.,English,https://avatars.preply.com/i/logos/i/logos/ava...,United Kingdom,16,50-min lesson,False,5,6,26,432,Speaks:\nEnglishNative JapaneseIntermediate,Certified English Tutor with experience teachi...
97,Jordan M.,English,https://avatars.preply.com/i/logos/i/logos/ava...,United Kingdom,25,50-min lesson,False,5,76,69,2523,Speaks:\nEnglishNative,Certified ESL tutor with 4 years experience He...
98,Gennylyn D.,English,https://avatars.preply.com/i/logos/i/logos/ava...,Philippines,20,50-min lesson,False,5,18,28,3369,Speaks:\nEnglishNative TagalogNative CebuanoNa...,✅Licensed Professional Teacher✅Certified in Te...
