In [17]:
import time
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from selenium import webdriver

def get_all_projects():
    project_urls = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
    }
    req = requests.get('https://experiment.com/discover/more?offset=0')
    total_project_count = int(str(json.loads(req.content)['count']).strip())
    total_project_count = 10 # TODO Delete this
    for i in range(1, (total_project_count // 6) + 2):
        req = requests.get('https://experiment.com/discover/more?offset=' + str(i * 6), headers=headers)
        content = json.loads(req.content)
        soup = BeautifulSoup(content['cards'], 'html.parser')

        for project in soup.find_all('div', {'class': 'project-card-content'}):
            project_url = project.find('a', {'class': 'plain project-link home-page:clicked-project-card'}).get('href')
            project_urls.append('https://experiment.com' + project_url)
    return list(set(project_urls))

def init_driver(url=None):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--no-sandbox')

        
    CHROMEDRIVER_PATH = r"C:\Don't You Touch This !!!!!\chromedriver.exe"  # Aoran: "/Users/aoran/misc/chromedriver"  # Zhonghui: r"C:\Don't You Touch This !!!!!\chromedriver.exe"
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
    driver.get(url)
    time.sleep(1)
    return driver

def crawl_projects(testing=False):
    all_project_results = []
    if testing:
        project_urls = [
            "https://experiment.com/projects/do-zoonotic-pathogens-exist-in-fijian-bats-and-if-so-what-are-the-risks-of-exposure-to-humans",
            "https://experiment.com/projects/exploring-the-bioremediation-and-environmental-impact-of-halogenated-organic-compounds"
        ]
    else:
        project_urls = get_all_projects()
    for each_project_url in project_urls:
        try:
            driver = init_driver(url=each_project_url)

            title = driver.find_element_by_xpath('//h1[@class="title"]').text

            researchers = [a.text
                # {a.find_element_by_xpath('.//a').get_attribute('href'): a.text}
                           for a in driver.find_elements_by_xpath('//*[@id="team"]//div[@class="researcher"]//div[@class="full-name"]')]
            lab_notes = [
                {a.find_element_by_tag_name('a').get_attribute('href'): a.find_element_by_xpath(
                    './/div[@class="title"]').text}
                for a in driver.find_elements_by_xpath('//div[@class="labnote-card"]')]
            backers = driver.find_element_by_xpath('//ul[@class="backer-stats"]//span[@class="stat-number"]').text
            tags = [a.text for a in driver.find_elements_by_xpath('//div[@class="tags"]/a')]
            category = [a.text for a in driver.find_elements_by_xpath('//a[contains(@class, "tag category")]')] # tag category
            try:
                about = driver.find_element_by_xpath('//section[@id="about"]//p').text
            except:
                about = driver.find_element_by_xpath('//section[@id="about"]').text
            try:
                budgets = driver.find_element_by_xpath('(//div[@class="funding-raised"]//span)[3]').text
            except:
                budgets = None

            endorsed_by = [a.text for a in
                           driver.find_elements_by_xpath('//div[@class="endorsement"]//div[@class="name"]')]
            time.sleep(0.1)
            try:
                show_more_btn = driver.find_element_by_xpath('//div[@class="show-more"]/a').click()
            except:
                pass

            timeline = [a for a in
                        driver.find_elements_by_xpath('//div[@class="milestones-timeline"]//div[@class="description"]')]
            timeline_events = []
            for t in timeline:
                try:

                    timeline_events.append(
                        [t.find_element_by_tag_name("h4").text, t.find_element_by_tag_name("h2").text])

                except:
                    pass

            try:
                beg_timeline = datetime.strptime(timeline_events[0][0], '%b %d, %Y')
                end_timeline = datetime.strptime(timeline_events[-1][0], '%b %d, %Y')

                days = (end_timeline - beg_timeline).days
            except:
                days = None
                pass
            project_fund_percentage = driver.find_element_by_xpath(
                '//li[@class="backer-stat second"]//span[@class="stat-number"]').text
            project_fund_rased = driver.find_element_by_xpath(
                '//div[@class="funding-raised"]//span[@class="focus-stat"]').text
            
            
            try:
                number_of_images = len([driver.find_elements_by_xpath('//div[@class="hero-image"]')])
            except:
                number_of_images = None
                print("No Image:::", each_project_url)
            try:
                number_of_videos = len([driver.find_elements_by_xpath('//div[@class="hero-image js-video"]')])
            except:
                number_of_videos = None
                print("No Video:::", each_project_url)
                
            try:
                goal = driver.find_element_by_xpath('(//div[@class="funding-bar-stats"]//div[@class="stat"]//span)[2]').text#still funding
            except:
                goal = driver.find_element_by_xpath('(//div[@class="description text-antialiased"])[1]').text
            
            try:
                time_left = driver.find_element_by_xpath('//div[@class="stat float-right text-right"]//span').text #still funding
            except:
                time_left = driver.find_element_by_xpath('(//div[@class="description text-antialiased"])[2]').text#finish funding

            
            all_project_results.append({"title": title,
                                        'author': researchers[0],
                                        "researchers": '; '.join(researchers),
                                        "category": '; '.join(category),
                                        "number_of_researchers": len(researchers),
                                        "backers": backers,
                                    
                                        "tags": tags,
                                        "number_of_tags": len(tags),
                                        "about": about,
                                        "budgets": budgets,
                                        "number_of_budget_segments": len(budgets) if budgets else None,
                                        "project_fund_percentage": project_fund_percentage,
                                        "project_fund_rased": project_fund_rased,
                                        "number_of_images": number_of_images,
                                        "number_of_videos": number_of_videos,
                                        "days": days,
                                        "timeline_events": timeline_events,
                                        "number_of_timeline_events": len(timeline_events) if timeline_events else None,
                                        "lab_notes": lab_notes,
                                        "number_of_lab_notes": len(lab_notes) if lab_notes else None,
                                        "endorsed_by": '; '.join(endorsed_by),
                                        "number_of_endorsed_by": len(endorsed_by) if endorsed_by else None,
                                        'url':each_project_url,
                                        'time_left':time_left,
                                        'goal':goal.replace('Raised of'and 'Goal','')
                                        })

            driver.close()
            print("Project Detail Success:::", title)
        except Exception as e:
            print("Project Detail Crawl Error:::", each_project_url, str(e))
            pass

    df = pd.DataFrame(all_project_results)
    df.to_excel('try.xlsx', index=False)
    df.to_pickle('project.pkl')
    print('Done')

In [18]:
crawl_projects(testing=False)

Project Detail Success::: CRISPR Cas9 Testing Model
Project Detail Success::: Investigating the Impact of Beaver Dams on Parasite Community in Freshwater Ecosystems
Project Detail Success::: Engineering yeast to produce eco-friendly compounds for sunscreen
Project Detail Success::: Dogs, Cats and the Health of Capuchin monkeys at Touristic Paradise of Costa Rica
Project Detail Success::: Developing a new tissue preservation method inspired by Tardigrades
Project Detail Success::: Can DNA collected from pond water help save frogs?
Project Detail Success::: Can spiders be used as bioindicators of environmental pollution in a geothermal area?
Project Detail Success::: Do African buffalo in Kruger National Park share mosquito-borne diseases with cattle on the boundary?
Project Detail Success::: Developing a production method for the synthesis of high quality monolayer Graphene sheets
Project Detail Success::: Misappropriation of Funds in Foreign Aid: An Experimental Investigation
Project D

In [None]:
df = pd.read_pickle('project.pkl')