In [6]:
import time
import requests
import json
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from selenium import webdriver

def get_all_projects():
    """Collect the project URLs listed by experiment.com's "discover" feed.

    The feed endpoint (``/discover/more?offset=N``) returns JSON with a
    ``count`` field and a ``cards`` field holding an HTML fragment. Every
    page is parsed for project cards, and the ``href`` of each card's
    project link is turned into an absolute URL.

    Returns:
        list[str]: De-duplicated absolute project URLs, in no particular
        order (they pass through a ``set``).

    Raises:
        requests.RequestException: On a network failure, timeout or a
            non-2xx HTTP status.
        KeyError / ValueError: If the first response carries no usable
            ``count`` value.
    """
    base_url = 'https://experiment.com'
    # The paging arithmetic assumes 6 cards per page, as the original
    # offsets did -- TODO confirm against the live endpoint.
    page_size = 6
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
    }

    def fetch_page(offset):
        # Same headers on every request (the first one used to omit them)
        # and a timeout so a stalled connection cannot hang the crawl.
        response = requests.get(
            base_url + '/discover/more?offset=' + str(offset),
            headers=headers,
            timeout=30,
        )
        response.raise_for_status()
        return json.loads(response.content)

    first_page = fetch_page(0)
    total_project_count = int(str(first_page['count']).strip())
    #total_project_count = 10 # test

    project_urls = set()
    # Start at page 0: previously the offset-0 response was only used for
    # its count, so the projects on the first page were never collected.
    # The upper bound keeps the original safety margin past the last page.
    for page_index in range((total_project_count // page_size) + 2):
        if page_index == 0:
            content = first_page  # already downloaded, do not fetch twice
        else:
            content = fetch_page(page_index * page_size)
        soup = BeautifulSoup(content.get('cards') or '', 'html.parser')

        for project in soup.find_all('div', {'class': 'project-card-content'}):
            link = project.find('a', {'class': 'plain project-link home-page:clicked-project-card'})
            # A card without the expected link is skipped instead of
            # raising AttributeError on None and aborting the whole listing.
            href = link.get('href') if link is not None else None
            if href:
                project_urls.add(base_url + href)
    return list(project_urls)

def init_driver(url=None):
    """Start a headless Chrome WebDriver and optionally load a page.

    Args:
        url: Page to open. When ``None`` (the default) the driver is
            returned without navigating anywhere.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is
        responsible for shutting it down.

    Raises:
        Whatever ``webdriver.Chrome`` or ``driver.get`` raises; if the
        navigation fails the browser is shut down before re-raising.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--no-sandbox')

    # Machine-specific location of the chromedriver binary.
    CHROMEDRIVER_PATH = r"C:\Don't You Touch This !!!!!\chromedriver.exe"
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)

    if url is None:
        # The signature advertises url as optional; previously the default
        # value was passed straight to driver.get().
        return driver

    try:
        driver.get(url)
    except Exception:
        # Do not leave an orphaned browser behind when the page load fails.
        driver.quit()
        raise
    # Fixed pause after navigation, kept from the original; presumably it
    # gives the page time to render -- TODO confirm it is still needed.
    time.sleep(2)
    return driver

def _first_text(driver, *xpaths):
    """Return the text of the first XPath in ``xpaths`` that can be read.

    Every XPath except the last is optional; a failure on the last one
    propagates to the caller.
    """
    for xpath in xpaths[:-1]:
        try:
            return driver.find_element_by_xpath(xpath).text
        except Exception:
            continue
    return driver.find_element_by_xpath(xpaths[-1]).text


def _count_elements(driver, xpath, failure_label, url):
    """Return how many elements match ``xpath``, or None if the lookup fails."""
    try:
        # len() is taken on the result list itself. The original wrapped the
        # list in another list, so the count was always 1.
        return len(driver.find_elements_by_xpath(xpath))
    except Exception:
        print(failure_label, url)
        return None


def _timeline_span_days(timeline_events):
    """Return the days between the first and last timeline dates, or None."""
    try:
        beg_timeline = datetime.strptime(timeline_events[0][0], '%b %d, %Y')
        end_timeline = datetime.strptime(timeline_events[-1][0], '%b %d, %Y')
    except (IndexError, ValueError):
        # No events at all, or a date not in '%b %d, %Y' form.
        return None
    return (end_timeline - beg_timeline).days


def _scrape_project(driver, url):
    """Read one project's fields from the page currently loaded in ``driver``.

    Returns a dict with the same keys the crawler has always written.
    Lookups that are not guarded below raise if their element is missing;
    the caller treats that as a failed project.
    """
    title = driver.find_element_by_xpath('//h1[@class="title"]').text

    researchers = [
        a.text
        for a in driver.find_elements_by_xpath(
            '//*[@id="team"]//div[@class="researcher"]//div[@class="full-name"]')
    ]
    # One {lab note URL: lab note title} dict per lab-note card.
    lab_notes = [
        {a.find_element_by_tag_name('a').get_attribute('href'):
             a.find_element_by_xpath('.//div[@class="title"]').text}
        for a in driver.find_elements_by_xpath('//div[@class="labnote-card"]')
    ]
    backers = driver.find_element_by_xpath(
        '//ul[@class="backer-stats"]//span[@class="stat-number"]').text
    tags = [a.text for a in driver.find_elements_by_xpath('//div[@class="tags"]/a')]
    category = [a.text for a in
                driver.find_elements_by_xpath('//a[contains(@class, "tag category")]')]

    # Prefer the first paragraph of the "about" section, else the section text.
    about = _first_text(driver,
                        '//section[@id="about"]//p',
                        '//section[@id="about"]')
    try:
        budgets = driver.find_element_by_xpath(
            '(//div[@class="funding-raised"]//span)[3]').text
    except Exception:
        budgets = None

    endorsed_by = [a.text for a in driver.find_elements_by_xpath(
        '//div[@class="endorsement"]//div[@class="name"]')]

    time.sleep(0.1)
    try:
        # Click the "show more" link when the page has one, before the
        # timeline is read below.
        driver.find_element_by_xpath('//div[@class="show-more"]/a').click()
    except Exception:
        pass  # no such link (or not clickable): read the page as it is

    timeline_events = []
    for entry in driver.find_elements_by_xpath(
            '//div[@class="milestones-timeline"]//div[@class="description"]'):
        try:
            timeline_events.append([entry.find_element_by_tag_name("h4").text,
                                    entry.find_element_by_tag_name("h2").text])
        except Exception:
            continue  # entries lacking an h4 or h2 are skipped

    days = _timeline_span_days(timeline_events)

    project_fund_percentage = driver.find_element_by_xpath(
        '//li[@class="backer-stat second"]//span[@class="stat-number"]').text
    project_fund_rased = driver.find_element_by_xpath(
        '//div[@class="funding-raised"]//span[@class="focus-stat"]').text

    number_of_images = _count_elements(
        driver, '//div[@class="hero-image"]', "No Image:::", url)
    number_of_videos = _count_elements(
        driver, '//div[@class="hero-image js-video"]', "No Video:::", url)

    # First XPath: project still funding; second: funding finished
    # (per the original inline comments).
    goal = _first_text(
        driver,
        '(//div[@class="funding-bar-stats"]//div[@class="stat"]//span)[2]',
        '(//div[@class="description text-antialiased"])[1]')
    time_left = _first_text(
        driver,
        '//div[@class="stat float-right text-right"]//span',
        '(//div[@class="description text-antialiased"])[2]')

    return {
        "title": title,
        # None instead of IndexError when no researcher was found.
        'author': researchers[0] if researchers else None,
        "researchers": '; '.join(researchers),
        "category": '; '.join(category),
        "number_of_researchers": len(researchers),
        "backers": backers,
        "tags": tags,
        "number_of_tags": len(tags),
        "about": about,
        "budgets": budgets,
        # NOTE(review): budgets is a single text string, so this is its
        # character count, not a number of segments -- confirm intent.
        "number_of_budget_segments": len(budgets) if budgets else None,
        "project_fund_percentage": project_fund_percentage,
        "project_fund_rased": project_fund_rased,
        "number_of_images": number_of_images,
        "number_of_videos": number_of_videos,
        "days": days,
        "timeline_events": timeline_events,
        "number_of_timeline_events": len(timeline_events) if timeline_events else None,
        "lab_notes": lab_notes,
        "number_of_lab_notes": len(lab_notes) if lab_notes else None,
        "endorsed_by": '; '.join(endorsed_by),
        "number_of_endorsed_by": len(endorsed_by) if endorsed_by else None,
        'url': url,
        'time_left': time_left,
        # Strip both labels. The original passed `'Raised of' and 'Goal'`,
        # which evaluates to just 'Goal', so 'Raised of' was never removed.
        'goal': goal.replace('Raised of', '').replace('Goal', ''),
    }


def crawl_projects(testing=False):
    """Scrape every project page and save the results to disk.

    Each project is opened in its own browser via ``init_driver``. A project
    whose scrape raises is reported on stdout and skipped. The collected
    rows are written to '10.13 project.xlsx' and 'project.pkl'.

    Args:
        testing: When true, crawl two hard-coded project URLs instead of
            the full list from ``get_all_projects()``.

    Returns:
        None.
    """
    all_project_results = []
    if testing:
        project_urls = [
            "https://experiment.com/projects/do-zoonotic-pathogens-exist-in-fijian-bats-and-if-so-what-are-the-risks-of-exposure-to-humans",
            "https://experiment.com/projects/exploring-the-bioremediation-and-environmental-impact-of-halogenated-organic-compounds"
        ]
    else:
        project_urls = get_all_projects()

    for each_project_url in project_urls:
        driver = None
        try:
            driver = init_driver(url=each_project_url)
            record = _scrape_project(driver, each_project_url)
            all_project_results.append(record)
            print("Project Detail Success:::", record["title"])
        except Exception as e:
            print("Project Detail Crawl Error:::", each_project_url, str(e))
        finally:
            # Always shut the browser down. It used to be closed only on the
            # success path, so every failed project leaked a browser.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    print("Driver Quit Error:::", each_project_url, str(e))

    df = pd.DataFrame(all_project_results)
    df.to_excel('10.13 project.xlsx', index=False)
    df.to_pickle('project.pkl')
    print('Done')

In [7]:
# Run the full crawl: with testing=False the URL list comes from get_all_projects().
crawl_projects(testing=False)

Project Detail Success::: What's best for bees? Radio tracking bumblebee foraging to determine the best bee environments
Project Detail Success::: Viral Causes of Lung Cancer
Project Detail Success::: Developing a New Mouse Model for Studying Chemotherapeutic Nephrotoxicity
Project Detail Success::: Where is this vulnerable Indian Ocean seabird feeding? Using micro-GPS to track seabirds in the Indian Ocean
Project Detail Success::: How have genetics of African Lions changed over the last century?
Project Detail Success::: A Modification to Homeopathic Drug Carcinosin
Project Detail Success::: Exploring an age-old question: Can we demystify skin color?
Project Detail Success::: Cat and Human Personality Interaction Project
Project Detail Success::: Do Hummingbirds Actually Sleep?
Project Detail Success::: How do temperature and competition affect true fruit flies and their natural enemies?
Project Detail Success::: Habitat Use by the Eastern Hognose Snake on a Barrier Beach
Project Deta

Project Detail Success::: Discovering natural antibiotics from extremophile microalgae
Project Detail Success::: How does the brain quickly recognize threat and can we improve it?
Project Detail Success::: People of the Swamp: LiDAR and "Invisible" Structures at El Pilar
Project Detail Success::: Do street lights affect flight behavior of nocturnally migrating birds?
Project Detail Success::: Decreasing the Risk of Death and Diving Injuries in Artisanal Fishermen
Project Detail Success::: Identifying predator-prey relationships of spotted seatrout in Mississippi's coastal waters
Project Detail Success::: Conserving California Red-Legged Frogs
Project Detail Success::: How does natural gas fracking contribute to air pollution?
Project Detail Success::: Big Fish a Long Way from Home: Using ear bones and teeth to unravel migration in Amazonian fish
Project Detail Success::: Bio-Fluorescence on Coral Reefs as a Measure of Reef Health
Project Detail Success::: Hacked clocks tell us how an i

Project Detail Success::: Training a new generation of scientific beekeepers
Project Detail Success::: Long-term Outcome of Women with Purging Disorder
Project Detail Success::: The Ecological Impact of Invasive Lionfish (Pterois volitans and P. miles) in Bermuda
Project Detail Success::: Freshwater Biodiversity Crisis: Is Restoration the Key?
Project Detail Success::: Pecuniary & Non Pecuniary Incentives for Teachers : Evidence from Nigeria
Project Detail Success::: How did Paleolithic Hunter-Gatherers Use and Consume Plant Resources in Eurasia?
Project Detail Success::: Does fracking contaminate water with hormone disrupting chemicals?
Project Detail Success::: How Do Wolves and Beavers Interact in Boreal Ecosystems?
Project Detail Success::: Giving Children a Better Life: Understanding Developmental Coordination Disorder
Project Detail Success::: Can we use ultraviolet light to identify bats that have survived white nose syndrome?
Project Detail Success::: Creating Youth Historians:

Project Detail Success::: Using aphids to measure electrical outputs in ferns
Project Detail Success::: How do sponges influence the availability of nutrients on coral reefs?
Project Detail Success::: Achieving food sovereignty with edible insects: Breaking the cycle of poverty and malnutrition
Project Detail Success::: Eyes On The Reef; using remote cameras on cleaning stations to uncover the hidden habits of Maldivian manta rays
Project Detail Success::: How deep are microplastics in the sea? Understanding wind-driven vertical mixing
Project Detail Success::: The calf connection: California humpbacks in their Costa Rican nursery
Project Detail Success::: PLAY ON! How can a sport program better the lives of young people with mental illness?
Project Detail Success::: The DIG Field School: Hands-On Paleontology Experiences for K-12 Science Teachers
Project Detail Success::: Can our unconscious minds predict the stock market?
Project Detail Success::: Tree Climbing for Climate Change Res

Project Detail Success::: Building a gel imager on a budget
Project Detail Success::: Can Hookpods and reusable LED lights reduce albatross deaths and marine pollution?
Project Detail Success::: Does group gender composition influence how we explain success or failure?
Project Detail Success::: Mountain Rescue Anchor Strength Testing
Project Detail Success::: Tornadoes, Casualties, and Climate Change
Project Detail Success::: Where are the Pacific harbor seals in Mexico?
Project Detail Success::: Prestige, dominance, and leadership among the Chabu hunter-gatherers of Ethiopia
Project Detail Success::: How do Bobtail Squid choose their glowing bacterial partner?
Project Detail Success::: VIDAA project: Venom Investigation for the Development of novel Antimicrobial Agents
Project Detail Success::: How do Hovering Hummingbirds Survive Hot Days?
Project Detail Success::: Can a Dietary Intervention Reduce Mercury Toxicity Among Native Communities in Peru?
Project Detail Success::: What's in

Project Detail Success::: Gene therapy against vision loss in the elderly
Project Detail Success::: Filling the Knowledge Gap: Seismic Hazards at Cherry Point, WA
Project Detail Success::: Capturing canine personality
Project Detail Success::: Can music improve memories in patients with brain damage?
Project Detail Success::: Islands and Resilience: A Maya City After “Collapse”
Project Detail Success::: Why do informed consumers refrain from buying ethical products? Applying insights from behavioural economics
Project Detail Success::: How Does Technology Affect Our Memory?
Project Detail Success::: Is there climate-smart coffee? A search for resilient arabica varietals in Costa Rica
Project Detail Success::: Understanding How Mosquitoes are Repelled by DEET
Project Detail Success::: Are cities hostile environments for honey bees?
Project Detail Success::: Dicty World Race - finding the fastest and smartest Dicty cells
Project Detail Success::: Transition from Formula to Baja
Project D

Project Detail Success::: Population Ecology of Risso's Dolphins in Monterey Bay, California
Project Detail Success::: Karst Springs Initiative: Measuring Tennessee's Largest Springs
Project Detail Success::: Describing a Jurassic maniraptoran dinosaur from the Morrison Formation of North America
Project Detail Success::: A Bird's-eye View of Frog Habitats: Using a Drone to Survey Wetlands
Project Detail Success::: The pursuit of happiness: The basis of color in the happy face spider (Theridion grallator)
Project Detail Success::: Tooth plates in chimaeras and their relationship to teeth in sharks
Project Detail Success::: Illuminating the firefly genome
Project Detail Success::: How do gorillas see the world?
Project Detail Success::: Chasing orangutans in uncharted territory: how large are their home ranges?
Project Detail Success::: Friends and food...how does an artificial termite mound affect the social behavior of gorilla groups?
Project Detail Success::: What factors are causing

Project Detail Success::: Birds eavesdrop on plant indirect defenses to locate insect preys
Project Detail Success::: Did the Pima Indians design arrow points specifically to penetrate Apache rawhide shields?
Project Detail Success::: Circulating biomarkers of fibrosis in captive gorillas: a pilot study
Project Detail Success::: A fossil leaf site west of Ellensburg: part of the Ellensburg Flora? Or its own thing?
Project Detail Success::: Coquina in my Backyard: Can photogrammetry and 3D scanning be used to answer a 250 year old enigma?
Project Detail Success::: Manuka Honey: A New Tool in the Battle Against Antibiotic Resistance?
Project Detail Success::: The Neuroscience of Digital Product Experiences
Project Detail Success::: Eating tough stuff with floppy jaws - how do freshwater rays eat crabs, insects, and mollusks?
Project Detail Success::: How can a specialized diet affect the shape of the primate neck?
Project Detail Success::: The Red Devil squid: Oxygen deprived, cold, and 

Project Detail Success::: Biodiversity survey and ant species discovery in a heavily populated tropical city: Macau
Project Detail Success::: Supporting Women's Economic Entrepreneurship in Abuja, Nigeria: An Economic Experiment.
Project Detail Success::: Can music influence the longevity of human blood cells?
Project Detail Success::: What personal qualities and situational factors enable someone rise to leadership in disaster?
Project Detail Success::: Improving Collaborations for Neglected Tropical Diseases
Project Detail Success::: Thermolyze: A Temperature Controlled Kill-Switch for Containment of Pathogenic Bacteria in Research Labs
Project Detail Success::: Warm Ants! Predicting the impacts of climate change on biodiversity Down Under.
Project Detail Success::: Patterns of bee diversity in the Madrean Sky Islands: elevational habitat gradients, isolation, and gene flow
Project Detail Success::: View into Vocalizations, making affordable tags from smartwatches to track humpback w

Project Detail Success::: Oh My Science Blog! Who reads science blogs, and why?
Project Detail Success::: Colonial Period Archaeology in Northwestern Belize
Project Detail Success::: Physical experience of emotion: an early marker of Parkinson's Disease?
Project Detail Success::: Science of Sustainability: Expanding Conservation Capabilities in Cameroon
Project Detail Success::: How does lung fluke travel from snail to crab to human?
Project Detail Success::: Learning beyond the textbook: Developing dialect-specific grammar in a study abroad context
Project Detail Success::: Impacts of Oil and Natural Gas on Your Native Fishes
Project Detail Success::: Unlock the Secrets of Animals that Survive Freezing!
Project Detail Success::: Squid in Space: Symbiosis and Innate Immunity
Project Detail Success::: How have warming waters influenced reef species around Poor Knights Islands, New Zealand?
Project Detail Success::: How has the Asian Tiger Mosquito invaded the world?
Project Detail Succe

In [None]:
# Reload the DataFrame from 'project.pkl', the pickle file that crawl_projects() writes.
df = pd.read_pickle('project.pkl')