In [1]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import re
import time
import os
from pymongo import MongoClient
import pprint
from tqdm import tqdm
from dotenv import load_dotenv

# Parameters
machine = 'local' # 'aws', 'aws-2', or 'local'
num_problems = 439 # Number of problems on website
start_problem = 170
min_page = 2
max_page = 52
skip = 10
solutions_per_page = 20 # Pick number less than or equal to 20. 
IP_ADDRESS = '13.58.253.233'

# Connect to database
if machine == 'aws':
    client = MongoClient()   
elif machine == 'local' or machine == 'aws-2':
    client = MongoClient(f"mongodb://cjm715:password@{IP_ADDRESS}/py2cpp")
    
db = client.py2cpp
print(f'Number of python db documents: {db.python.estimated_document_count()}')
print(f'Number of C++ db documents: {db.cpp.estimated_document_count()}')

# Load website and credentials
load_dotenv()
EMAIL = os.environ.get('USERNAME')
PASSWORD = os.environ.get('PASSWORD')
WEBSITE = os.environ.get('WEBSITE')

# Language options
python_options = {
    'name' : 'Python 3',
    'suffix': '.py',
    'search_term': 'python3',
    'db_collection': db.python
}
cpp_options = {
    'name' : 'C++',
    'suffix': '.cpp',
    'search_term': 'cpp',
    'db_collection': db.cpp
}

# Setting up selenium web driver
if machine == 'aws' or machine == 'aws-2':
    chromedriver = "/bin/chromedriver"
elif machine == 'local':
    chromedriver = "/Applications/chromedriver" 
os.environ["webdriver.chrome.driver"] = chromedriver
options = Options()


if machine == 'aws' or machine == 'aws-2':
    options.add_argument("--headless")
    options.add_argument("--window-size=1920x1080")
    options.binary_location =  "/bin/headless-chromium"
# else:
#     options.add_argument("--headless")
#     options.add_argument("--window-size=1920x1080")
    

# Helper function for sleeping
def sleeper(lower,higher):
    delay = lower + (higher-lower)*np.random.random()
    time.sleep(delay)

def open_website():
    # Open up website
    driver = webdriver.Chrome(chromedriver,options=options)
    driver.get(WEBSITE)
    sleeper(5,10)

    # Login
    username_field = driver.find_element_by_id('input-1')
    username_field.send_keys(EMAIL) 
    sleeper(5,10)
    pw_field = driver.find_element_by_id('input-2')
    pw_field.send_keys(PASSWORD) 
    sleeper(5,10)
    driver.find_elements_by_tag_name('button')[0].click()
    sleeper(5,10)
    driver.find_elements_by_tag_name('button')[0].click()
    sleeper(5,10)

    # Go to algorithms page
    driver.find_element_by_link_text('Algorithms').click()
    sleeper(5,10)
    
    return driver

# Function to go to a particular challenge problem by id
def go_to_leaderboard(req_challenge_id):
    assert req_challenge_id < num_problems
    driver.get('https://www.hackerrank.com/domains/algorithms')
    sleeper(5,10)

    # Scroll until you get to Challenge identified by req_challenge_id
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        sleeper(3,4)

        # Break if number of challenges exceeds req_challenge_id
        challenge_list = driver.find_element_by_class_name('challenges-list')
        clist_items = challenge_list.find_elements_by_class_name('challenge-list-item')
        num_challenges_on_page = len(clist_items)
        if num_challenges_on_page > req_challenge_id:
            break

        sleeper(1,2)
        print(num_challenges_on_page)
    sleeper(5,10)
    
   
    # Get individual challenge data and click 
    challenge_list = driver.find_element_by_class_name('challenges-list')
    challenge_item = challenge_list.find_elements_by_class_name('challenge-list-item')[req_challenge_id]
    challenge_title = challenge_item.find_element_by_class_name('challengecard-title').text.split('\n')[0]
    challenge_difficulty = challenge_item.find_element_by_class_name('difficulty').text
    max_score_str = challenge_item.find_element_by_class_name('max-score').text
    max_score= float(re.findall('\d+', max_score_str)[0])
    success_rate_str = challenge_item.find_element_by_class_name('success-ratio').text
    success_rate = 0.01*float(re.findall('\\d+(?:\\.\\d+)?', success_rate_str)[0])
    print(f'challenge_title: {challenge_title}')
    print(f'challenge_difficulty: {challenge_difficulty}')
    print(f'max_score: {max_score}')
    print(f'success_rate: {success_rate}')

    challenge_item.click()
    sleeper(5,10)

    # Go to Leaderboard
    driver.find_element_by_link_text('Leaderboard').click()
    sleeper(5,10)

    try:
        reveal_button = driver.find_element_by_class_name('unlock-solutions')
        reveal_button = reveal_button.find_element_by_tag_name('div')
        reveal_button.click()
        sleeper(5,10)
        driver.find_element_by_class_name('hr_primary-btn').click()
    except:
        print('solutions already revealed')

    sleeper(5,10) 
    return challenge_title,challenge_difficulty,max_score,success_rate

# Filter by language (used soon in two cells)

def filter_by_language(lang_options):
    sleeper(1,3)
    language_field = driver.find_elements_by_class_name('ac-input')[1]
    sleeper(1,3)
    for _ in range(len(language_field.get_attribute('value'))):
        language_field.send_keys(Keys.BACK_SPACE)
        sleeper(0.1,0.5)

    sleeper(1,3)
    language_field.send_keys(lang_options['search_term'])

    sleeper(1,3)
    language_field.send_keys(Keys.ENTER)
    sleeper(1,3)
    
def save_info_from_row_elem(row):
    success = True # true unless error arises and will change to false
    try:
        rank = int(row.find_element_by_class_name('rank').text)
        language = row.find_element_by_class_name('language').text
        score = float(row.find_element_by_class_name('score').text)
        solution_link = row.find_element_by_link_text('View solution')
        url = solution_link.get_attribute('href')
    except Exception as e:
        print('Error occured while extracting meta data for solution...')
        print(e)
        success = False

    # Navigate to solution page and get code as a string
    curr_leaderboard_url = driver.current_url
    try:
        driver.get(url)
        sleeper(1.0,1.5)
        b = driver.find_element_by_tag_name('body')
        code = b.text
    except:
        print('Error occured while extracting code ...')
        success = False
        pass
    
    # Navigate back to leaderboard
    driver.get(curr_leaderboard_url)
    
    # Save to mongodb server
    if success:
        try:
            doc = {
                'challenge_title': c_title,
                'challenge_difficulty': c_difficulty,
                'max_score': c_max_score,
                'success_rate': c_success_rate,
                'rank' : rank,
                'language' : language,
                'score': score,
                'code' : code
            }
            lang_options['db_collection'].insert_one(doc)
#             print(doc)
        except:
            print('Error occured by inserting document to mongo...')
            pass
    sleeper(1.0,1.5)
    

Number of python db documents: 25199
Number of C++ db documents: 26876


In [2]:
# open website
driver = open_website()    

# Collect solutions


#Loop over problems
for problem_id in range(start_problem,num_problems):
    print(f'Problem Id: {problem_id}')
    try:
        c_title,c_difficulty,c_max_score,c_success_rate=go_to_leaderboard(problem_id)
        print('Successfully loaded challenge meta data ...')
    except:
        print('Error occured while loading meta data. Continuing to next problem ...')
        continue
        
    # Loop over languages
    for lang_options in [python_options, cpp_options]:
        
        # Filter by language
        print(f'\tLanguage: {lang_options["name"]} \n')
        filter_by_language(lang_options)
        
        # Loop through leaderboard pages
        curr_url = driver.current_url
        assert curr_url.endswith('&page=1')
        url_without_page_number = curr_url.replace('&page=1','')
        last_page = int(driver.find_element_by_class_name('last-page').text) - 1
        min_page_capped = min(min_page,last_page)
        max_page_capped = min(max_page,last_page) 
        print('\t\tLooping through leaderboard pages ...')
        for page_number in tqdm(range(min_page_capped,max_page_capped + 1,skip)):
            
            # go to leaderboard page number
            
            url_with_page_number = url_without_page_number + f'&page={page_number+1}'
            driver.get(url_with_page_number)
            sleeper(3,6)
            
            # Loop through solutions on page
            for j in range(solutions_per_page):
                # Get meta data on solution such as rank, language, and score   
                table_row_elems = driver.find_elements_by_class_name('table-row')
                if j<len(table_row_elems):
                    row = table_row_elems[j]
                    save_info_from_row_elem(row)


Problem Id: 170
19
29
39
49
59
69
79
89
99
109
119
129
139
149
159
169
challenge_title: Kruskal (MST): Really Special Subtree
challenge_difficulty: Medium
max_score: 50.0
success_rate: 0.8248000000000001
solutions already revealed
Successfully loaded challenge meta data ...
	Language: Python 3 



  0%|          | 0/6 [00:00<?, ?it/s]

		Looping through leaderboard pages ...


100%|██████████| 6/6 [10:11<00:00, 101.85s/it]


	Language: C++ 



  0%|          | 0/6 [00:00<?, ?it/s]

		Looping through leaderboard pages ...


100%|██████████| 6/6 [12:40<00:00, 126.72s/it]


Problem Id: 171
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10


  0%|          | 0/6 [00:00<?, ?it/s]

		Looping through leaderboard pages ...


100%|██████████| 6/6 [09:29<00:00, 94.95s/it] 


	Language: C++ 



  0%|          | 0/6 [00:00<?, ?it/s]

		Looping through leaderboard pages ...


 83%|████████▎ | 5/6 [07:59<01:35, 95.85s/it]

TimeoutException: Message: timeout
  (Session info: chrome=72.0.3626.119)
  (Driver info: chromedriver=2.35.528157 (4429ca2590d6988c0745c24c8858745aaaec01ef),platform=Mac OS X 10.13.4 x86_64)
