In [None]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import re
import time
import os
from pymongo import MongoClient
import pprint
from tqdm import tqdm
from dotenv import load_dotenv

# ---- Run configuration ----
# Execution environment: either 'aws' or 'local'.
machine = 'local'

# Total number of problems on the website, and the index of the first
# problem to process.
num_problems, start_problem = 439, 4

# First and last leaderboard page to visit.
min_page, max_page = 5, 9

# Solutions per page; pick a number less than or equal to 20.
solutions_per_page = 20

# Connect to database
# 'aws' uses MongoClient() with its default arguments; 'local' connects to the
# remote host through an explicit URI.
if machine == 'aws':
    client = MongoClient()
elif machine == 'local':
    # NOTE(review): credentials are hard-coded in this URI; consider loading
    # them from the environment instead of keeping them in source.
    client = MongoClient("mongodb://cjm715:password@3.17.9.57/py2cpp")
else:
    # Fail with a clear message rather than a NameError on `client` below.
    raise ValueError(f"machine must be 'aws' or 'local', got {machine!r}")
db = client.py2cpp

# count_documents({}) replaces Collection.count(), which was deprecated in
# PyMongo 3.7 and removed in PyMongo 4.0.
python_doc_count = db.python.count_documents({})
cpp_doc_count = db.cpp.count_documents({})
print(f'Number of python db documents: {python_doc_count}')
print(f'Number of C++ db documents: {cpp_doc_count}')

# Read the target website and the login credentials from the environment.
# load_dotenv() runs first so that values from a .env file are available.
load_dotenv()
WEBSITE = os.getenv('WEBSITE')
EMAIL = os.getenv('USERNAME')  # the login e-mail is stored under USERNAME
PASSWORD = os.getenv('PASSWORD')

# Per-language settings: display name, file suffix, the term typed into the
# leaderboard language filter, and the MongoDB collection that receives the
# scraped documents.
python_options = dict(
    name='Python 3',
    suffix='.py',
    search_term='python3',
    db_collection=db.python,
)
cpp_options = dict(
    name='C++',
    suffix='.cpp',
    search_term='cpp',
    db_collection=db.cpp,
)

# Setting up selenium web driver
# Location of the chromedriver executable on each supported machine.
if machine == 'aws':
    chromedriver = "/bin/chromedriver"
elif machine == 'local':
    chromedriver = "/Applications/chromedriver"
else:
    # Fail with a clear message rather than a NameError on `chromedriver`.
    raise ValueError(f"machine must be 'aws' or 'local', got {machine!r}")
os.environ["webdriver.chrome.driver"] = chromedriver

# Both machines run Chrome headless with the same window size; only 'aws'
# additionally points selenium at an explicit browser binary.
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920x1080")
if machine == 'aws':
    options.binary_location = "/bin/headless-chromium"
    

# Helper function for sleeping
def sleeper(lower, higher):
    """Pause for a random number of seconds between ``lower`` and ``higher``.

    The duration is ``lower`` plus a uniformly random fraction (from
    ``np.random.random()``) of the span ``higher - lower``.
    """
    span = higher - lower
    time.sleep(lower + span * np.random.random())

def open_website():
    """Start Chrome, log in to WEBSITE, and open the Algorithms page.

    Uses the module-level ``chromedriver`` path, ``options``, ``WEBSITE``,
    ``EMAIL`` and ``PASSWORD``.  A random pause (``sleeper``) follows every
    step.

    Returns:
        The selenium Chrome driver, positioned on the page reached by
        clicking the 'Algorithms' link.

    Raises:
        Whatever selenium raises while loading the page or locating the login
        elements.  The browser is shut down before the exception propagates.
    """
    # Open up website
    driver = webdriver.Chrome(chromedriver, options=options)
    try:
        driver.get(WEBSITE)
        sleeper(5, 10)

        # Login
        username_field = driver.find_element_by_id('input-1')
        username_field.send_keys(EMAIL)
        sleeper(5, 10)
        pw_field = driver.find_element_by_id('input-2')
        pw_field.send_keys(PASSWORD)
        sleeper(5, 10)
        # The first <button> on the page is clicked twice with a pause in
        # between.  NOTE(review): presumably the second click answers a
        # follow-up prompt after submitting the form -- confirm.
        driver.find_elements_by_tag_name('button')[0].click()
        sleeper(5, 10)
        driver.find_elements_by_tag_name('button')[0].click()
        sleeper(5, 10)

        # Go to algorithms page
        driver.find_element_by_link_text('Algorithms').click()
        sleeper(5, 10)
    except BaseException:
        # Do not leave an orphaned browser process behind when login fails
        # or the run is interrupted; the caller never receives the driver.
        driver.quit()
        raise

    return driver

# Function to go to a particular challenge problem by id
def go_to_leaderboard(req_challenge_id):
    """Open the leaderboard of one challenge and return the challenge's metadata.

    Loads the algorithms domain page, scrolls until the challenge list holds
    more than ``req_challenge_id`` items, reads the metadata of the item at
    that index, opens it, switches to its 'Leaderboard' tab and tries to
    reveal the solutions.  Uses the module-level ``driver``.

    Args:
        req_challenge_id: zero-based index of the challenge in the list; must
            be smaller than the module-level ``num_problems``.

    Returns:
        Tuple ``(challenge_title, challenge_difficulty, max_score,
        success_rate)`` where ``max_score`` is a float and ``success_rate``
        is the displayed percentage multiplied by 0.01.

    Raises:
        ValueError: if ``req_challenge_id`` is not below ``num_problems``.
        RuntimeError: if scrolling stops producing new challenges before the
            requested index is reached.
        Selenium exceptions / IndexError: if an expected element is missing.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not req_challenge_id < num_problems:
        raise ValueError(
            f'req_challenge_id ({req_challenge_id}) must be less than '
            f'num_problems ({num_problems})'
        )
    driver.get('https://www.hackerrank.com/domains/algorithms')
    sleeper(5, 10)

    # Scroll until you get to Challenge identified by req_challenge_id.
    # Give up after several consecutive scrolls that load nothing new, so a
    # list that stops growing cannot keep this loop running forever.
    max_stalled_scrolls = 10
    stalled_scrolls = 0
    previous_count = -1
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        sleeper(3, 4)

        # Break if number of challenges exceeds req_challenge_id
        challenge_list = driver.find_element_by_class_name('challenges-list')
        clist_items = challenge_list.find_elements_by_class_name('challenge-list-item')
        num_challenges_on_page = len(clist_items)
        if num_challenges_on_page > req_challenge_id:
            break

        if num_challenges_on_page > previous_count:
            previous_count = num_challenges_on_page
            stalled_scrolls = 0
        else:
            stalled_scrolls += 1
            if stalled_scrolls >= max_stalled_scrolls:
                raise RuntimeError(
                    f'Challenge list stopped growing at {num_challenges_on_page} '
                    f'items; cannot reach challenge {req_challenge_id}'
                )

        sleeper(1, 2)
        print(num_challenges_on_page)
    sleeper(5, 10)

    # Get individual challenge data and click.  The list is looked up again
    # after the pause rather than reusing the elements found while scrolling.
    challenge_list = driver.find_element_by_class_name('challenges-list')
    challenge_item = challenge_list.find_elements_by_class_name('challenge-list-item')[req_challenge_id]
    # Only the first line of the card title text is kept.
    challenge_title = challenge_item.find_element_by_class_name('challengecard-title').text.split('\n')[0]
    challenge_difficulty = challenge_item.find_element_by_class_name('difficulty').text
    max_score_str = challenge_item.find_element_by_class_name('max-score').text
    # Raw string: '\d' in a normal string literal is an invalid escape.
    max_score = float(re.findall(r'\d+', max_score_str)[0])
    success_rate_str = challenge_item.find_element_by_class_name('success-ratio').text
    # First (optionally decimal) number in the text, scaled by 0.01.
    success_rate = 0.01 * float(re.findall(r'\d+(?:\.\d+)?', success_rate_str)[0])
    print(f'challenge_title: {challenge_title}')
    print(f'challenge_difficulty: {challenge_difficulty}')
    print(f'max_score: {max_score}')
    print(f'success_rate: {success_rate}')

    challenge_item.click()
    sleeper(5, 10)

    # Go to Leaderboard
    driver.find_element_by_link_text('Leaderboard').click()
    sleeper(5, 10)
    reveal_button = driver.find_elements_by_tag_name('button')
    try:
        reveal_button[0].click()
        sleeper(5, 10)
        driver.find_element_by_class_name('hr_primary-btn').click()
    except Exception:
        # Best effort: any failure here is treated as "nothing to reveal".
        # `Exception` (not a bare except) keeps Ctrl-C working.
        print('solutions already revealed')

    sleeper(5, 10)
    return challenge_title, challenge_difficulty, max_score, success_rate

# Filter by language (used soon in two cells)

def filter_by_language(lang_options):
    """Apply a language filter to the leaderboard currently shown.

    Empties the second 'ac-input' element on the page with one backspace per
    existing character, types ``lang_options['search_term']`` into it and
    presses ENTER.  A random pause (``sleeper``) surrounds every step.  Uses
    the module-level ``driver``.
    """
    sleeper(1, 3)
    input_boxes = driver.find_elements_by_class_name('ac-input')
    language_box = input_boxes[1]
    sleeper(1, 3)

    # Erase whatever is currently typed in the box, one character at a time.
    chars_to_erase = len(language_box.get_attribute('value'))
    while chars_to_erase > 0:
        language_box.send_keys(Keys.BACK_SPACE)
        sleeper(0.1, 0.5)
        chars_to_erase -= 1

    # Type the search term, then confirm it.
    sleeper(1, 3)
    search_term = lang_options['search_term']
    language_box.send_keys(search_term)

    sleeper(1, 3)
    language_box.send_keys(Keys.ENTER)
    sleeper(1, 3)
    
def save_info_from_row_elem(row):
    """Scrape one leaderboard row and store its solution in MongoDB.

    Reads rank, language, score and the 'View solution' link from ``row``,
    opens that link to capture the page's body text as the solution code,
    navigates back to the leaderboard, and inserts one document into
    ``lang_options['db_collection']``.

    Relies on module-level state set by the main loop: ``driver``,
    ``lang_options``, ``c_title``, ``c_difficulty``, ``c_max_score`` and
    ``c_success_rate``.

    Args:
        row: selenium element for one leaderboard table row.

    Returns:
        None.  Scraping and insert failures are reported with a printed
        message and the row is skipped.
    """
    try:
        rank = int(row.find_element_by_class_name('rank').text)
        language = row.find_element_by_class_name('language').text
        score = float(row.find_element_by_class_name('score').text)
        solution_link = row.find_element_by_link_text('View solution')
        url = solution_link.get_attribute('href')
    except Exception:
        print('Error occured while extracting meta data for solution...')
        # Without a solution URL there is nothing to visit; skip the row
        # explicitly instead of relying on a NameError further down.
        sleeper(1.0, 1.5)
        return

    # Navigate to solution page and get code as a string
    curr_leaderboard_url = driver.current_url
    code = None  # stays None when the solution page could not be read
    try:
        driver.get(url)
        sleeper(1.0, 1.5)
        code = driver.find_element_by_tag_name('body').text
    except Exception:
        print('Error occured while extracting code ...')

    # Navigate back to leaderboard (also after a failed extraction, because
    # the browser may already have left the leaderboard page).
    driver.get(curr_leaderboard_url)

    # Save to mongodb server
    if code is not None:
        try:
            doc = {
                'challenge_title': c_title,
                'challenge_difficulty': c_difficulty,
                'max_score': c_max_score,
                'success_rate': c_success_rate,
                'rank': rank,
                'language': language,
                'score': score,
                'code': code,
            }
            lang_options['db_collection'].insert_one(doc)
        except Exception:
            print('Error occured by inserting document to mongo...')
    sleeper(1.0, 1.5)
    

# open website
driver = open_website()

# Collect solutions: for every problem and every language, walk the configured
# range of leaderboard pages and store each listed solution.
try:
    # Loop over problems
    for problem_id in range(start_problem, num_problems):
        print(f'Problem Id: {problem_id}')
        try:
            c_title, c_difficulty, c_max_score, c_success_rate = go_to_leaderboard(problem_id)
            print('Successfully loaded challenge meta data ...')
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C stops the run
            # instead of merely skipping to the next problem.
            print('Error occured while loading meta data. Continuing to next problem ...')
            continue

        # Loop over languages
        for lang_options in [python_options, cpp_options]:

            # Filter by language
            print(f'\tLanguage: {lang_options["name"]} \n')
            filter_by_language(lang_options)

            # Loop through leaderboard pages
            url_without_page_number = driver.current_url
            # NOTE(review): one is subtracted from the displayed last page
            # number; the reason is not evident from this file.
            last_page = int(driver.find_element_by_class_name('last-page').text) - 1
            min_page_capped = min(min_page, last_page)
            max_page_capped = min(max_page, last_page)
            print('\t\tLooping through leaderboard pages ...')
            for page_number in tqdm(range(min_page_capped, max_page_capped + 1)):

                # go to leaderboard page number
                url_with_page_number = url_without_page_number + f'?page={page_number}'
                driver.get(url_with_page_number)
                sleeper(2, 5)

                # Loop through solutions on page.  The configured
                # solutions_per_page acts as an upper bound and is no longer
                # overwritten by the number of rows found.
                rows_on_page = len(driver.find_elements_by_class_name('table-row'))
                rows_to_scrape = min(solutions_per_page, rows_on_page)
                for j in range(rows_to_scrape):

                    # Look the row up again on every iteration:
                    # save_info_from_row_elem navigates away and back, so
                    # elements located before that would be stale.
                    row = driver.find_elements_by_class_name('table-row')[j]
                    save_info_from_row_elem(row)

                # Stop paging once the 'next page' control is disabled
                next_page_elem = driver.find_element_by_class_name('next-page')
                if 'disabled' in next_page_elem.get_attribute('class'):
                    break
finally:
    # Always shut the browser down, including on errors and Ctrl-C.
    driver.quit()



Number of python db documents: 2164
Number of C++ db documents: 1610




Problem Id: 4
challenge_title: Diagonal Difference
challenge_difficulty: Easy
max_score: 10.0
success_rate: 0.9586
solutions already revealed
Successfully loaded challenge meta data ...
	Language: Python 3 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:46<07:04, 106.07s/it][A
 40%|████      | 2/5 [03:31<05:16, 105.54s/it][A
 60%|██████    | 3/5 [05:14<03:29, 104.85s/it][A
 80%|████████  | 4/5 [06:59<01:44, 104.96s/it][A
100%|██████████| 5/5 [08:44<00:00, 104.81s/it][A
[A

	Language: C++ 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:44<06:58, 104.73s/it][A
 40%|████      | 2/5 [03:22<05:03, 101.23s/it][A
 60%|██████    | 3/5 [05:01<03:21, 100.64s/it][A
 80%|████████  | 4/5 [06:47<01:41, 102.00s/it][A
100%|██████████| 5/5 [08:25<00:00, 101.15s/it][A
[A

Problem Id: 5
challenge_title: Plus Minus
challenge_difficulty: Easy
max_score: 10.0
success_rate: 0.9812000000000001
solutions already revealed
Successfully loaded challenge meta data ...
	Language: Python 3 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:55<07:41, 115.29s/it][A
 40%|████      | 2/5 [03:50<05:45, 115.11s/it][A
 60%|██████    | 3/5 [05:24<03:36, 108.31s/it][A
 80%|████████  | 4/5 [07:10<01:47, 107.68s/it][A
100%|██████████| 5/5 [08:59<00:00, 107.93s/it][A
[A

	Language: C++ 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:50<07:22, 110.53s/it][A
 40%|████      | 2/5 [03:37<05:25, 108.62s/it][A
 60%|██████    | 3/5 [05:21<03:34, 107.20s/it][A
 80%|████████  | 4/5 [07:02<01:45, 105.72s/it][A
100%|██████████| 5/5 [08:50<00:00, 106.03s/it][A
[A

Problem Id: 6
challenge_title: Staircase
challenge_difficulty: Easy
max_score: 10.0
success_rate: 0.9828
solutions already revealed
Successfully loaded challenge meta data ...
	Language: Python 3 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:40<06:40, 100.20s/it][A
 40%|████      | 2/5 [03:22<05:03, 101.04s/it][A
 60%|██████    | 3/5 [05:01<03:21, 100.59s/it][A
 80%|████████  | 4/5 [06:45<01:41, 101.46s/it][A
100%|██████████| 5/5 [08:32<00:00, 102.58s/it][A
[A

	Language: C++ 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:44<06:58, 104.67s/it][A
 40%|████      | 2/5 [03:30<05:15, 105.11s/it][A
 60%|██████    | 3/5 [05:10<03:27, 103.50s/it][A
 80%|████████  | 4/5 [06:55<01:43, 103.82s/it][A
100%|██████████| 5/5 [08:37<00:00, 103.51s/it][A
[A

Problem Id: 7
challenge_title: Mini-Max Sum
challenge_difficulty: Easy
max_score: 10.0
success_rate: 0.9268000000000001
Successfully loaded challenge meta data ...
	Language: Python 3 




  0%|          | 0/5 [00:00<?, ?it/s][A

		Looping through leaderboard pages ...



 20%|██        | 1/5 [01:52<07:28, 112.20s/it][A
 40%|████      | 2/5 [03:40<05:30, 110.33s/it][A
 60%|██████    | 3/5 [05:27<03:38, 109.05s/it][A
 80%|████████  | 4/5 [07:13<01:48, 108.28s/it][A