## Data Extraction

The following class and functions extract raw data from the various sources and store it in offline text and JSON files.

`LeetcodeDataExtractor` : 
    This class defines the basic methods required to log in to LeetCode with specific user credentials. 
    
`problem_stat_scraper` : 
    This method of the above class extracts all problem stats and dumps them into a JSON file at the following path: ../data/problems_{user}.json  
    
`submission_scraper` : 
    This method of the above class extracts all submission stats and dumps them into a JSON file at the following path: ../data/submissions_{user}.json  
    

In [17]:
from bs4 import BeautifulSoup
import requests
import pprint
import json
from requests_toolbelt import MultipartEncoder

In [18]:
class LeetcodeDataExtractor():
    '''
    Session-based client that logs in to leetcode.com and dumps API responses into json files.

    All requests go through one requests.Session so cookies set by earlier calls are reused.
    The User-Agent header is read from the module-level ``user_agent`` variable.
    '''

    def __init__(self):
        self.session = requests.Session()
        self.csrftoken = ''
        self.is_login = False

    def get_csrftoken(self):
        '''
        Function to fetch the leetcode home page and store its csrftoken cookie in self.csrftoken
        Args: None
        Returns:
            None
            self.csrftoken is left unchanged when the response carries no csrftoken cookie
        '''
        url = 'https://leetcode.com'
        # Bounded wait, matching the timeout used by the other requests of this class.
        cookies = self.session.get(url, timeout = 10).cookies
        for cookie in cookies:
            if cookie.name == 'csrftoken':
                self.csrftoken = cookie.value
                break

    def login(self, username, password):
        '''
        Function to post the login form to leetcode with the given credentials
        Args:
            username: leetcode login name
            password: leetcode password
        Returns:
            is_login: True if the session holds a LEETCODE_SESSION cookie after the request, else False
        '''
        if not self.csrftoken:
            # The form carries the CSRF token; fetch it if the caller has not done so yet.
            self.get_csrftoken()

        url = "https://leetcode.com/accounts/login"

        params_data = {
            'csrfmiddlewaretoken': self.csrftoken,
            'login': username,
            'password':password,
            'next': 'problems'
        }
        headers = {'User-Agent': user_agent, 'Connection': 'keep-alive', 'Referer': 'https://leetcode.com/accounts/login/',
        "origin": "https://leetcode.com"}
        m = MultipartEncoder(params_data)

        # The multipart boundary is generated by the encoder, so the header must come from it.
        headers['Content-Type'] = m.content_type
        self.session.post(url, headers = headers, data = m, timeout = 10, allow_redirects = False)
        self.is_login = self.session.cookies.get('LEETCODE_SESSION') is not None
        return self.is_login

    def _scrape_api(self, user, credentials, page_link, file_prefix, label):
        '''
        Shared implementation of the scrapers: log in, fetch page_link, dump the parsed json.
        Args:
            user: Name of current user (used in the output file name and the status message)
            credentials: username and password of current leetcode login
            page_link: api url to fetch
            file_prefix: output file is ../data/{file_prefix}_{user}.json
            label: word used in the printed status message
        Returns:
            page_content: the parsed json response
        '''
        username = credentials[0]
        password = credentials[1]
        if not self.login(username, password):
            # Best effort: the request is still attempted, but make the failed login visible.
            print(f'{user} login failed, requesting {page_link} without a logged in session')

        headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}
        resp = self.session.get(page_link, headers = headers, timeout = 10)
        page_content = json.loads(resp.content.decode('utf-8'))

        with open(f'../data/{file_prefix}_{user}.json', 'w', encoding='utf-8') as file_json:
            json.dump(page_content, file_json, indent=2, sort_keys=False)

        print(f'{user} {label} Scraper success')

        return page_content

    def problem_stat_scraper(self,user,credentials):
        '''
        Function to extract all content from the problemset api of a user and dump the content into a json file
        Args:
            user: Name of current user
            credentials: username and password of current leetcode login
        Returns:
            page_content: the parsed json response
            Dumps content into ../data/problems_{user}.json
        '''
        return self._scrape_api(user, credentials, "https://leetcode.com/api/problems/all/", 'problems', 'Problem')

    def submission_scraper(self,user,credentials):
        '''
        Function to extract all content from the submissions api of a user and dump the content into a json file
        Args:
            user: Name of current user
            credentials: username and password of current leetcode login
        Returns:
            page_content: the parsed json response
            Dumps content into ../data/submissions_{user}.json
        '''
        return self._scrape_api(user, credentials, "https://leetcode.com/api/submissions/", 'submissions', 'Submission')

In [19]:
def extract_leetcode_tags():
    '''
    Function to extract all the topic tags from leetcode problems page
    Args: None
    Returns:
        topic_tags: list of all topic tags (whitespace-stripped text of each tag element)
    Raises:
        AttributeError: if the page has no <span class="hide" id="all-topic-tags"> container
    '''

    page_link = 'https://leetcode.com/problemset/all/'
    page_response = requests.get(page_link, timeout=5)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # Only the full tag list is returned, so the "current-topic-tags" div is no longer
    # looked up; its result was never used and its absence raised IndexError.
    all_topics = page_content.find('span', attrs={'class':'hide', 'id':'all-topic-tags'})
    tag_elems = all_topics.find_all('span', attrs={'class':'text-sm text-gray'})

    return [elem.text.strip() for elem in tag_elems]

In [20]:
# User-Agent header value sent with the leetcode requests made by LeetcodeDataExtractor.
user_agent = r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'

def dataset_collect():
    '''
    Function to run the problem and submission scrapers for every user in the credentials file
    Args: None
    Returns:
        None
        Reads ../data/credentials.json as a json object; each key is passed to the scrapers
        as the user name and each value as that user's credentials.
    '''
    with open('../data/credentials.json', 'r', encoding='utf-8') as file_json:
        credentials = json.load(file_json)
    for name, user_credentials in credentials.items():
        # A fresh extractor per user so no cookies carry over between accounts.
        trial = LeetcodeDataExtractor()
        try:
            trial.get_csrftoken()
            trial.problem_stat_scraper(name, user_credentials)
            trial.submission_scraper(name, user_credentials)
        finally:
            # Release the connections held by this user's requests session.
            trial.session.close()

In [21]:
# Scrape every user's data, then save the topic tags one per line.
dataset_collect()
topic_tags = extract_leetcode_tags()
with open('../data/tags.txt', 'w') as file:
    # writelines adds no separators, so each entry carries its own newline.
    file.writelines(f'{elem}\n' for elem in topic_tags)

Alice Problem Scraper success
Alice Submission Scraper success
Bob Problem Scraper success
Bob Submission Scraper success
Carl Problem Scraper success
Carl Submission Scraper success
Dave Problem Scraper success
Dave Submission Scraper success
Eve Problem Scraper success
Eve Submission Scraper success
Frank Problem Scraper success
Frank Submission Scraper success
