## Create Folder Structure

In [47]:
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd


with open('questions.html', 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
    rows = soup.find('tbody').find_all('tr')
    data = [row.find_all('td') for row in rows] # [[td, td, td], [td, td, td], ...]

questions = [{
    'company': td[0].find('p').text.strip()
    , 'title': td[1].find('p').text.strip()
    , 'category': td[2].find('a').text.lower().strip()
    , 'difficulty': td[3].find('a').text.lower().strip()
    , 'url': 'https://datalemur.com/questions' + td[4].find('a')['href']
    , 'file_name': td[4].find('a')['href'].split('/')[-1] + '.sql'
    } for td in data]

In [7]:
''.join(sorted(set(''.join([q['file_name'] for q in questions]))))

'&-.01247abcdefghijklmnopqrstuvwxyz'

In [45]:
for q in questions:
    file_path = Path('questions', q['category'], q['difficulty'])
    file_name = file_path.joinpath(q['file_name'])
    
    # create directory / file
    file_path.mkdir(parents=True, exist_ok=True)
    file_name.touch(exist_ok=True)
    
    comment = f"-- {q['url']}\n"
    with open(file_name, 'r+') as f:
        contents = f.readlines() # keep existing contents
        f.seek(0) # go to beginning of file

        if len(contents) == 0:
            f.writelines([comment] + [';\n']) # write comment and semicolon for fluff
        elif contents[0] != comment:
            f.writelines([comment] + contents)

In [4]:
pd.DataFrame(questions)

Unnamed: 0,company,title,category,difficulty,url,file_name
0,Facebook,Page With No Likes,sql,easy,https://datalemur.com/questions/questions/sql-...,sql-page-with-no-likes.sql
1,Tesla,Unfinished Parts,sql,easy,https://datalemur.com/questions/questions/tesl...,tesla-unfinished-parts.sql
2,Twitter,Histogram of Tweets,sql,easy,https://datalemur.com/questions/questions/sql-...,sql-histogram-tweets.sql
3,New York Times,Laptop vs. Mobile Viewership,sql,easy,https://datalemur.com/questions/questions/lapt...,laptop-mobile-viewership.sql
4,Facebook,Coin Fairness Test,statistics,easy,https://datalemur.com/questions/questions/coin...,coin-fairness-test.sql
...,...,...,...,...,...,...
172,IBM,Entropy of Univariate Gaussian Random Variable,machine learning,hard,https://datalemur.com/questions/questions/entr...,entropy-univariate-gaussian-rv.sql
173,Morgan Stanley,Normally Distributed MGF,statistics,hard,https://datalemur.com/questions/questions/norm...,normal-mgf.sql
174,Google,Blended Mean and Standard Deviation,statistics,hard,https://datalemur.com/questions/questions/blen...,blended-mean-and-std.sql
175,Citadel,Covariance of X and Y,statistics,hard,https://datalemur.com/questions/questions/cova...,covariance-x-y.sql


In [5]:
pd.DataFrame(questions).to_parquet('questions/questions.parquet', index=False)