# Imports

In [667]:
import requests
from IPython.display import display, Markdown
from bs4 import BeautifulSoup
from os import path
from tqdm.notebook import tqdm
from typeguard import typechecked
import os

In [338]:
import numpy as np
import pandas as pd

# Func

In [2]:
@typechecked
def get_page_soup(url : str, timeout : float = 30.0):
    """Download ``url`` and return its body parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the caller
            (and any scraping loop around it) indefinitely.

    Returns:
        The response content parsed with the built-in ``'html.parser'``.

    Raises:
        ValueError: If the response status is not OK (``page.ok`` is false).
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    page = requests.get(url, timeout=timeout)
    if not page.ok:
        # NOTE(review): `printmd` is not defined in the visible part of the
        # notebook; presumably a Markdown-printing helper from another cell.
        printmd(f'Page not OK, code {page.status_code}', color='red')
        raise ValueError(f'Page not OK, code {page.status_code}: {url}')
    return BeautifulSoup(page.content, 'html.parser')

In [588]:
def get_block_sizes_from_soup_trs(soup_trs):
    """Turn the ``<tr>`` rows of a clue table into one list of ints per line.

    Each element of ``soup_trs`` must provide ``find_all('td')`` returning
    objects with a ``.text`` attribute. The cell texts form a grid; when the
    grid has fewer rows than columns it is transposed first, so the result
    always has one entry per element of the longer dimension.

    Args:
        soup_trs: Iterable of table-row elements (e.g. the result of
            ``table.find_all('tr')``).

    Returns:
        A list of lists of ints. Empty / whitespace-only cells are skipped,
        so a line without any numbers yields ``[]`` instead of failing on
        ``int('')``.

    Raises:
        ValueError: If a non-empty cell does not contain an integer.
    """
    grid = [[td.text for td in tr.find_all('td')] for tr in soup_trs]

    # Same orientation rule as before: fewer rows than columns -> transpose.
    # Short rows are padded with None so ragged tables do not break.
    width = max((len(row) for row in grid), default=0)
    if len(grid) < width:
        grid = [
            [row[col] if col < len(row) else None for row in grid]
            for col in range(width)
        ]

    block_sizes = []
    for line in grid:
        texts = (text.strip() for text in line if text is not None)
        block_sizes.append([int(text) for text in texts if text])
    return block_sizes

# Ad-hoc check of the parser: row clues come from the 'cross_left' table and
# column clues from the 'cross_top' table, both nested in 'full_cross_tbl'.
# NOTE(review): `soup` is not assigned anywhere above this cell in the file;
# presumably it is left over from an earlier notebook session - confirm.
row_sizes = get_block_sizes_from_soup_trs(
    soup.find('table', id='full_cross_tbl')
        .find('table', class_='bcross', id='cross_left')
        .find_all('tr')
)
col_sizes = get_block_sizes_from_soup_trs(
    soup.find('table', id='full_cross_tbl')
        .find('table', class_='bcross', id='cross_top')
        .find_all('tr')
)

In [688]:
def get_data_from_puzzle_id(pid):
    """Fetch a puzzle page and return its row and column clue lists.

    Args:
        pid: Puzzle id; a Python ``int`` or any numpy integer scalar.

    Returns:
        A ``(row_sizes, col_sizes)`` tuple. ``row_sizes`` is parsed from the
        ``'cross_left'`` table and ``col_sizes`` from the ``'cross_top'``
        table, both located inside the ``'full_cross_tbl'`` table.

    Raises:
        TypeError: If ``pid`` is not an integer. Raised explicitly instead of
            via ``assert`` so the check survives ``python -O``.
        ValueError: If an expected clue table is missing from the page, or
            whatever ``get_page_soup`` raises for a failed download.
    """
    # np.integer covers every numpy integer width, not only int64.
    if not isinstance(pid, (int, np.integer)):
        raise TypeError('pid must be an integer')
    soup = get_page_soup(f'https://onlinenonograms.com/{pid}')

    # Look the outer table up once; both clue tables are nested inside it.
    full_cross = soup.find('table', id='full_cross_tbl')
    if full_cross is None:
        raise ValueError(f"puzzle {pid}: table 'full_cross_tbl' not found")

    def _clue_rows(table_id):
        # Return the <tr> rows of one nested clue table, failing loudly if absent.
        table = full_cross.find('table', class_='bcross', id=table_id)
        if table is None:
            raise ValueError(f"puzzle {pid}: table '{table_id}' not found")
        return table.find_all('tr')

    row_sizes = get_block_sizes_from_soup_trs(_clue_rows('cross_left'))
    col_sizes = get_block_sizes_from_soup_trs(_clue_rows('cross_top'))

    return row_sizes, col_sizes

# Smoke test: download the clues of puzzle #123 as a (row_sizes, col_sizes) tuple.
puzzle_data = get_data_from_puzzle_id(pid=123)

# Scraping

##### Scrape all puzzle meta data - <font color='red'>run only once</font>

In [670]:
# Guard: this cell walks the whole catalogue, so refuse to run again once the
# result file exists.
assert not os.path.exists('./puzzle_meta_data.csv'), 'You already have the puzzle meta-data.'

# puzzle_id -> (size, rating). Collecting into a dict and building the frame
# once avoids enlarging a DataFrame row by row (each `.loc` enlargement
# reallocates), while keeping the "last occurrence of an id wins" behaviour.
puzzle_meta = {}

for page_idx in tqdm(range(1, 100)):
    soup = get_page_soup(f'https://onlinenonograms.com/index.php?place=catalog&kat=0&color=bw&size=&star=&filtr=all&sort=sortstard&noset=2&page={page_idx}')
    for catalog_item in soup.find_all('div', class_='catitem'):
        puzzle_div = catalog_item.find('div')
        # The div text is split at the first '#': size before it, id after it.
        p_size, p_id = puzzle_div.text.split('#', 1)
        p_size = p_size.strip()
        p_id = int(p_id.strip())
        # The rating is the file-name stem of the 'Rating' image's src.
        p_rating = int(puzzle_div.find('img', title='Rating')['src'].rsplit('/', 1)[-1].split('.', 1)[0])

        puzzle_meta[p_id] = (p_size, p_rating)

df = pd.DataFrame.from_dict(puzzle_meta, orient='index', columns=['size', 'rating'])
df.index.name = 'puzzle_id'

df.to_csv('./puzzle_meta_data.csv')

AssertionError: You already have puzzle meta-data

##### Load existing puzzle meta data

In [750]:
# Load the scraped meta data, index it by integer puzzle id, and add two empty
# (None-filled) columns that will later hold the row and column clues.
puzzle_df = (
    pd.read_csv('./puzzle_meta_data.csv')
    .astype({'puzzle_id': int})
    .assign(row_data=None, col_data=None)
    .set_index('puzzle_id')
)

##### Fill meta data with the puzzle data

In [793]:
# Guard: refuse to re-scrape once the full data file has been written.
assert not os.path.exists('./puzzle_full_data.csv'), 'You already have the puzzle full-data.'

for puzzle_id in tqdm(puzzle_df.index):
    # Skip puzzles whose clues are already filled in, so re-running the cell
    # resumes where a previous run stopped.
    if pd.notna(puzzle_df.loc[puzzle_id, ['row_data', 'col_data']]).all():
        continue
    try:
        row_data, col_data = get_data_from_puzzle_id(puzzle_id)
    except Exception as err:
        # Best effort: report the failing puzzle and carry on with the rest.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt / SystemExit
        # through, so the long-running loop can still be interrupted.
        print(f'error at puzzle: {puzzle_id} ({err!r})')
    else:
        puzzle_df.at[puzzle_id, 'row_data'] = row_data
        puzzle_df.at[puzzle_id, 'col_data'] = col_data
puzzle_df.to_csv('./puzzle_full_data.csv')

AssertionError: You already have the puzzle full-data.