In [9]:
import string
import requests
import json
import os.path

from bs4 import BeautifulSoup
from tqdm import tqdm
from datetime import datetime

#### get settings

In [10]:
# Load the notebook configuration. The encoding is given explicitly so the
# file is decoded the same way on every platform instead of depending on the
# locale's default encoding.
with open('./settings.json', 'r', encoding='utf-8') as settings_file:
    settings = json.load(settings_file)

# Show which configuration keys are available to the cells below.
print('keys in settings file:')
list(settings.keys())

keys in settings file:


['data_dir',
 'data_file',
 'top_tags_count',
 'top_tags_file',
 'filtered_tmp_file',
 'train_size',
 'train_file',
 'train_labels_file',
 'test_file',
 'test_labels_file',
 'additional_data_dir',
 'additional_data_file',
 'objects_to_scrap']

In [12]:
# number of (question, tags) records the scraping loop should collect
needed_objects = settings['objects_to_scrap']

# Output location: the file-name template from the settings receives the
# current date and time (minute resolution) and is joined onto the
# configured directory.
timestamp = '{:%Y%m%d_%H%M}'.format(datetime.now())
data_filepath = os.path.join(
    settings['additional_data_dir'],
    settings['additional_data_file'].format(timestamp),
)

#### connection settings

In [13]:
# Site root and the path template of the "newest questions" listing; the
# placeholder receives the page number.
domain = 'https://stackoverflow.com'
questions_preview_page = '/questions?page={}&sort=newest'

# Optional proxy URL, e.g. 'http://user:password@host:port'. It is read from
# the SO_SCRAPER_PROXY environment variable so that credentials are never
# stored in the notebook. Unset or empty means no explicit proxy is used.
proxy_string = os.environ.get('SO_SCRAPER_PROXY') or None

# Mapping in the shape expected by the `proxies` argument of requests, or None
# when no explicit proxy is configured.
proxy = {'http': proxy_string, 'https': proxy_string} if proxy_string is not None else None

#### simple question text preprocessing function

In [14]:
def preprocess_quest(quest_string):
    """Normalize question text into a single line of space-separated words.

    Every ASCII punctuation character and every newline is replaced by a
    space, the result is split on whitespace, and words of length one are
    dropped.

    Args:
        quest_string: Raw question text.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing remains.
    """
    separators = string.punctuation + '\n'
    spaced = ''.join(' ' if char in separators else char for char in quest_string)
    kept_words = [token for token in spaced.split() if len(token) > 1]
    return ' '.join(kept_words)

#### receiving data

In [15]:
# Upper bound on the number of listing pages to visit, so the loop terminates
# even when many questions are skipped. The 50 is presumably the number of
# questions shown on one listing page -- TODO confirm against the site.
preview_pages_limit = (2 * (needed_objects / 50)) + 1

# Seconds to wait for a single HTTP response. Without a timeout a stalled
# connection would block the cell forever.
request_timeout = 30

processed_previews = 0
received_objects = 0

# One line per question is written as "<preprocessed text>\t<tags>".
with open(data_filepath, 'w', encoding='utf-8') as data_file, \
    tqdm(total=needed_objects) as progress_bar:

    while (received_objects < needed_objects) and (processed_previews < preview_pages_limit):

        # get preview page (page numbers start at 1)
        preview_questions_url = domain + questions_preview_page.format(processed_previews + 1)
        preview_questions_req = requests.get(preview_questions_url, proxies=proxy,
                                             timeout=request_timeout)
        processed_previews += 1

        # parse previews page and get questions url
        preview_questions_soup = BeautifulSoup(preview_questions_req.text, 'lxml')
        preview_question_boxes = preview_questions_soup.find_all('div', attrs={'class': 'summary'})

        question_urls = []

        for box in preview_question_boxes:
            link = box.find('a', attrs={'class': 'question-hyperlink'})
            # a summary box without a usable question link cannot be followed
            href = link.get('href') if link is not None else None
            if href:
                question_urls.append(href)

        # collect questions and tags
        for url in question_urls:

            quest_req = requests.get(domain + url, proxies=proxy, timeout=request_timeout)
            quest_soup = BeautifulSoup(quest_req.text, 'lxml')
            quest_text_box = quest_soup.find('div', attrs={'class': 'postcell'})

            if quest_text_box is None:
                continue

            quest_text_elem = quest_text_box.find('div', attrs={'class': 'post-text'})
            quest_tags_elem = quest_text_box.find('div', attrs={'class': 'post-taglist'})

            # Skip pages whose markup lacks the question body or the tag list
            # instead of aborting the whole run with an AttributeError.
            if quest_text_elem is None or quest_tags_elem is None:
                continue

            quest_text = preprocess_quest(quest_text_elem.text.strip())

            quest_tag_boxes = quest_tags_elem.find_all('a')
            tags_text = ' '.join([tag.text for tag in quest_tag_boxes])

            # only questions with both non-empty text and tags are stored
            if (len(quest_text) > 0) and (len(tags_text) > 0):
                data_file.write('{}\t{}\n'.format(quest_text, tags_text))
                received_objects += 1
                progress_bar.update(1)

            if received_objects >= needed_objects:
                break

100%|██████████| 200/200 [01:22<00:00,  2.03it/s]
