In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
import warnings
warnings.filterwarnings('ignore')

import logging
#logging.basicConfig(level=logging.ERROR)
#logger = logging.getLogger(__name__)
logging.getLogger("selenium").setLevel(logging.CRITICAL)

from src.pars_tools import ProxyGet, AvitoBot 
import argparse
import os
import sys
import re
import joblib
from pathlib import Path
import pandas as pd
import numpy as np
import random
import tqdm
import csv
from time import sleep
from tqdm import tqdm_notebook
from collections import OrderedDict

In [7]:
def is_interactive():
    """Tell whether the ``__main__`` module lacks a ``__file__`` attribute.

    The notebook uses this as its "running interactively" test: callers treat
    a ``True`` result as "do not read command-line arguments".

    Returns:
        bool: ``True`` when ``sys.modules['__main__']`` has no ``__file__``
        attribute, ``False`` otherwise.
    """
    main_module = sys.modules['__main__']
    if hasattr(main_module, '__file__'):
        return False
    return True

In [8]:
import json  # used by _json_dict below to read dict-valued options


def _str2bool(value):
    """Convert a command-line token to a real boolean.

    ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
    because every non-empty string is truthy, so such a flag could never be
    switched off from the command line.

    Args:
        value: The raw option value (a string from the command line, or an
            actual ``bool``).

    Returns:
        bool: The parsed value. The empty string maps to ``False``, as it did
        under ``type=bool``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


def _json_dict(value):
    """Parse a command-line token holding a JSON object into a ``dict``.

    ``type=dict`` cannot work with argparse, because ``dict('some string')``
    raises for any string given on the command line.

    Args:
        value: JSON text such as ``'{"findnewadvs": true, "daysback": 3}'``.

    Returns:
        dict: The decoded JSON object.

    Raises:
        argparse.ArgumentTypeError: If the text is not valid JSON or does not
            decode to a JSON object.
    """
    try:
        parsed = json.loads(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError('expected a JSON object, got {!r} ({})'.format(value, exc))
    if not isinstance(parsed, dict):
        raise argparse.ArgumentTypeError('expected a JSON object, got {!r}'.format(value))
    return parsed


# NOTE: `parser` is only created when this code runs as __main__ (which is the
# case inside a notebook kernel); the parse_args call below relies on it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser('arguments for setting driver and additional parsing options')
    parser.add_argument('--driver', type=str, default='Chrome')
    # headless mode:
    parser.add_argument('--headless', type=_str2bool, default=True)
    # choose category for parsing:
    #    'predlozheniya_uslug'
    #    'nedvizhimost'
    parser.add_argument('--category', type=str, default='predlozheniya_uslug')
    # pass main url for query:
    #    'https://www.avito.ru/moskovskaya_oblast/kvartiry/'
    #    'https://www.avito.ru/moskovskaya_oblast/komnaty/'
    #    'https://www.avito.ru/moskva/predlozheniya_uslug/transport_perevozki'
    parser.add_argument('--url', type=str, default='https://www.avito.ru/moskva/predlozheniya_uslug/transport_perevozki')
    # choose category subtype --> int:
    # if nedvizhimost is selected then:        1 - sobstvennik,  2 - agentstvo
    # if predlozheniya_uslug is selected then: 1 - chastnoe,     2 - companya
    parser.add_argument('--usertype', type=int, default=1)
    # define query type: 'sdam' for category=nedvizhimost or empty string ''
    parser.add_argument('--query', type=str, default='')
    # choose the way to grab each page: soup=>True or selenium=>False
    parser.add_argument('--get_wall_soup', type=_str2bool, default=True)
    parser.add_argument('--adv_scrap_soup', type=_str2bool, default=True)
    # dict-valued option, given on the command line as a JSON object
    parser.add_argument('--findnewadvs', type=_json_dict, default={'findnewadvs':False,'daysback':12})
    parser.add_argument('--useproxy', type=_str2bool, default=False)
    parser.add_argument('--usesocks', type=_str2bool, default=False)
    # pass proxy to selenium driver
    parser.add_argument('--proxylst', nargs='+', default=ProxyGet().get_random_proxy()[1])
    # pass proxy to request.get() method (JSON object on the command line)
    parser.add_argument('--proxyDict', type=_json_dict, default=ProxyGet().get_random_proxy()[0])
    parser.add_argument('--takescreenshot', type=_str2bool, default=False)
    # parse mobile=>True/web=>False version of Avito
    parser.add_argument('--parsemobile', type=_str2bool, default=True)
    # run bot in parallel mode
    parser.add_argument('--runparallel', type=_str2bool, default=True)

# work-around for Jupyter notebook and IPython console: when is_interactive()
# is true, sys.argv is ignored so every option takes its default
argv = [] if is_interactive() else sys.argv[1:]
args = parser.parse_args(argv)

## Instantiate child of AvitoBot class

In [6]:
# Overwrite the two proxy settings on `args` 150 times in a row; only the
# values from the final pass are still on `args` when the bot is built.
# NOTE(review): whether the repeated draws have any effect beyond the last one
# depends on ProxyGet - confirm before reducing the count.
for draw in range(150):
    selenium_proxy = ProxyGet().get_random_proxy()[1]
    args.proxylst = selenium_proxy
    requests_proxy = ProxyGet().get_random_proxy()[0]
    args.proxyDict = requests_proxy
bot = AvitoBot(args)

In [8]:
# bot.driver.get('https://www.whatsmyip.org/')

In [None]:
# ProxyGet().collect_proxies()
# Run proxy collection through the bot instance created in the cell above.
# NOTE(review): presumably this refreshes the proxy pool that
# get_random_proxy() draws from - confirm in src.pars_tools.
bot.collect_proxies()

In [18]:
# Unbind the notebook's `bot` name. Any later cell that uses `bot` needs a new
# AvitoBot instance to exist first.
del bot

instantiated object has been deleted


### Grab the number of pages to be parsed and find all advert links on each page

In [None]:
# An earlier cell runs `del bot`, so on a top-to-bottom run no bot is bound
# here and the loop below would raise NameError. Create one only when needed,
# so an already-live bot is reused unchanged.
if 'bot' not in globals():
    bot = AvitoBot(args)

outjson = []
offset = 1000
# Twenty consecutive price windows of width `offset`. The first lower bound is
# clamped to 1 instead of 0: (1, 1000), (1000, 2000), ..., (19000, 20000).
prices = [(max(int(i*offset),1),int((i+1)*offset)) for i in range(20)]
for pricemin, pricemax in prices:
    print(pricemin, pricemax)
    # NOTE(review): the call passes (url, pricemax, pricemin) in that order -
    # confirm it matches the advert_collect_by_pages signature.
    res = bot.advert_collect_by_pages(args.url, pricemax, pricemin)
    # Windows for which the bot returned None are skipped.
    if res is not None:
        outjson.append(res)

In [22]:
# Sum the lengths of every value held by every dict in `outjson`, then show
# the total as the cell output.
__len = sum(len(entries) for collected in outjson for entries in collected.values())
__len

1322

### Save json file with information about avito advs and relevant links for future parsing

In [10]:
# Persist `outjson` into the current working directory. joblib.dump writes
# joblib's pickle-based format, so the ".json" suffix is only a name.
joblib.dump(
    outjson,
    Path.cwd() / 'avito_links_all_moskovskaya_oblast_kvartiry_agentstva.json',
)

['C:\\Users\\anthony\\Documents\\Python Scripts\\avito_parse\\avito_links_all_moskovskaya_oblast_kvartiry_agentstva.json']

### Load previously saved json file 

In [9]:
# Read a previously dumped links file from the current working directory and
# bind the same object to both `outjson_obsol` and the working name `outjson`.
outjson = outjson_obsol = joblib.load(
    Path.cwd() / 'avito_links_all_sankt-peterburg_kvartiry_sobstvennik.json'
)

### Convert collected links into mobile-like format if we're going to parse mobile version of Avito.ru

In [None]:
# outjson = outjson_obsol
# Rewrite only a leading "www." host prefix (with or without a scheme) to
# "m.". Replacing every "www" substring, as a plain re.sub('www', 'm', ...)
# does, would also corrupt advert paths that happen to contain "www".
_WWW_HOST_PREFIX = re.compile(r'^((?:https?:)?//)?www\.')
for item in outjson:
    for p, v in tqdm_notebook(item.items()):
        for adv in v:
            # Keep whatever scheme prefix was matched (possibly none) and
            # swap just the host label; links are updated in place.
            adv['href'] = _WWW_HOST_PREFIX.sub(
                lambda m: (m.group(1) or '') + 'm.', adv['href'], count=1
            )

## Run parsing without parallelization

In [9]:
# For each non-empty dict in `outjson`, build a list of its (key, value) pairs.
parsedpages = [list(collected.items()) for collected in outjson if len(collected) > 0]

In [None]:
#total_pages = len(outjson)
# for page in tqdm_notebook(range(1,total_pages)):
# Sequential run: for every (key, value) entry, draw new proxy settings, build
# a new bot, let it navigate the entry's value, then unbind the bot.
# NOTE(review): the value returned by navigate() is overwritten on every
# iteration and is not stored anywhere in this cell.
for page_batch in parsedpages:
    for page_entry in tqdm_notebook(page_batch):
        args.proxylst = ProxyGet().get_random_proxy()[1]
        args.proxyDict = ProxyGet().get_random_proxy()[0]
        bot = AvitoBot(args)
        adverts = page_entry[1]
        res = bot.navigate(adverts)
        del bot

## Run parsing with parallelization

In [24]:
# For each non-empty dict in `outjson`, build a list of its (key, value)
# pairs; the threaded cells below index into this list.
parsedpages = [list(collected.items()) for collected in outjson if len(collected) > 0]

### Use Parallel with map-delayed

In [25]:
from multiprocessing import Process, JoinableQueue
from queue import Queue
from threading import Thread
from joblib import Parallel, delayed
import time
import json

In [26]:
def saver(q, category):
    """Consume result messages from ``q`` and append them to the category CSV.

    Each message is a string of the form
    ``"<batch index>&&&<page>&&&<JSON rows>"``. The JSON part decodes to a
    list of rows, and every row is written as one CSV line. After the rows of
    a message are written and flushed, ``"<batch index>,<page>"`` is appended
    to ``csv/parsed_pages.dat``. A ``None`` item ends the loop.

    Args:
        q: Queue-like object providing ``get()`` and ``task_done()``.
        category: ``'nedvizhimost'`` or ``'predlozheniya_uslug'``; selects the
            header row and the output file ``csv/avito_db_<category>.csv``
            under the current working directory.

    Raises:
        ValueError: If a header row has to be written (new or empty CSV file)
            but ``category`` has no known header layout.
    """
    out_dir = Path.joinpath(Path(os.getcwd()), 'csv')
    file_path = out_dir / 'avito_db_{}.csv'.format(category)
    file_path_pgs = out_dir / 'parsed_pages.dat'
    headers_by_category = {
        'nedvizhimost': ['href', 'title', 'full_text', 'phone', 'region', 'city',
                         'real_estate', 'type', 'marketplace'],
        'predlozheniya_uslug': ['href', 'title', 'full_text', 'phone', 'region', 'city',
                                'uslugi', 'type', 'marketplace'],
    }
    headers = headers_by_category.get(category)

    # Opening in append mode does not create missing parent directories.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Decide about the header before opening, so an unsupported category fails
    # with a clear error instead of an unbound `headers` name later on.
    file_is_empty = (not file_path.exists()) or os.stat(str(file_path)).st_size == 0
    if file_is_empty and headers is None:
        raise ValueError('no CSV header layout known for category {!r}'.format(category))

    with open(file_path, 'a', encoding='utf8') as outcsv:
        writer = csv.writer(outcsv, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        if file_is_empty:
            writer.writerow(headers)
        while True:
            strfrom_q = q.get()
            if strfrom_q is None:
                break
            # Split on the first two separators only: the JSON payload may
            # itself contain "&&&" inside advert text.
            indx_batch, page, arrstr = strfrom_q.split('&&&', 2)
            val = json.loads(arrstr)
            # A JSON null payload carries no rows; skip it without logging the
            # page as parsed, rather than letting the writer thread die.
            if val is not None:
                for item in val:
                    writer.writerow(item)
                # Flush before logging the page, so the page log never gets
                # ahead of the rows actually written to disk.
                outcsv.flush()
                with open(file_path_pgs, 'a', encoding='utf8') as f:
                    f.write(indx_batch + ',' + page + '\n')
            q.task_done()
        # Account for the None sentinel as well.
        q.task_done()

In [27]:
def parse_page(q, indx_batch, pagesarr):
    """Navigate every entry of ``pagesarr`` with a new bot and queue the result.

    For each entry the proxy settings on the module-level ``args`` are
    overwritten 20 times (the last draw is the one in place when the bot is
    built), an ``AvitoBot`` is created, and its ``navigate`` result for
    ``entry[1]`` is JSON-encoded. The message put on ``q`` is
    ``"<indx_batch>&&&<entry[0]>&&&<json>"``.

    Args:
        q: Queue-like object with a ``put`` method.
        indx_batch: Identifier of the batch; stringified into the message.
        pagesarr: Iterable of indexable entries; element 0 is used as the page
            label and element 1 is handed to ``navigate``.
    """
    batch_label = str(indx_batch)
    for entry in pagesarr:
        for _ in range(20):
            args.proxylst = ProxyGet().get_random_proxy()[1]
            args.proxyDict = ProxyGet().get_random_proxy()[0]
        bot = AvitoBot(args)
        payload = json.dumps(bot.navigate(entry[1]))
        q.put('&&&'.join((batch_label, str(entry[0]), payload)))
        del bot

### Using JoinableQueue with Threadings

#### Shared queue with automated Threads initialization 

In [30]:
# Split the index range of `parsedpages` into `num_partitions` arrays and show
# the resulting list of index arrays.
num_partitions = 1
arr = np.arange(len(parsedpages))
batches = np.array_split(arr, num_partitions)
batches

[array([0, 1, 2])]

In [27]:
# for i in range(6,-1,-1):
#     parsedpages[0].pop(i) 

In [None]:
for indx in tqdm_notebook(batches):
    category = args.category
    result_queue = JoinableQueue() #Queue()

    # One consumer thread writes whatever the workers put on the shared queue.
    p = Thread(target=saver, args=(result_queue, category))
    p.start()

    # One worker thread per index in this batch, all sharing the same queue.
    threadlst = [
        Thread(target=parse_page, args=(result_queue, i, parsedpages[i]))
        for i in indx
    ]

    print("Start: %s" % time.ctime())
    for worker in threadlst:
        worker.start()
    # Block until every worker of this batch has finished.
    for worker in threadlst:
        worker.join()
    print("End:   %s" % time.ctime())

    # Poison pill: tells the consumer to stop, then wait for it to exit.
    result_queue.put(None)
    p.join()

#### Shared queue with manual Threads initialization 

In [None]:
result_queue = JoinableQueue() #Queue()
# saver takes (queue, category); without the category the consumer thread
# dies immediately with a TypeError and nothing is written.
p = Thread(target=saver, args=(result_queue, args.category))
p.start()
# Create the worker thread(s) and pass the shared queue to each of them.
# parse_page takes (queue, batch index, pages); the batch index matches the
# position used in `parsedpages`.
t0 = Thread(target=parse_page, args=(result_queue, 0, parsedpages[0][:1]))
#t1 = Thread(target=parse_page, args=(result_queue, 1, parsedpages[1][:1]))
#t2 = Thread(target=parse_page, args=(result_queue, 2, parsedpages[2]))
#t3 = Thread(target=parse_page, args=(result_queue, df_split[3]))

# Starting threads...
print("Start: %s" % time.ctime())
t0.start()
#t1.start()
#t2.start()
#t3.start()

# Waiting for threads to finish execution...
t0.join()
#t1.join()
#t2.join()
#t3.join()
print("End:   %s" % time.ctime())

# After threads are done, we can read results from the queue.
# while not result_queue.empty():
#     result = result_queue.get()
#     print(result)

result_queue.put(None) # Poison pill
p.join()