In [1]:
# 3rd party imports
from aiohttp import ClientSession
import asyncio
import nest_asyncio
import pandas as pd

# Built-in imports
import math

# Local imports
from services.scraping.scrape import get_url, get_data

In [2]:
nest_asyncio.apply()

In [3]:
df: pd.DataFrame = pd.read_csv('../data/New_York_Airbnb_4_dec_2021_cleaned.csv', sep=',', header=0)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,rating,images
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75356,-73.98559,Entire home/apt,150,30,48,2019-11-04,0.33,3,338,0,4.7,https://a0.muscache.com/im/pictures/f0813a11-4...
1,3831,"Whole flr w/private bdrm, bath & kitchen(pls r...",4869,LisaRoxanne,Brooklyn,Bedford-Stuyvesant,40.68494,-73.95765,Entire home/apt,75,1,409,2021-10-22,4.86,1,194,32,,
2,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68535,-73.95512,Private room,60,30,50,2016-06-05,0.52,2,365,0,4.52,https://a0.muscache.com/im/pictures/2090980c-b...
3,5136,"Spacious Brooklyn Duplex, Patio + Garden",7378,Rebecca,Brooklyn,Sunset Park,40.66265,-73.99454,Entire home/apt,275,5,2,2021-08-08,0.02,1,123,1,,
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Midtown,40.76457,-73.98317,Private room,68,2,507,2021-11-08,3.68,1,192,33,4.22,https://a0.muscache.com/im/pictures/12065/f070...


In [4]:
limit: int = 5000

async def fetch(index: int, row: pd.Series, session: ClientSession) -> None:
    """ Fetch the data from the url and save it to the dataframe.
    :param index: The index of the row.
    :param row: The row of the dataframe.
    :param session: The aiohttp session.
    :return: None
    """
    url: str = get_url(row['id'])
    async with session.get(url) as response:
        response_str: str = await response.text()
        data: dict = get_data(room_id=row['id'], response=response_str)
        df.at[index, 'rating'] = data['rating']
        if data['images']:
            df.at[index, 'images'] = ','.join(data['images'])


async def bound_fetch(sem: asyncio.Semaphore, index: int, row: pd.Series, session: ClientSession) -> None:
    """ A bounded fetch function using Semaphore to limit concurrent requests.
    :param sem: asyncio.Semaphore
    :param index: int
    :param row: pd.Series
    :param session: ClientSession
    :return: None
    """
    # Getter function with semaphore.
    async with sem:
        await fetch(index=index, row=row, session=session)

async def run(session: ClientSession, df: pd.DataFrame) -> None:
    """ Run the bounded fetch function.
    :param session: ClientSession
    :param df: pd.DataFrame
    :return: None
    """
    tasks: list = []
    sem: asyncio.Semaphore = asyncio.Semaphore(limit)
    for index, row in df.iterrows():
        if not row['images'] or math.isnan(row['rating']):
            task: asyncio.Task = asyncio.ensure_future(bound_fetch(sem=sem, index=index, row=row, session=session))
            tasks.append(task)
    responses: asyncio.coroutines = asyncio.gather(*tasks)
    await responses

In [5]:
df['images'] = None
df['rating'] = None

In [6]:
while df['images'].count() < 22000:
    loop = asyncio.get_event_loop()
    async with ClientSession() as session:
        loop.run_until_complete(asyncio.ensure_future(run(session, df)))
    df.to_csv('../data/New_York_Airbnb_4_dec_2021_cleaned_with_rating_and_images.csv', sep=',', index=False)

20225
20238
20243
20252
20259
20266
20270
20279
20290
20296
20306
20312
20318
20325
20328
20332
20335
20340
20348
20352
20357
20361
20365
20371
20379
20384
20389
20394
20398
20404
20406
20408
20414
20417
20419
20420
20425
20429
20432
20437
20438
20440
20443
20446
20449
20451
20451
20454
20459
20461
20464
20465
20466
20470
20471
20473
20476
20476
20480
20482
20489
20490
20493
20494
20496
20497
20498
20499
20500
20501
20502
20503
20505
20505
20506
20506
20507
20507
20507
20510
20510
20511
20511
20512
20514
20514
20517
20518


ClientOSError: [Errno 54] Connection reset by peer