# 0. Initialization

## 0.1 Imports

In [2]:
import http.client
import os
import urllib
import urllib.parse
import urllib.request
import uuid
from collections import namedtuple

import pandas as pd
from bs4 import BeautifulSoup
from warcio.archiveiterator import ArchiveIterator

## 0.2 Constants

In [3]:
# WARC archive that is passed to process_warc() below.
INP_PATH = "data/segments/CC-MAIN-20210723143921-20210723173921-00258.warc.gz"
# Default directory that download() writes image files into.
OUT_PATH = "data/img"
# Default destination file used by export_csv().
CSV_PATH = "data/csv/img.csv"

In [4]:
# Segment identifier stored as 'warc_segment_id' in every image row built by process_warc().
SEGMENT_ID = "1627046149929.88"

## 0.3 Utility Functions

In [5]:
def img_exists(url, timeout=10):
    """Check whether ``url`` can be fetched and answers with HTTP 200.

    Args:
        url: Address of the image to probe.
        timeout: Seconds to wait for the server before giving up.
    Returns:
        True if the request succeeds with status code 200, False otherwise
        (malformed or relative URLs, network errors, HTTP error statuses).
    """
    try:
        # The context manager closes the connection once the status is read.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.getcode() == 200
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and socket timeouts; ValueError is
        # raised for malformed or relative URLs. Anything else (notably
        # KeyboardInterrupt) is allowed to propagate to the caller.
        return False

In [6]:
def download(url:str, uuid:str, outdir:str=OUT_PATH):
    """Download ``url`` into ``outdir`` and name the file after ``uuid``.

    Args:
        url: Address of the image to fetch.
        uuid: Identifier used as the file name (without extension).
        outdir: Directory the file is written to; created if it is missing.
    Returns:
        path: Location of the downloaded file, ``<outdir>/<uuid><ext>`` where
            ``<ext>`` is the extension of the URL path (empty if it has none).
    Raises:
        urllib.error.URLError, OSError, ValueError: If the URL cannot be
            fetched or the file cannot be written.
    """
    # Only the path component is inspected, so dots in the host name, query
    # string or fragment are never mistaken for a file extension.
    url_path = urllib.parse.urlsplit(url).path
    ext = os.path.splitext(url_path)[1]  # '' or e.g. '.jpg'

    os.makedirs(outdir, exist_ok=True)
    # '/' is kept as the separator so returned paths look the same as before.
    path = outdir + '/' + uuid + ext
    urllib.request.urlretrieve(url, path)

    return path

In [7]:
def export_csv(l, out_path=CSV_PATH):
    """Write the records in ``l`` to a CSV file.

    Args:
        l: Data accepted by ``pandas.DataFrame`` (here: a list of row dicts).
        out_path: Destination file; the row index is not written.
    """
    pd.DataFrame(l).to_csv(out_path, index=False)

In [8]:
def extract_text(parent):
    """Collect the text of every element nested below ``parent``.

    Elements are visited depth-first: each direct child contributes its own
    ``get_text()`` result, followed by the results for its descendants.

    Args:
        parent: Element exposing ``find_all(recursive=False)`` and whose
            children expose ``get_text()`` (e.g. a BeautifulSoup tag).
    Returns:
        text: List of strings, one per descendant element; empty when
            ``parent`` has no child elements.
    """
    text = []
    for tag in parent.find_all(recursive=False):
        text.append(tag.get_text())
        text.extend(extract_text(tag))
    # Returning the list is required both for callers and for the recursive
    # ``extend`` above.
    return text

# 1. Parsing

In [15]:
def parse(content:str):
    """Extract image src-alt pairs together with the text of the parent element.

    Only ``<img>`` tags that carry both a ``src`` and an ``alt`` attribute are
    considered.

    Args:
        content: String containing a page
    Returns:
        imgs: A list of dictionary objects with the format:
            {"src": "http://sample.url/pathtoimg.jpg", "alt": "Sample description",
             "par": "Text of the element enclosing the image"}
            If the page cannot be processed, the images collected so far
            (possibly none) are returned.
    """
    imgs = []
    try:
        soup = BeautifulSoup(content, 'html.parser')
        for tag in soup.find_all('img', src=True, alt=True):
            parent = tag.parent
            # Fall back to an empty string when there is no enclosing element.
            par = parent.get_text() if parent is not None else ''
            imgs.append({'src': tag['src'], 'alt': tag['alt'], 'par': par})
    except Exception:
        # Best-effort extraction: a page that cannot be parsed yields whatever
        # was collected so far. KeyboardInterrupt is no longer swallowed.
        pass
    return imgs

# 2. Process WARC

For each record
    parse
    for each image
        process url
            check if url exists
            generate uuid
            download image
        append to all_imgs

In [10]:
def process_warc(warc_path:str, warc_segment_id:str, imgs:list=None, uuids:dict=None, limit=-1):
    """Download the images referenced by the pages of a WARC file.

    Every record is parsed with ``parse``; each image whose URL passes
    ``img_exists`` is downloaded with ``download`` and described by one row.

    Args:
        warc_path: Path of the WARC file to read.
        warc_segment_id: Segment identifier stored in every row.
        imgs: Existing list of rows to extend in place; a new list is created
            when omitted.
        uuids: Existing mapping of generated UUIDs to extend in place; a new
            dict is created when omitted.
        limit: Stop as soon as ``imgs`` holds this many rows. A negative value
            (the default) means no limit.
    Returns:
        imgs: List of row dicts with the keys 'img_uuid', 'img_url',
            'img_path', 'img_caption', 'img_par', 'warc_segment_id',
            'warc_path' and 'warc_url'.
        uuids: Dict mapping each generated ``uuid.UUID`` to True.
    """
    # Fresh containers per call: mutable defaults would be shared between calls.
    imgs = [] if imgs is None else imgs
    uuids = {} if uuids is None else uuids

    def limit_reached():
        return 0 <= limit <= len(imgs)

    with open(warc_path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if limit_reached():
                break

            try:
                # Parse page
                warc_url = record.rec_headers.get_header('WARC-Target-URI')
                warc_imgs = parse(record.content_stream().read().decode("utf-8"))
            except Exception as err:
                # Best-effort crawl: an unreadable record (e.g. not UTF-8) is
                # reported and skipped instead of aborting the run.
                print('[SKIPPED] ' + str(err))
                continue

            for img in warc_imgs:
                if limit_reached():
                    break

                try:
                    if not img_exists(img['src']):
                        print('[REJECTED] ' + img['src'])
                        continue
                    print('[ACCEPTED] ' + img['src'])

                    # Generate UUID
                    img_uuid = uuid.uuid4()

                    # Download image
                    img_path = download(img['src'], str(img_uuid))
                except Exception as err:
                    # A failed download only drops this image, not the rest
                    # of the record.
                    print('[FAILED] ' + img['src'] + ' (' + str(err) + ')')
                    continue

                # Append to imgs
                imgs.append({
                    'img_uuid': str(img_uuid),
                    'img_url': img['src'],
                    'img_path': img_path,
                    'img_caption': img['alt'],
                    'img_par': img['par'],
                    'warc_segment_id': warc_segment_id,
                    'warc_path': warc_path,
                    'warc_url': warc_url,
                })
                uuids[img_uuid] = True
    return imgs, uuids

In [11]:
# Process the configured WARC file; limit=10 ends the run early once enough images were collected.
imgs, uuids = process_warc(INP_PATH, SEGMENT_ID, limit=10)

KeyboardInterrupt: 

In [32]:
# Write the collected image rows to export_csv's default location (CSV_PATH).
export_csv(imgs)