# Extracting Images from Adidas PDF <br/>
**Libraries used:**
- **pdfplumber**: opens PDFs and allows them to be parsed page by page
- **re**: finds patterns in text to know where to find specific data (e.g. dates, model numbers, etc.)
- **glob**: allows for iterating through files ending in .pdf
- **csv**: used here to read the file that matches each manuf_model to its item_ids, and to write the generated queries
- **fitz**: extracting images
- **os**: for getting current directory and creating a new folder containing images

In [4]:
import pdfplumber
import re
import glob
import csv
import fitz
import os

In [5]:
# Captures a run of word characters (group 1) that is followed by four runs of
# digits, each preceded by at least one non-digit character. Group 1 is what
# extract_model_nums() records as the model number.
manuf_model_re = re.compile(r'(\w+)\D+\d+\D+\d+\D+\d+\D+\d+')

In [6]:
def extract_model_nums():
    """
    Returns: a list of all model numbers found in the Adidas PDFs, in the order
             they appear (files in glob order, then page by page, line by line)
    Assumptions: your PDFs are in the folder called "put_pdfs_here"

    A model number is looked for only on the line that immediately follows a
    line starting with 'Style' or 'Qty'; the first capture group of
    manuf_model_re on that line is recorded.
    """
    model_nums = []
    # True when the previous line was a 'Style'/'Qty' header, i.e. the current
    # line is the one expected to hold the model number. It is deliberately
    # not reset between pages or files, so a header that is the last line of
    # a page still applies to the first line of the next page.
    check_model_num = False

    for file in glob.iglob("put_pdfs_here/*pdf"):
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                # NOTE(review): depending on the pdfplumber version,
                # extract_text() may return None for a page without any
                # extractable text; treat that like an empty page instead of
                # crashing on None.split().
                text = page.extract_text() or ""
                for line in text.split("\n"):

                    if line.startswith(('Style', 'Qty')):
                        check_model_num = True

                    elif check_model_num:
                        model_num = manuf_model_re.search(line)
                        if model_num:
                            model_nums.append(model_num.group(1))
                        # Only the single line after the header is inspected.
                        check_model_num = False

    return model_nums

In [7]:
def get_item_ids():
    """
    Returns: a tuple (item_ids, item_id_l)
             - item_ids: dictionary keyed by the position of each model number
               in extract_model_nums(); each value is the list of item_ids
               whose manuf_model equals that model number
             - item_id_l: flat list of every matched item_id, in CSV row order
    Assumptions: - you've pulled all items from an org and saved it as "all_items.csv" in the directory of this file
                 - item_ids are in the first column and manuf_models in the second

    """
    model_nums = extract_model_nums()
    item_ids = {i: [] for i in range(len(model_nums))}
    item_id_l = []

    # Map each model number to every position it occupies in model_nums.
    # list.index() would only ever report the first position, leaving the
    # entries for repeated model numbers empty.
    positions = {}
    for i, model_num in enumerate(model_nums):
        positions.setdefault(model_num, []).append(i)

    # newline='' is what the csv module asks for so quoted fields containing
    # line breaks are parsed correctly.
    with open('all_items.csv', 'r', encoding='utf-8-sig', newline='') as file:
        for line in csv.reader(file):
            # Blank or truncated rows have no manuf_model column to compare.
            if len(line) < 2:
                continue
            for i in positions.get(line[1], ()):
                item_ids[i].append(line[0])
                item_id_l.append(line[0])

    return item_ids, item_id_l
    

In [9]:
# The actual image extraction
# get_item_ids() already walks every PDF in the folder, so its result does not
# depend on which PDF is currently open: look it up once, up front. This also
# guarantees item_id_l exists for the query export below when no PDF is found.
item_ids, item_id_l = get_item_ids()

# Directory that will hold the images. exist_ok lets the script handle several
# PDFs and be re-run without failing because the folder is already there.
path = os.path.join(os.getcwd(), "All Files")
os.makedirs(path, exist_ok=True)

# -1 means "logo not skipped yet"; afterwards it is the index into item_ids of
# the image currently being processed.
# NOTE(review): the counter is never reset, so only the very first image of
# the first PDF is skipped as a logo - confirm this is intended when more than
# one PDF is processed.
image_num = -1
for file in glob.iglob("put_pdfs_here/*pdf"):
    doc = fitz.open(file)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):

                # this step is needed because the first image is the Adidas logo, which we don't want
                if image_num == -1:
                    image_num += 1
                    continue

                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # An image beyond the last known model number has no item ids;
                # skip writing instead of raising KeyError.
                # NOTE(review): writePNG emits PNG data even though the file
                # names end in .jpg - names kept as-is because the generated
                # queries reference them.
                for item_id in item_ids.get(image_num, []):
                    pix.writePNG(f"{path}/I{item_id}.jpg")
                    pix.writePNG(f"{path}/I{item_id}Thumb.jpg")
                pix = None

                image_num += 1
    finally:
        # Release the PDF handle even if an image fails to extract.
        doc.close()

# Outputting item_ids into a csv to generate queries
# NOTE(review): each item_id is interpolated straight into the SQL text; this
# is only safe as long as all_items.csv is a trusted export.
with open('queries.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for item_id in item_id_l:
        writer.writerow([f"insert into item_avatar(item_id, avatar_link, avatar_thumbnail_link, last_modified, last_modified_by) select i.item_id, 'https://acsequip.acsathletics.com/Images/ItemAvatar/I' + cast(i.item_id as varchar) + '.jpg', 'https://acsequip.acsathletics.com/Images/ItemAvatar/I' + cast(i.item_id as varchar) + 'Thumb.jpg', dateadd(hh,o.time_diff,getDate()), e.employee_id from item i inner join org o on i.org_id = o.org_id inner join employee e on e.org_id = o.org_id left join ITEM_AVATAR ia on ia.item_id = i.item_id where i.item_id = {item_id} and ia.item_id is null and e.logon_id = 'acs' + cast(o.org_id as varchar)"])


{0: ['470718'], 1: []} ['470718']
