# Assignment 2 - Preliminary Code
 **Subject** : Monopolistic behaviour assessment on food items in Californian prisons.

 **Authors** : Louise Gatty, Alice Pétillon, Charles Pyle, Antonio Raphael, Anne Thébaud

## 1. Walkenhorst Family Visit Catalogue Scraping
The following R code (copy it into R to run) reads the [Walkenhorst family visit PDF catalogue](https://assets.walkenhorsts.com/catalog_item/files/27/original/2025.pdf?1735933541) and scrapes it into a dataset of catalogue items and their prices.

In [None]:
library(tidyverse) # basic functions
library(conflicted) # force conflicts to errors
library(pdftools) # read in pdf documents

# Load the catalogue text; the result is indexed by page further down.
pdf <- pdf_text(
  "~/Documents/PROJECT-CLONES/Data-Storage/Walkenhorst/family visit catalogue.pdf"
)

# The first two pages hold no item information, so they are discarded.
pdf <- pdf[-seq_len(2)]

# Parse one catalogue page into a table of items.
#
# Every text line of a page may hold two entries side by side (left-hand and
# right-hand column). Each line is cut at a break point, the two halves are
# stacked (left column first, then right column), and the item number, kosher
# flag, price and weight are extracted from each entry. Lines without an item
# number are treated as category headings and filled down onto the items
# that follow them.
#
# Args:
#   page_number: index of the page to parse within `pdf_pages`.
#   pdf_pages:   character vector with one element per page. Defaults to the
#                `pdf` object visible from where this function is defined, so
#                existing calls such as lapply(pages, PDF_Scraper) still work.
#
# Returns:
#   A data frame with the columns Category, item, item_number, kosher, price
#   and weight; one row per item found on the page.
PDF_Scraper <- function(page_number, pdf_pages = pdf) {

  item_code <- "\\b\\d{4,5}-\\d{3}\\b" # item number pattern
  price_tag <- "\\$\\d{1,2}\\.\\d{2}" # price pattern

  page <- pdf_pages[[page_number]] # index by page number

  lines <- str_split(page, "\n")[[1]] # split page on line breaks

  lines <- lines[-c(1, 2, 67, 68)] # drop header and footer lines

  # Locate the break between the left-hand and right-hand entry of each line.
  # - break_code: first item-number position at or beyond character 10, i.e.
  #   the start of a second item on the line (a match before position 10 is
  #   the line's own leading item number and is ignored).
  # - break_dollar: one past the end of the last price on the line, used when
  #   there is no second item number.
  # map_dbl() keeps the positions numeric; the previous map_chr() +
  # as.numeric() round trip relied on a coercion deprecated in purrr 1.0.
  # NOTE(review): a line with neither a second item number nor a price gets an
  # NA break point, so both halves become NA and the line is dropped below.
  # Confirm that no category heading sits alone on such a line.
  lines_df <- data.frame(V1 = lines) |>
    mutate(V1 = trimws(V1, which = "left", whitespace = " ")) |>
    mutate(
      code_pos = str_locate_all(V1, item_code),
      price_pos = str_locate_all(V1, price_tag)
    ) |>
    mutate(
      break_code = map_dbl(code_pos, \(m) {
        later <- sort(as.numeric(m)[as.numeric(m) >= 10])
        if (length(later) == 0) NA_real_ else later[1]
      }),
      break_dollar = map_dbl(price_pos, \(m) {
        if (length(m) == 0) NA_real_ else m[length(m)] + 1
      })
    ) |>
    mutate(break_point = coalesce(break_code, break_dollar)) |>
    mutate(
      first = str_sub(V1, 1, break_point - 1), # left-hand entry
      second = str_sub(V1, break_point, str_length(V1)) # right-hand entry
    ) |>
    select(first, second)

  # One entry per row: the whole left-hand column, then the right-hand column.
  Full_Data <- data.frame(item = c(lines_df$first, lines_df$second))

  Full_Data <- Full_Data |>
    mutate(item = trimws(item, which = "left", whitespace = " ")) |>
    mutate(item_detect = str_detect(item, item_code)) |> # item vs. category
    mutate(Category = case_when(item_detect == FALSE ~ item,
                                TRUE ~ NA_character_)) |> # category headings
    select(Category, item, item_detect) |>
    fill(Category, .direction = "down") |> # carry heading onto its items
    dplyr::filter(item_detect == TRUE) |> # keep only rows that are items
    select(-c(item_detect)) |>
    mutate(item_number = str_extract(item, item_code),
           item = str_remove(item, item_code)) |> # move item number out
    mutate(kosher = str_detect(item, "ⓚ"), # kosher symbol present?
           item = str_remove(item, "ⓚ")) |>
    mutate(price = str_extract(item, price_tag),
           item = str_remove(item, price_tag)) |> # move price out
    mutate(weight = coalesce(
      str_extract(item, "\\d{1,3}\\.?(?:[^a-zA-Z]*?)oz\\."), # "... oz."
      str_extract(item, "\\d{1,3}\\.?(?:[^a-zA-Z]*?)oz") # "... oz"
    )) |>
    mutate(item = trimws(item, which = "both", whitespace = " ")) |>
    mutate(item = trimws(item, which = "both", whitespace = "[\t\r\n]"))

  Full_Data
}

pages <- 1:31 # pages 1-31 of the trimmed catalogue are scraped

# Scrape every page and stack the per-page tables in page order.
Full_Data <- lapply(pages, PDF_Scraper) |>
  bind_rows()

# Drop helper objects by exact name. The previous regex pattern was
# unanchored, so its "i" alternative removed every object whose name contains
# the letter "i".
rm(pages, pdf, PDF_Scraper)


Full_Data <- Full_Data |>
  mutate(Category = trimws(Category, which = "both", whitespace = "[\t\r\n]"),
         Category = trimws(Category, which = "both", whitespace = " ")) |> # clean category labels
  fill(Category, .direction = "down") # categories can span multiple pages


# Normalise the weight column to "<number> oz.".
Full_Data <- Full_Data |>
  mutate(weight = recode(weight,
                         "8.7” 12 oz." = "12 oz.")) |> # a size was captured with the weight
  mutate(weight = str_remove_all(weight, "oz\\.?"), # strip the unit, with or without dot
         weight = trimws(weight, which = "right", whitespace = " "),
         weight = recode(weight,
                         "8.8." = "8.8",
                         "5 ."  = "5",
                         "2 5/8" = "2.625"), # hand-corrected catalogue entries
         weight = as.numeric(weight)) |>
  # Keep missing weights as NA instead of the literal string "NA  oz.", and
  # use a single space before the unit.
  mutate(weight = if_else(is.na(weight),
                          NA_character_,
                          paste0(weight, " oz.")))

## 2. Webscraping of Walmart items

The following Python code scrapes the [Walmart website](https://www.walmart.com/search?q=crackers&page=2&affinityOverride=store_led) in California and collects all the items for one product category: "crackers". It uses asynchronous programming to send multiple requests at once.

For now, we have not been able to scrape all the categories that appear in the PDF catalogue because of the strong anti-bot detection on Walmart's website, which potentially means we will need to use a VPN for the rest of the web scraping.

In [None]:
# Library imports

import asyncio #allows multiple requests at once
import json
import math
import httpx # makes the web requests
import nest_asyncio
import os
from urllib.parse import urlencode
from typing import List, Dict
from loguru import logger as log
from parsel import Selector #used to read HTML of the page and find the specific data
from pathlib import Path

# Fill these with the cookies and headers produced by a curl converter from
# the request's cURL command. Both are passed to the httpx.AsyncClient
# created in main().
cookies = {} 
headers = {} 

nest_asyncio.apply() # allows asyncio.run() to be used inside a Jupyter notebook

def parse_search(html_text:str) -> tuple[list[dict], int]:
    """Extract the search results from the HTML of a Walmart search page.

    The page embeds its data as JSON inside the ``<script id="__NEXT_DATA__">``
    tag; the items and the item count are read from the first entry of
    ``props.pageProps.initialData.searchResult.itemStacks``.

    Args:
        html_text: HTML text of the search response.

    Returns:
        A ``(results, total_results)`` tuple: the ``items`` list and the
        ``count`` value of the first item stack.

    Raises:
        ValueError: if the page contains no ``__NEXT_DATA__`` script, or if
            the script content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError, IndexError: if the JSON does not have the expected layout.
    """
    sel = Selector(text=html_text)
    raw = sel.xpath('//script[@id="__NEXT_DATA__"]/text()').get() # script tag with JSON data
    if raw is None:
        # Without this check json.loads(None) fails with an unhelpful TypeError.
        raise ValueError("no __NEXT_DATA__ script found in the search response")
    data = json.loads(raw)
    first_stack = data["props"]["pageProps"]["initialData"]["searchResult"]["itemStacks"][0]
    return first_stack["items"], first_stack["count"]

async def scrape_walmart_page(session:httpx.AsyncClient, query:str="crackers", page=1, max_attempts:int=5):
    """Request a single Walmart search page, retrying with exponential backoff.

    Args:
        session: client used to send the GET request.
        query: search keyword.
        page: number of the search page to request.
        max_attempts: how many times the request is tried before giving up.

    Returns:
        The first response whose status code is 200.

    Raises:
        Exception: if none of the attempts returned status code 200.
    """
    url = "https://www.walmart.com/search?" + urlencode({"q": query,"page": page,"facet": "fulfillment_method:Pickup","affinityOverride": "default",},)

    for attempt in range(max_attempts):
        resp = await session.get(url)
        if resp.status_code == 200:
            return resp
        # Wait 1s, 2s, 4s, ... between attempts; there is nothing to wait for
        # once the last attempt has failed.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2 ** attempt)
    raise Exception(f"Blocked: {url}")

async def scrape_search(search_query:str, session:httpx.AsyncClient, max_scrape_pages:int=None) -> List[Dict]:
    """Scrape all search pages for a keyword and return the combined items.

    The first page is fetched on its own to learn the total item count; the
    page count is derived from it (dividing by 40 items per page), capped at
    25 and, when given, at ``max_scrape_pages``. The remaining pages are then
    requested concurrently.

    Args:
        search_query: keyword to search for.
        session: client handed to ``scrape_walmart_page``.
        max_scrape_pages: optional upper bound on the number of pages scraped.

    Returns:
        The items of every scraped page, in page order.
    """
    log.info(f"scraping Walmart search for the keyword {search_query}")
    first_response = await scrape_walmart_page(query=search_query, session=session)
    results, total_items = parse_search(first_response.text)

    # 25 is the cap the original authors attribute to Walmart's search results.
    last_page = min(math.ceil(total_items / 40), 25)
    if max_scrape_pages and max_scrape_pages < last_page:
        last_page = max_scrape_pages

    log.info(f"scraped the first search, remaining ({last_page-1}) more pages")

    # Build one request per remaining page and run them all at the same time.
    pending = [
        scrape_walmart_page(query=search_query, page=page_no, session=session)
        for page_no in range(2, last_page + 1)
    ]
    responses = await asyncio.gather(*pending)
    for response in responses:
        page_items, _ = parse_search(response.text)
        results.extend(page_items)

    log.success(f"scraped {len(results)} products from walmart search")
    return results

async def main():
    """Run the "crackers" search and save the scraped items as JSON.

    The export directory is created if needed, up to 25 search pages are
    scraped through a single ``httpx.AsyncClient`` configured with the
    module-level ``headers`` and ``cookies``, and the items are written to
    ``walmart_crackers.json`` inside the export directory.
    """
    # Placeholder path: replace with the directory the data should go to.
    target_dir = Path(" To specify ")
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / "walmart_crackers.json"

    # One client for the whole run so the connection and cookies are reused.
    async with httpx.AsyncClient(headers=headers, cookies=cookies, timeout=30) as session:
        products = await scrape_search(
            search_query="crackers",
            session=session,
            max_scrape_pages=25,
        )

    # Serialise everything to the JSON file.
    serialised = json.dumps(products, ensure_ascii=False, indent=2)
    target_file.write_text(serialised, encoding="utf-8")

    print(f"Saved {len(products)} products to {target_file}")

# Run the scraper only when this file is executed directly, not when it is
# imported; asyncio.run() drives the main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())

Because some of us were blocked by anti-bot measures, we also tried other libraries:
- Thinking that standard httpx might be too "obvious" for Walmart, some of us used **curl_cffi** instead of httpx. It is a library specifically designed to impersonate the TLS fingerprint of a real browser.
- We also used **Selenium** to mimic human behaviour.

We might need to use these when continuing the code on multiple queries.