## Day 5 - Soil maps etc. 

A map is of the form
XX YY ZZ

Where:
- XX is destination range start
- YY is source range start
- ZZ is range length

Such that a Source-to-Destination map of 50 98 2 corresponds to:
- Destination source = 50
- Source range start = 98
- Range length = 2
And this means that we have the mapping Source:Destination of {98:50, 99:51}

In [13]:
with open("./example.txt") as f:
    example_lines = []
    for line in f.readlines():
        example_lines.append(line.strip())

with open("./input.txt") as f:
    input_lines = []
    for line in f.readlines():
        input_lines.append(line.strip())

In [14]:
example_lines[:10]

['seeds: 79 14 55 13',
 '',
 'seed-to-soil map:',
 '50 98 2',
 '52 50 48',
 '',
 'soil-to-fertilizer map:',
 '0 15 37',
 '37 52 2',
 '39 0 15']

In [15]:
SEEDS = "seeds"
MAPS = "maps"
DESTINATION_START = "destination_start"
SOURCE_START = "source_start"
RANGE = "range"

def parse_input(lines: list[str]) -> dict:
    """
    Returns dictionary of the form:
    {
        "seeds": list[int],
        "maps": {
            "X-to-Y": [
                {
                    "destination_start": int,
                    "source_start": int,
                    "range": int,
                },
            ]
        }
    }
    """
    d = {}
    d[MAPS] = {}

    # Extract seeds
    seed_line = lines[0].split(": ")[-1]
    seeds = [int(seed) for seed in seed_line.split()]
    assert isinstance(seeds, list), isinstance(seeds[0], int)
    d[SEEDS] = seeds

    # Extract maps!
    parsing_map = False
    map_key = "INVALID"
    for line in lines[1:]:
        assert isinstance(line, str)
        if line == "":
            parsing_map = False
            map_key = "INVALID"
            continue
        if parsing_map:
            values = [int(val) for val in line.split()]
            assert len(values) == 3
            destination_start, source_start, map_range = values
            tmp_mapping_list = d[MAPS][map_key]
            assert isinstance(tmp_mapping_list, list)
            tmp_mapping_list.append(
                {
                    DESTINATION_START: destination_start,
                    SOURCE_START: source_start,
                    RANGE: map_range,
                }
            )
        if "map" in line:
            map_key = line.split()[0]
            assert map_key not in d[MAPS].keys()
            d[MAPS][map_key] = []
            parsing_map = True
    
    return d

initial_example_mapping = parse_input(example_lines)
initial_example_mapping.__str__()

"{'maps': {'seed-to-soil': [{'destination_start': 50, 'source_start': 98, 'range': 2}, {'destination_start': 52, 'source_start': 50, 'range': 48}], 'soil-to-fertilizer': [{'destination_start': 0, 'source_start': 15, 'range': 37}, {'destination_start': 37, 'source_start': 52, 'range': 2}, {'destination_start': 39, 'source_start': 0, 'range': 15}], 'fertilizer-to-water': [{'destination_start': 49, 'source_start': 53, 'range': 8}, {'destination_start': 0, 'source_start': 11, 'range': 42}, {'destination_start': 42, 'source_start': 0, 'range': 7}, {'destination_start': 57, 'source_start': 7, 'range': 4}], 'water-to-light': [{'destination_start': 88, 'source_start': 18, 'range': 7}, {'destination_start': 18, 'source_start': 25, 'range': 70}], 'light-to-temperature': [{'destination_start': 45, 'source_start': 77, 'range': 23}, {'destination_start': 81, 'source_start': 45, 'range': 19}, {'destination_start': 68, 'source_start': 64, 'range': 13}], 'temperature-to-humidity': [{'destination_start

In [16]:
test_on_input = parse_input(input_lines)  # Ok well that part is plenty quick

In [17]:
# source_start = 2642418175 
# destination_start = 2192252668
# map_range = 507721065

# new_map = dict(zip(
#     range(source_start, source_start+map_range),
#     range(destination_start, destination_start+map_range)
# ))

# Evidently this kind of approach is wayyyy too inefficient!

In [18]:
def extract_map_func(initial_mapping: dict[str, list[dict[str, int]]]) -> callable:
    """
    So we're taking a dict of the form:
    {
        "source-to-destination": [
            {
                "destination_start": int,
                "source_start": int,
                "range": int,
            },
            {
                "destination_start": int,
                "source_start": int,
                "range": int,
            },
            ...
        ], 
        ...
    }

    And converting to one of the form
    
    {
        "source-to-destination": dict[source: int, destination: int]
    } ***OLD***

    {
        "source-to-destination": func(val_to_map: int) -> int
    } NEW
    """
    # new_mapping_dict = {}

    # DESTINATION, SOURCE, RANGE tuples
    mapping_tuples_list = []

    for single_map in initial_mapping:
        assert isinstance(single_map, dict)
        destination_start = single_map[DESTINATION_START]
        source_start = single_map[SOURCE_START]
        map_range = single_map[RANGE]

        # new_map = {
        #     (source_start + i):(destination_start + i)
        #     for i in range(map_range)
        # }  
        # Above method took like a million years lol, left in for fun
        # new_map = dict(zip(
        #     range(source_start, source_start+map_range),
        #     range(destination_start, destination_start+map_range)
        # ))
        # Ok that also was way too slow....
        
        # lets try tuples..., and functions??
        mapping_tuples_list.append(
            (destination_start, source_start, map_range)
        )
    
    def f(value_to_map: int):
        # find out which tuple said input would be applicable to:
        for tuple_map in mapping_tuples_list:
            destination, source, range_len = tuple_map
            if source <= value_to_map <= source+range_len:
                diff = destination - source
                return value_to_map+diff

        # didn't find
        return value_to_map

    return f
    # return new_mapping_dict

func = extract_map_func(initial_example_mapping[MAPS]["seed-to-soil"])
assert func(79) == 81

In [19]:
# check it's not too slow on the input version!
extract_map_func(test_on_input[MAPS]["seed-to-soil"])  # way quicker lol

<function __main__.extract_map_func.<locals>.f(value_to_map: int)>

In [20]:
def get_map_key(source_name: str, map_names: list[str]) -> tuple[str, str]:
    """
    Takes in source name e.g. "seed", and the list of map_names
    Finds the map_name that corresponds in this example to "seed-to-X"
    returns this mapping name alongisde the destination string "X"
    """
    relevant_names = [name for name in map_names if source_name+"-to-" in name]
    assert len(relevant_names) == 1
    map_name = relevant_names[0]
    destination_name = relevant_names[0].split("-")[-1]

    return map_name, destination_name

**Part 1: return the lowest location number that corresponds to any of the initial seeds**

In [21]:
def part1(lines: list[str]):
    initial_mapping_dict = parse_input(lines)

    initial_seeds = initial_mapping_dict[SEEDS]

    maps = [map_name for map_name in initial_mapping_dict[MAPS].keys()]
    final_mapping_dict = {}
    for map_name in maps:
        final_mapping_dict[map_name] = extract_map_func(initial_mapping_dict[MAPS][map_name])
    
    lowest_value = 999999999999999999999999999999999999
    for seed in initial_seeds:
        source_name = "seed"
        location_found = False
        current_value = seed
        while not location_found:
            map_name, destination_name = get_map_key(source_name, maps)
            # If there's not a mapping, it maps to itself
            next_value = final_mapping_dict[map_name](current_value)
            if destination_name == "location":
                location_found = True
                lowest_value = min(lowest_value, next_value)
            else:
                source_name = destination_name
                current_value = next_value
    
    return lowest_value

assert part1(example_lines) == 35
%timeit part1(input_lines)

841 µs ± 19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


A nice improvement from first attempt with the huge dictionary maps lol

**Part 2: just more seeds to parse**

> update- regretting saying 'just'

In [29]:
import numpy as np
from joblib import Parallel, delayed

def part2_failed_attempt(lines: list[str]):
    initial_mapping_dict = parse_input(lines)

    
    maps = [map_name for map_name in initial_mapping_dict[MAPS].keys()]
    final_mapping_dict = {}
    for map_name in maps:
        final_mapping_dict[map_name] = extract_map_func(initial_mapping_dict[MAPS][map_name])

    def search_seeds(seed: int):
        source_name = "seed"
        location_found = False
        current_value = seed
        while not location_found:
            map_name, destination_name = get_map_key(source_name, maps)
            # If there's not a mapping, it maps to itself
            next_value = final_mapping_dict[map_name](current_value)
            if destination_name == "location":
                location_found = True
                return next_value
            else:
                source_name = destination_name
                current_value = next_value
    
    search_seeds_vectorised = np.vectorize(search_seeds, cache=True)

    initial_seeds_raw = initial_mapping_dict[SEEDS]
    starts_and_lengths = [(initial_seeds_raw[i], initial_seeds_raw[i+1]) for i in range(0, len(initial_seeds_raw), 2)]

    input_s_l_vectors = [
        np.arange(start, start+length) for start, length in starts_and_lengths
    ]

    output_list = Parallel(n_jobs=-1)(
        delayed(search_seeds_vectorised)(input_vector)
        for input_vector in input_s_l_vectors
    )
    # output_list = []
    # for input_vector in input_s_l_vectors:
    #     output_list.append(search_seeds_vectorised(input_vector))
    #     print("done a vector")

    outputs = np.concatenate(output_list)
    return outputs.min()

assert part2_failed_attempt(example_lines) == 46

In [31]:
# with open("./medium_example.txt") as f:
#     medium_lines = []
#     for line in f.readlines():
#         medium_lines.append(line.strip())

# part2_failed_attempt(medium_lines)

In [78]:
def part2(lines: list[str]) -> int:
    initial_mapping_dict = parse_input(lines)

    maps = [map_name for map_name in initial_mapping_dict[MAPS].keys()]
    final_mapping_dict = {}
    for map_name in maps:
        final_mapping_dict[map_name] = extract_map_func(initial_mapping_dict[MAPS][map_name])

    def search_seeds(seed: int):
        source_name = "seed"
        location_found = False
        current_value = seed
        while not location_found:
            map_name, destination_name = get_map_key(source_name, maps)
            # If there's not a mapping, it maps to itself
            next_value = final_mapping_dict[map_name](current_value)
            if destination_name == "location":
                location_found = True
                return next_value
            else:
                source_name = destination_name
                current_value = next_value
    
    search_seeds_vectorised = np.vectorize(search_seeds, cache=True)

    initial_seeds_raw = initial_mapping_dict[SEEDS]
    starts_and_lengths = [(initial_seeds_raw[i], initial_seeds_raw[i+1]) for i in range(0, len(initial_seeds_raw), 2)]

    raw_input_s_l_vectors = [
        np.arange(start, start+length) for start, length in starts_and_lengths
    ]
    # need to split the input_s_l_vectors into more manageable chunks?
    input_s_l_vectors = []
    for big_vector in raw_input_s_l_vectors:
        # If the big vector is of size 1,000,000,000
        # and we want all vectors to be of size 1,000,000
        # we split into big_vector.size // 1,000,000 = 10 vectors
        num_splits = max(1, big_vector.size // 100000.)
        input_s_l_vectors.extend(
            np.array_split(big_vector, num_splits)
        )

    output_list = Parallel(n_jobs=-1)(
        delayed(search_seeds_vectorised)(input_vector)
        for input_vector in input_s_l_vectors
    )
    # import time

    # output_list = []
    # print(len(input_s_l_vectors))
    # for vec in input_s_l_vectors:
        # t1 = time.time()
        # output_list.append(search_seeds_vectorised(vec))
        # t2 = time.time()
        # print(f"did a vec of size {vec.size} in {t2-t1}")

    outputs = np.concatenate(output_list)
    return outputs.min()

assert part2(example_lines) == 46

In [73]:
# with open("./medium_example.txt") as f:
#     medium_lines = []
#     for line in f.readlines():
#         medium_lines.append(line.strip())

# part2(medium_lines)

# Medium estimated to take 1656 * 6 seconds on one core so like 2.76 hours

Estimated that part2(input_lines) would take 30 hours on one core, so hoping on all cores it's more like 4 hours? Trying overnight lol

In [75]:
part2(input_lines)

OverflowError: Python int too large to convert to C long

RIP this attempt :(