In [15]:
import pandas as pd

In [16]:

# pd.read_csv could not cope with the blank-line ("\n\n") record separator,
# so the file is read in one go with the built-in open() instead.
with open('../inputs/advent4.txt') as handle:
    data = handle.read()

# Passport records are separated from each other by an empty line.
passport_lines = data.split('\n\n')

# A single record may wrap over several lines; flatten each onto one line.
passport_lines_clean = []
for record in passport_lines:
    passport_lines_clean.append(record.replace('\n', ' '))

# One row per passport, with the untouched record text in the 'raw' column.
df = pd.DataFrame({'raw': passport_lines_clean})

df

Unnamed: 0,raw
0,byr:1937 eyr:2030 pid:154364481 hgt:158cm iyr:...
1,cid:279 eyr:2029 pid:675014709 ecl:amb byr:198...
2,iyr:2011 hgt:181cm hcl:#341e13 pid:282499883 b...
3,eyr:2040 iyr:1984 pid:2371396209 byr:1951 cid:...
4,iyr:2014 byr:1966 hgt:153cm pid:900693718 eyr:...
...,...
260,pid:849044092 eyr:2020 hgt:186cm iyr:2014 byr:...
261,hgt:159cm iyr:1950 eyr:2021 pid:325442644 hcl:...
262,eyr:2023 hgt:188cm iyr:2014 pid:945115479 byr:...
263,eyr:2024 cid:274 pid:390115952 byr:1934 hgt:16...


In [17]:
# Creating a column for each of the potential variables
def detect_pattern(df, col, pattern):
    """Extract the value stored under a ``key:value`` field of a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to search.
    col : str
        Name of the string column of ``df`` to search.
    pattern : str
        Field key to look for (the text in front of the colon). It is
        inserted into the regular expression as-is.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) on the same index as ``df``
        holding the run of non-whitespace characters that follows
        ``pattern:``; NaN where the key is not found.
    """
    # Raw strings so that \s and \S reach the regex engine untouched (in a
    # normal string they are invalid escape sequences). The key has to sit at
    # the start of the text or directly after whitespace, so it is never
    # picked up from the tail of a longer token.
    regex = r"(?:^|\s)" + pattern + r":(\S+)"
    return df[col].str.extract(regex)


In [18]:

# Keys of the fields that can occur in a passport record.
patterns = ['ecl','pid','eyr','hcl','byr','iyr','cid','hgt']

# Add one column per key, holding the value found in the 'raw' text.
for field in patterns:
    df[field] = detect_pattern(df, 'raw', field)

df
    

Unnamed: 0,raw,ecl,pid,eyr,hcl,byr,iyr,cid,hgt
0,byr:1937 eyr:2030 pid:154364481 hgt:158cm iyr:...,brn,154364481,2030,#c0946f,1937,2015,155,158cm
1,cid:279 eyr:2029 pid:675014709 ecl:amb byr:198...,amb,675014709,2029,z,1985,2025,279,179in
2,iyr:2011 hgt:181cm hcl:#341e13 pid:282499883 b...,brn,282499883,2023,#341e13,1953,2011,,181cm
3,eyr:2040 iyr:1984 pid:2371396209 byr:1951 cid:...,,2371396209,2040,#623a2f,1951,1984,283,164cm
4,iyr:2014 byr:1966 hgt:153cm pid:900693718 eyr:...,gry,900693718,2020,#866857,1966,2014,,153cm
...,...,...,...,...,...,...,...,...,...
260,pid:849044092 eyr:2020 hgt:186cm iyr:2014 byr:...,,849044092,2020,#866857,1991,2014,,186cm
261,hgt:159cm iyr:1950 eyr:2021 pid:325442644 hcl:...,gry,325442644,2021,#888785,1986,1950,,159cm
262,eyr:2023 hgt:188cm iyr:2014 pid:945115479 byr:...,blu,945115479,2023,#b6652a,1979,2014,,188cm
263,eyr:2024 cid:274 pid:390115952 byr:1934 hgt:16...,,390115952,2024,#b95b0d,1934,2017,274,161cm


In [19]:
# Mark which cells hold a value. Despite the variable's name, True here means
# the value is PRESENT (not missing).
missings = df.notnull()

# Count the fields present in mandatory columns ('cid' is not counted)
valid_cols = ['ecl', 'pid', 'eyr', 'hcl', 'byr', 'iyr', 'hgt']
df['valid_cols'] = missings[valid_cols].sum(axis=1)

# A passport is valid when every mandatory field is present. Compared as a
# whole column (no row-wise apply) and against the list length, so the
# threshold cannot drift away from the list above.
df['is_valid'] = df['valid_cols'] == len(valid_cols)
df

Unnamed: 0,raw,ecl,pid,eyr,hcl,byr,iyr,cid,hgt,valid_cols,is_valid
0,byr:1937 eyr:2030 pid:154364481 hgt:158cm iyr:...,brn,154364481,2030,#c0946f,1937,2015,155,158cm,7,True
1,cid:279 eyr:2029 pid:675014709 ecl:amb byr:198...,amb,675014709,2029,z,1985,2025,279,179in,7,True
2,iyr:2011 hgt:181cm hcl:#341e13 pid:282499883 b...,brn,282499883,2023,#341e13,1953,2011,,181cm,7,True
3,eyr:2040 iyr:1984 pid:2371396209 byr:1951 cid:...,,2371396209,2040,#623a2f,1951,1984,283,164cm,6,False
4,iyr:2014 byr:1966 hgt:153cm pid:900693718 eyr:...,gry,900693718,2020,#866857,1966,2014,,153cm,7,True
...,...,...,...,...,...,...,...,...,...,...,...
260,pid:849044092 eyr:2020 hgt:186cm iyr:2014 byr:...,,849044092,2020,#866857,1991,2014,,186cm,6,False
261,hgt:159cm iyr:1950 eyr:2021 pid:325442644 hcl:...,gry,325442644,2021,#888785,1986,1950,,159cm,7,True
262,eyr:2023 hgt:188cm iyr:2014 pid:945115479 byr:...,blu,945115479,2023,#b6652a,1979,2014,,188cm,7,True
263,eyr:2024 cid:274 pid:390115952 byr:1934 hgt:16...,,390115952,2024,#b95b0d,1934,2017,274,161cm,6,False


In [20]:
# Part A Answer: number of passports flagged is_valid
# (summing the boolean column counts its True entries)
df['is_valid'].sum()

200

In [21]:
# Part B - adding the more complex rules to the passport detection

# For HGT, split the value into its number (2 to 3 digits) and its unit
# (in or cm). The field has to start the record or follow whitespace, and
# nothing may trail the unit; heights that do not have exactly this shape
# get NaN in both columns.
hgt_values = df['raw'].str.extract(
    r'(?:^|\s)hgt:(?P<hgt_value>\d{2,3})(?P<hgt_system>in|cm)(?!\S)'
)

# Assign by column name instead of concatenating, so that re-running this
# cell replaces the two columns rather than appending duplicates of them.
df = df.assign(
    hgt_value=hgt_values['hgt_value'],
    hgt_system=hgt_values['hgt_system'],
)

In [37]:
# Cast the number-like fields from strings to numeric dtypes; any value that
# cannot be parsed as a number becomes NaN.
numeric_fields = ['pid','eyr','byr','iyr','cid','hgt_value']
df[numeric_fields] = df[numeric_fields].apply(pd.to_numeric, errors='coerce')

# Year rules, both bounds inclusive: each field gets a matching valid_<field> flag.
year_limits = {'byr': (1920, 2002), 'iyr': (2010, 2020), 'eyr': (2020, 2030)}
for year_field, (earliest, latest) in year_limits.items():
    df['valid_' + year_field] = df[year_field].between(earliest, latest)

# (Height) - a number followed by either cm or in.
def get_hgt_validity(df):
    """Tell whether one row's height lies inside the range allowed for its unit.

    ``df`` is a single row (anything indexable by 'hgt_system' and
    'hgt_value'), as handed over by ``DataFrame.apply(..., axis=1)``.

    Allowed ranges, bounds inclusive:
      * cm: 150 to 193
      * in: 59 to 76

    Returns True when the height is in range, and False otherwise, which
    includes any unit other than 'cm' or 'in'.
    """
    unit = df['hgt_system']
    if unit == 'cm':
        lowest, highest = 150, 193
    elif unit == 'in':
        lowest, highest = 59, 76
    else:
        # Neither of the two known units, so no range to test against.
        return False
    return bool(lowest <= df['hgt_value'] <= highest)
                                      
df['valid_hgt'] = df.apply(get_hgt_validity, axis=1)

# The remaining rules are checked directly against the raw record text. Every
# pattern requires the key to start the record or follow whitespace, and the
# value to be followed by whitespace or the end of the record ((?!\S)), so a
# value with extra trailing characters is rejected instead of matching on
# its prefix.

# hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.
df['valid_hcl'] = df['raw'].str.contains(r'(?:^|\s)hcl:#[0-9a-f]{6}(?!\S)', regex = True)

# ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.
df['valid_ecl'] = df['raw'].str.contains(r'(?:^|\s)ecl:(?:amb|blu|brn|gry|grn|hzl|oth)(?!\S)', regex = True)

# pid - a nine-digit number, including leading zeroes
df['valid_pid'] = df['raw'].str.contains(r'(?:^|\s)pid:\d{9}(?!\S)', regex = True)

valid_cols = ['valid_ecl', 'valid_pid', 'valid_eyr', 'valid_hcl', 'valid_byr', 'valid_iyr', 'valid_hgt']

# Number of rules each passport satisfies; it passes part B only when it
# satisfies all of them. Compared column-wise against the list length so the
# threshold always matches the list above.
df['valid_cols'] = df[valid_cols].sum(axis=1)
df['is_valid_b'] = df['valid_cols'] == len(valid_cols)

# Show floats without decimals. NOTE: this changes the notebook-wide pandas
# display option, not just this cell's output.
pd.options.display.float_format = '{:20.0f}'.format
df[['pid' ,'valid_pid']]

Unnamed: 0,pid,valid_pid
0,154364481,True
1,675014709,True
2,282499883,True
3,2371396209,False
4,900693718,True
...,...,...
260,849044092,True
261,325442644,True
262,945115479,True
263,390115952,True


In [38]:
# Part B Answer: count of rows where is_valid_b is True
df['is_valid_b'].sum()


116