Skip to content

Commit

Permalink
Amend the dash tokenizer to be robust to irregular spacing around dashes
Browse files Browse the repository at this point in the history
  • Loading branch information
alvinwan committed Aug 27, 2018
1 parent c57c41c commit 0e87563
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 24 deletions.
4 changes: 3 additions & 1 deletion tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def test_choices(now):
(datetime.datetime(2018, 7, 17, 16, 0), datetime.datetime(2018, 7, 17, 17, 0)),
(datetime.datetime(2018, 7, 17, 17, 0), datetime.datetime(2018, 7, 17, 18, 0))
]
print(timefhuman('7/17 4-5 or 5-6 PM', raw=True))
assert timefhuman('7/17 4-5 or 5-6 PM') == [
(datetime.datetime(2018, 7, 17, 16, 0), datetime.datetime(2018, 7, 17, 17, 0)),
(datetime.datetime(2018, 7, 17, 17, 0), datetime.datetime(2018, 7, 17, 18, 0))
Expand All @@ -53,3 +52,6 @@ def test_edge_cases_range(now):
assert timefhuman('7/17-7/18', now) == (
datetime.datetime(2018, 7, 17, 0, 0),
datetime.datetime(2018, 7, 18, 0, 0),)
assert timefhuman('7/17 3 pm- 7/19 2 pm') == (
datetime.datetime(2018, 7, 17, 15, 0),
datetime.datetime(2018, 7, 19, 14, 0),)
14 changes: 5 additions & 9 deletions timefhuman/categorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def categorize(tokens, now):
[8/6/2018, 'or', 8/7/2018, 12 pm]
>>> categorize(['7/17', '4', 'or', '5', 'PM'], now)
[7/17/2018, 4:00, 'or', 5 pm]
>>> categorize(['7/17', '3', 'pm', '-', '7/19', '2', 'pm'], now)
[7/17/2018, 3 pm, '-', 7/19/2018, 2 pm]
"""
tokens = list(tokens)
tokens = convert_day_of_week(tokens, now)
Expand Down Expand Up @@ -224,18 +226,17 @@ def maybe_substitute_using_date(tokens, now=datetime.datetime.now()):
(month).(day).(year)
(month)-(day)-(year)
>>> now = datetime.datetime(2018, 8, 18)
>>> maybe_substitute_using_date(['7/17/18'])
[7/17/2018]
>>> maybe_substitute_using_date(['7-17-18'])
[7/17/2018]
>>> maybe_substitute_using_date(['3', 'on', '7.17.18'])
['3', 'on', 7/17/2018]
>>> maybe_substitute_using_date(['7-25', '3-4', 'pm'])
>>> maybe_substitute_using_date(['7-25', '3-4', 'pm'], now=now)
[7/25/2018, 3/4/2018 OR 3:00 - 4:00, 'pm']
>>> maybe_substitute_using_date(['7/4', '-', '7/6'])
>>> maybe_substitute_using_date(['7/4', '-', '7/6'], now=now)
[7/4/2018, '-', 7/6/2018]
>>> maybe_substitute_using_date(['7/17-7/18'])
[7/17/2018, '-', 7/18/2018]
"""
i = 0
while i < len(tokens):
Expand All @@ -249,11 +250,6 @@ def maybe_substitute_using_date(tokens, now=datetime.datetime.now()):
if punctuation not in token:
continue

if '-' in token and '-' != punctuation:
parts = token.split('-')
tokens = tokens[:i] + [parts[0], '-', parts[1]] + tokens[i+1:]
i -= 1
break
parts = tuple(map(int, token.split(punctuation)))
if len(parts) == 2:
day = DayToken(month=parts[0], day=parts[1], year=now.year)
Expand Down
18 changes: 18 additions & 0 deletions timefhuman/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,24 @@ def __init__(self, year, month, day, relative_hour, minute=0, time_of_day='am'):
self.day = DayToken(month, day, year)
self.time = TimeToken(relative_hour, time_of_day, minute)

def combine(self, other):
    """Merge this day-time token with a bare day or a bare time.

    A DayToken swaps in a new day while keeping this token's time; a
    TimeToken swaps in a new time while keeping this token's day.

    >>> dt = DayTimeToken(2018, 8, 18, 3, 0, 'pm')
    >>> day = DayToken(8, 20, 2018)
    >>> dt.combine(day)
    8/20/2018 3 pm
    >>> time = TimeToken(5, 'pm')
    >>> dt.combine(time)
    8/18/2018 5 pm
    """
    assert isinstance(other, (DayToken, TimeToken))
    if isinstance(other, DayToken):
        # New day, this token's time.
        return other.combine(self.time)
    # Guaranteed a TimeToken by the assert above: propagate this token's
    # time info onto the new time, then attach it to this token's day.
    self.time.apply(other)
    return self.day.combine(other)

def datetime(self, now):
# TODO: handle Nones
return datetime.datetime(
Expand Down
46 changes: 42 additions & 4 deletions timefhuman/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,26 @@
def tokenize(characters):
    """Tokenize all characters in the string.
    >>> list(tokenize('7/17-7/18 3 pm- 4 pm'))
    ['7/17', '-', '7/18', '3', 'pm', '-', '4', 'pm']
    >>> list(tokenize('7/17 3 pm- 7/19 2 pm'))
    ['7/17', '3', 'pm', '-', '7/19', '2', 'pm']
    """
    # Two-stage pipeline: generic whitespace/punctuation tokenization,
    # followed by a pass that splits non-numeric dash-joined tokens.
    return clean_dash_tokens(generic_tokenize(characters))


def generic_tokenize(characters):
"""Default tokenizer
>>> list(generic_tokenize('7/17/18 3:00 p.m.'))
['7/17/18', '3:00', 'p.m.']
>>> list(tokenize('July 17, 2018 at 3p.m.'))
>>> list(generic_tokenize('July 17, 2018 at 3p.m.'))
['July', '17', '2018', 'at', '3', 'p.m.']
>>> list(tokenize('July 17, 2018 3 p.m.'))
>>> list(generic_tokenize('July 17, 2018 3 p.m.'))
['July', '17', '2018', '3', 'p.m.']
>>> list(tokenize('3PM on July 17'))
>>> list(generic_tokenize('3PM on July 17'))
['3', 'PM', 'on', 'July', '17']
"""
token = ''
Expand All @@ -34,6 +47,31 @@ def tokenize(characters):
yield token


def clean_dash_tokens(tokens):
    """Split dash-containing tokens whose dash-separated parts are not all integers.

    A token like '7-18' (every part a plain integer) is an ambiguous
    range — e.g. a date or an hour span — and is passed through unchanged
    for later parsing. Any other dash-containing token (e.g. 'pm-' or
    '7/17-7/18') is re-emitted with each dash as its own token; empty
    fragments from leading, trailing, or doubled dashes are dropped, but
    each dash itself is always emitted.

    >>> list(clean_dash_tokens(['7-18', '3', 'pm-']))
    ['7-18', '3', 'pm', '-']
    >>> list(clean_dash_tokens(['7/17-7/18']))
    ['7/17', '-', '7/18']
    """
    for token in tokens:
        parts = token.split('-')
        # len(parts) > 1 iff the token contains at least one dash.
        if len(parts) > 1 and not all(s.isdigit() for s in parts):
            if parts[0]:
                yield parts[0]
            for part in parts[1:]:
                yield '-'
                if part:
                    yield part
        else:
            yield token


def get_character_type(character):
"""
>>> get_character_type('a')
Expand Down
31 changes: 21 additions & 10 deletions timefhuman/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ def build_tree(tokens, now=datetime.datetime.now()):
>>> build_tree([DayToken(7, 5, 2018), TimeToken(3, None), 'or', TimeToken(4, 'pm')])
[7/5/2018 3 pm, 'or', 7/5/2018 4 pm]
"""
tokens = combine_ranges(tokens)
tokens = combine_on_at(tokens)
tokens = combine_ors(tokens)
tokens = combine_days_and_times(tokens)
tokens = combine_ors(tokens) # TODO: is this the cleanest way to do this?
tokens = combine_ranges(tokens)
return tokens


Expand Down Expand Up @@ -112,6 +112,10 @@ def combine_ranges(tokens):
[7/5/2018 - 7/7/2018, 9-11 am]
>>> combine_ranges([TimeToken(7, 'pm'), 'to', DayToken(7, 7, 2018)]) # ignore meaningless 'to' # TODO: assert?
[7 pm, 7/7/2018]
>>> combine_ranges([DayToken(7, 5, 2018), 'to', DayTimeToken(2018, 7, 7, 11)])
[7/5/2018 11 am - 7/7/2018 11 am]
>>> combine_ranges([DayTimeToken(2018, 7, 17, 15, 30), '-', TimeToken(16)])
[7/17/2018 3:30-4 pm]
"""
while '-' in tokens or 'to' in tokens:
if '-' in tokens:
Expand All @@ -127,10 +131,20 @@ def combine_ranges(tokens):
end = tokens[index+1]
start = tokens[index-1]

daytime_day_or_time_step = ifmatchinstance([start, end], (DayTimeToken, (DayToken, TimeToken)))

if areinstance((start, end), TimeToken):
tokens = tokens[:index-1] + [TimeRange(start, end)] + tokens[index+2:]
elif areinstance((start, end), DayToken):
tokens = tokens[:index-1] + [DayRange(start, end)] + tokens[index+2:]
elif daytime_day_or_time_step:
daytime1, day_or_time = [start, end][::daytime_day_or_time_step]
daytime2 = daytime1.combine(day_or_time)
if daytime1 is not start:
daytime1, daytime2 = daytime2, daytime1
tokens = tokens[:index-1] + [DayTimeRange(daytime1, daytime2)] + tokens[index+2:]
elif areinstance((start, end), DayTimeToken):
tokens = tokens[:index-1] + [DayTimeRange(start, end)] + tokens[index+2:]
else:
tokens = tokens[:index] + tokens[index+1:] # ignore meaningless dashes, to
return tokens
Expand Down Expand Up @@ -173,6 +187,8 @@ def combine_days_and_times(tokens):
['or', 7/7/2018 11 am]
>>> combine_days_and_times([TimeToken(11), DayToken(7, 7, 2018)])
[7/7/2018 11 am]
>>> combine_days_and_times([DayToken(7, 17, 2018), TimeToken(15, minute=30), '-', TimeToken(16)])
[7/17/2018 3:30 pm, '-', 4 pm]
"""
cursor = 0
day_tokens = (DayToken, DayRange)
Expand Down Expand Up @@ -212,18 +228,13 @@ def combine_ors(tokens):

# TODO: too explicit, need generic way to "cast"
candidates = (tokens[index-1], tokens[index+1])
day_daytime_step = ifmatchinstance(candidates, (DayToken, DayTimeToken))
time_daytime_step = ifmatchinstance(candidates, (TimeToken, DayTimeToken))
day_or_time_daytime_step = ifmatchinstance(candidates, ((TimeToken, DayToken), DayTimeToken))
time_time_step = ifmatchinstance(candidates, (TimeToken, TimeToken))
amb_timerange_step = ifmatchinstance(candidates, (AmbiguousToken, TimeRange))
timerange_daytimerange_step = ifmatchinstance(candidates, (TimeRange, DayTimeRange))
if day_daytime_step:
day, daytime = candidates[::day_daytime_step]
tokens[index-day_daytime_step] = day.combine(daytime.time)
elif time_daytime_step:
time, daytime = candidates[::time_daytime_step]
time.apply(daytime.time)
tokens[index-time_daytime_step] = daytime.day.combine(time)
if day_or_time_daytime_step:
day_or_time, daytime = candidates[::day_or_time_daytime_step]
tokens[index-day_or_time_daytime_step] = daytime.combine(day_or_time)
elif time_time_step:
time1, time2 = candidates[::time_time_step]
time1.apply(time2)
Expand Down

0 comments on commit 0e87563

Please sign in to comment.