-
Notifications
You must be signed in to change notification settings - Fork 4
/
parser.py
341 lines (263 loc) · 12.1 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
#!/usr/bin/env python
# Fylm
# Copyright 2021 github.com/brandonscript
# This program is bound to the Hippocratic License 2.1
# Full text is available here:
# https://firstdonoharm.dev/version/2/1/license
# Further to adherence to the Hippocratic License, this program is
# free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. Full text is available here:
# http://www.gnu.org/licenses
# Where a conflict or dispute would arise between these two licenses, HLv2.1
# shall take precedence.
"""Main file parser for Fylm.
This module takes raw input source paths and cleans/analyzes them
to determine various properties used in the name correction
and TMDb lookup.
parser: the main class exported by this module.
"""
import re
from lazy import lazy
from typing import Union
from pathlib import Path
import fylmlib.config as config
import fylmlib.patterns as patterns
from .tools import *
from .enums import *
from . import Format
from . import Console
from timeit import default_timer as timer
class Parser:
    """A collection of string parsing utilities to apply regular
    expressions and extract critical information from a path.

    Instantiate Parser with your path, then read the lazily evaluated
    attributes (`title`, `year`, `edition`, ...) on it.

    Attributes:
        path (str, Path, or FilmPath): The path being parsed. When the input
            exposes a `main_file` that exists, this is
            `main_file.filmrel`; otherwise it is the input itself.
        parts (tuple): The components of `path`, as given by `Path(path).parts`.
        s (str): String representation of the whole of `path`.

    Args:
        path (str, Path, or FilmPath): Relative or absolute path to a film file
    """

    def __init__(self, path: Union[str, 'Path', 'FilmPath']):
        try:
            self.path = path.main_file.filmrel if path.main_file.exists() else path
        except Exception:
            # Plain strings and Paths have no `main_file` (and a FilmPath's
            # may be unusable); fall back to the path exactly as given. This
            # is deliberately best-effort, but unlike a bare `except` it lets
            # KeyboardInterrupt/SystemExit propagate.
            self.path = path

        self.parts = Path(self.path).parts

        # The full path as a string; `title` is responsible for choosing
        # the most useful single path segment from `parts`.
        self.s = str(self.path)

    def _warn_if_slow(self, what: str, start: float) -> None:
        """Report to the console when a parsing step took unusually long.

        Args:
            what: Name of the property that was parsed (e.g. 'title').
            start: The `timer()` value captured when parsing began.
        """
        elapsed = timer() - start
        if round(elapsed) > 1:
            # `self.path` may be a plain str or Path, neither of which has
            # a `filmrel` attribute, so fall back to the path itself.
            source = getattr(self.path, 'filmrel', self.path)
            Console.slow(
                f"Took a long time parsing {what} from '{source}'", elapsed)

    @lazy
    def title(self) -> str:
        """Get title from full path of file or folder.

        Use regular expressions to strip, clean, and format a file
        or folder path into a more pleasant film title.

        Returns:
            A clean and well-formed film title.
        """
        start = timer()

        # Use the first path part that contains a year, or the longest
        # part if none of them contains a year.
        title = self.s
        if self.parts:
            dated = first(iter(self.parts), where=lambda x: Parser(x).year)
            title = str(dated) if dated else max(self.parts, key=len)

        # Strip "tag" prefixes from the title.
        for prefix in config.strip_prefixes:
            if title.lower().startswith(prefix.lower()):
                title = title[len(prefix):]

        # Use the 'STRIP_FROM_TITLE' regular expression to replace unwanted
        # characters in a title with a space.
        title = re.sub(patterns.STRIP_FROM_TITLE, ' ', title)

        # If the title contains a known edition, strip it from the title. E.g.,
        # if we have Dinosaur.Special.Edition, we already know the edition, and
        # we don't need it to appear, duplicated, in the title. Because
        # `_edition_map` returns a (key, value) tuple, we check for the search
        # key here and replace it (not the value).
        edition_rx = self._edition_map[0]
        if edition_rx is not None:
            title = re.sub(edition_rx, '', title)

        # Typical naming patterns place the year as a delimiter between the
        # title and the rest of the file, so keep only the portion to the left
        # of the year. Skip this when no year was found: splitting on
        # str(None) would truncate any title containing the text "None".
        if self.year is not None:
            title = title.split(str(self.year))[0]

        # Strip all resolution and media tags from the title.
        title = re.sub(patterns.MEDIA, '', title)
        title = re.sub(patterns.RESOLUTION, '', title)

        # If a title ends with ', The', we need to remove it and prepend it
        # to the start of the title.
        if re.search(patterns.THE_PREFIX_SUFFIX, title):
            title = f"The {re.sub(patterns.THE_PREFIX_SUFFIX, '', title)}"

        # Remove the encoding string, if it exists.
        title = re.sub(patterns.ENCODING, '', title)

        # Add back in . to titles or strings we know need to keep periods.
        # Looking at you, S.W.A.T and After.Life. Only the first configured
        # entry that matches is applied.
        for k in config.keep_period:
            q = k.replace('.', '[ .]')
            rx = re.compile(r'\b' + q + r'?', re.I)
            if re.search(rx, title):
                title = re.sub(rx, k + ' ', title)
                break

        # Remove trailing non-word characters like ' - '
        title = Format.strip_trailing_nonword_chars(title)

        # Remove extra whitespace from the edges of the title and remove
        # repeating whitespace.
        title = Format.strip_extra_whitespace(title.strip())

        # Correct the case of the title
        title = Format.title_case(title)

        # Always uppercase strings that are meant to be in all caps.
        # `re.sub` is a no-op when there is no match, so no pre-check needed.
        for u in config.always_upper:
            rx = re.compile(r'\b(' + u + r')\b', re.I)
            title = re.sub(rx, lambda m: m.group(1).upper(), title)

        self._warn_if_slow('title', start)
        return title

    @lazy
    def year(self) -> Union[int, None]:
        """Get year from full path of file or folder.

        Searches the path string with `patterns.YEAR` and uses the right-most
        match if there is more than one (looking at you,
        2001: A Space Odyssey).

        Returns:
            An integer representing the release year, or None if
            no year could be determined.
        """
        start = timer()

        # Take the last of all year matches. (Per the original notes the
        # pattern covers 1910-2159 so that '2160p' is never read as a year;
        # that range lives in patterns.YEAR, not here.)
        m = last(re.finditer(patterns.YEAR, self.s), default=None)

        # Retrieve the 'year' capture group by name, or None without a match.
        year = int(m.group('year')) if m else None

        self._warn_if_slow('year', start)
        return year

    @lazy
    def edition(self) -> Union[str, None]:
        """Get and correct special edition from full path of file or folder.

        The lookup itself is performed by `_edition_map`, which searches the
        path using the expressions defined in config.edition_map.

        Returns:
            A corrected string representing the film's edition, or None.
        """
        # `_edition_map` returns a (key, value) tuple; the second item is
        # the corrected edition string. Empty strings are normalized to None.
        return self._edition_map[1] or None

    @lazy
    def resolution(self) -> 'Resolution':
        """Parse resolution from the path string using a regular expression.

        Returns:
            A Resolution enum member for the file's resolution, or
            Resolution.UNKNOWN if none could be determined.
        """
        # Search for any of the known resolutions and keep the last match.
        m = last(re.finditer(patterns.RESOLUTION, self.s), default=None)
        if not m:
            return Resolution.UNKNOWN

        # Retrieve the 'resolution' capture group by name and map it
        # onto the matching enum member.
        resolution = m.group('resolution')
        if resolution.lower() == '4k' or resolution.startswith('2160'):
            return Resolution.UHD_2160P
        if resolution.startswith('1080'):
            return Resolution.HD_1080P
        if resolution.startswith('720'):
            return Resolution.HD_720P
        if resolution.startswith('576'):
            return Resolution.SD_576P
        if resolution.startswith('480'):
            return Resolution.SD_480P
        return Resolution.UNKNOWN

    @lazy
    def media(self) -> 'Media':
        """Get media from full path of file or folder.

        Use regular expressions to identify the original media of the file.

        Returns:
            A Media enum member for the media found, or Media.UNKNOWN.
        """
        match = re.search(patterns.MEDIA, self.s)
        if not match:
            return Media.UNKNOWN

        # The first named group that captured something decides the result.
        if match.group('bluray'):
            return Media.BLURAY
        if match.group('webdl'):
            return Media.WEBDL
        if match.group('hdtv'):
            return Media.HDTV
        if match.group('dvd'):
            return Media.DVD
        if match.group('sdtv'):
            return Media.SDTV
        return Media.UNKNOWN

    @lazy
    def is_hdr(self) -> bool:
        """Determine whether the media is an HDR file.

        Use regular expressions to identify whether the media is HDR or not.

        Returns:
            A bool representing the HDR status of the media.
        """
        match = re.search(patterns.HDR, self.s)
        return bool(match and match.group('hdr'))

    @lazy
    def is_proper(self) -> bool:
        """Determine whether the media is a proper rip.

        Use regular expressions to identify whether the file is a proper or not.

        Returns:
            A bool representing the proper state of the media.
        """
        match = re.search(patterns.PROPER, self.s)
        return bool(match and match.group('proper'))

    @lazy
    def part(self) -> Union[str, None]:
        """Get part # from full path of file or folder.

        Use regular expressions to identify the part # of the file.

        Returns:
            An uppercased string representing the part # of the title, or
            None if no match is found.
        """
        # Search for a matching part condition
        match = re.search(patterns.PART, self.s)

        # If a match exists, convert it to uppercase.
        return match.group('part').upper() if match else None

    @lazy
    def _edition_map(self) -> tuple:
        """Internal helper to search for special edition strings in a path.

        Iterates through config.edition_map, builds a case-insensitive,
        word-bounded regular expression for each search key, and returns
        details of the first one that matches the path string.

        Returns:
            A (key, value) tuple containing the matching compiled regular
            expression and its corrected counterpart, or (None, None).
        """
        for key, value in config.edition_map:
            # Generate a regular expression that searches for the search key,
            # separated by word boundaries.
            rx = re.compile(r'\b' + key + r'\b', re.I)

            # Because this map is in a specific order, if we find a suitable
            # match, we want to return it right away.
            result = re.search(rx, self.s)
            if result:
                # Return the matching compiled expression together with its
                # corrected value after performing a capture group replace
                # on the matched text.
                return (rx, rx.sub(value, result.group()))

        # If no matches are found, return (None, None)
        return (None, None)