# -*- coding: utf-8 -*-
# Source: https://github.com/almarklein/translate_to_legacy
# Copyright (c) 2016, Almar Klein - this code is subject to the BSD license
# The parser code and regexes are based on code by Rob Reilink from the
# IEP project.
"""
Single module to translate Python 3 code to Python 2.7. Write all your
code in Python 3, and convert it to Python 2.7 at install time.
"""
from __future__ import print_function
import os
import re
# List of fixers from lib3to2: absimport annotations bitlength bool
# bytes classdecorator collections dctsetcomp division except features
# fullargspec funcattrs getcwd imports imports2 input int intern
# itertools kwargs memoryview metaclass methodattrs newstyle next
# numliterals open print printfunction raise range reduce setliteral
# str super throw unittest unpacking with
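# Typical usage, as a minimal sketch (the package and file names below are
# hypothetical):
#
#     from translate_to_legacy import LegacyPythonTranslator
#
#     # Translate a single chunk of source code:
#     new_code = LegacyPythonTranslator(code).translate()
#
#     # Or translate every .py file under a directory, e.g. at install time:
#     LegacyPythonTranslator.translate_dir('mypackage', skip=['legacy_only.py'])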
ALPHANUM = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
KEYWORDS = set(['False', 'None', 'True', 'and', 'as', 'assert', 'break',
'class', 'continue', 'def', 'del', 'elif', 'else', 'except',
'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is',
'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return',
'try', 'while', 'with', 'yield'])
# This regexp is used to find the tokens
tokenProg = re.compile(
    '(#)|' +                  # Comment (group 1) or
    '(' +                     # Begin of string group (group 2)
    '[bB]?[uU]?[rR]?' +       # Possibly bytes, unicode, raw
    '("""|\'\'\'|"|\')' +     # String start (triple quotes first, group 3)
    ')|' +                    # End of string group
    '([' + ALPHANUM + '_]+)'  # Identifiers/numbers (group 4)
)
# regexps to find the end of a comment or string
endProgs = {
"#": re.compile(r"\r?\n"),
"'": re.compile(r"([^\\])(\\\\)*'"),
'"': re.compile(r'([^\\])(\\\\)*"'),
"'''": re.compile(r"([^\\])(\\\\)*'''"),
'"""': re.compile(r'([^\\])(\\\\)*"""'),
}
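# Note: these end patterns only match a closing quote that is preceded by a
# non-backslash character and an even number of backslashes, so an escaped
# quote (e.g. the \' inside 'it\'s') does not terminate the string.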
class CancelTranslation(RuntimeError):
pass # to cancel a translation
class Token:
""" A token in the source code. The type of token can be a comment,
string, keyword, number or identifier. It has functionality to get
information on neighboring tokens and neighboring characters. This
should be enough to do all necessary translations.
If the ``fix`` attribute is set, that string will replace the
current string.
"""
def __init__(self, total_text, type, start, end):
self.total_text = total_text
self.type = type
self.start = start
self.end = end
self.fix = None
def __repr__(self):
return '<token %r>' % self.text
def find_forward(self, s):
""" Find the position of a character to the right.
"""
return self.total_text.find(s, self.end)
def find_backward(self, s):
""" Find the position of a character to the left.
"""
return self.total_text.rfind(s, 0, self.start)
@property
def text(self):
""" The original text of the token.
"""
return self.total_text[self.start:self.end]
@property
def prev_char(self):
""" The first non-whitespace char to the left of this token
that is still on the same line.
"""
i = self.find_backward('\n')
i = i if i >= 0 else 0
line = self.total_text[i:self.start]
line = re.sub(r"\s+", '', line) # remove whitespace
return line[-1:] # return single char or empty string
@property
def next_char(self):
""" Get the first non-whitespace char to the right of this token
that is still on the same line.
"""
i = self.find_forward('\n')
i = i if i >= 0 else len(self.total_text)
line = self.total_text[self.end:i]
line = re.sub(r"\s+", '', line) # remove whitespace
return line[:1] # return single char or empty string
@property
def indentation(self):
""" The number of chars that the current line uses for indentation.
"""
        i = self.find_backward('\n')  # -1 when this token is on the first line
        line1 = self.total_text[i+1:self.start]
line2 = line1.lstrip()
return len(line1) - len(line2)
@property
def line_tokens(self):
""" All (non-comment) tokens that are on the same line.
"""
i1, i2 = self.find_backward('\n'), self.find_forward('\n')
i1 = i1 if i1 >= 0 else 0
i2 = i2 if i2 >= 0 else len(self.total_text)
t = self
tokens = []
while t.prev_token and t.prev_token.start >= i1:
t = t.prev_token
tokens.append(t)
while (t.next_token and t.next_token.end <= i2 and
t.next_token.type != 'comment'):
t = t.next_token
tokens.append(t)
return tokens
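# Example: for a line "import os", the line_tokens of the 'os' token would be
# [<token 'import'>, <token 'os'>]. The prev_token/next_token links used above
# are filled in by BaseTranslator._parse() below.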
class BaseTranslator:
""" Translate Python code. One translator instance is used to
translate one file.
"""
def __init__(self, text):
self._text = text
self._tokens = None
@property
def tokens(self):
""" The list of tokens.
"""
if self._tokens is None:
self._parse()
return self._tokens
def _parse(self):
""" Generate tokens by parsing the code.
"""
self._tokens = []
pos = 0
# Find tokens
while True:
token = self._find_next_token(pos)
if token is None:
break
self._tokens.append(token)
pos = token.end
# Link tokens
if self._tokens:
self._tokens[0].prev_token = None
self._tokens[len(self._tokens)-1].next_token = None
for i in range(0, len(self._tokens)-1):
self._tokens[i].next_token = self._tokens[i+1]
for i in range(1, len(self._tokens)):
self._tokens[i].prev_token = self._tokens[i-1]
def _find_next_token(self, pos):
""" Returns a token or None if no new tokens can be found.
"""
text = self._text
        # If pos is past the end of the text, we are done
if pos > len(text):
return None
# Find the start of the next string or comment
match = tokenProg.search(text, pos)
if not match:
return None
if match.group(1):
# Comment
start = match.start()
end_match = endProgs['#'].search(text, start+1)
end = end_match.start() if end_match else len(text)
return Token(text, 'comment', start, end)
elif match.group(2) is not None:
# String - we start the search for the end-char(s) at end-1,
# because our regexp has to allow for one char (which is
# not backslash) before the end char(s).
start = match.start()
string_style = match.group(3)
end = endProgs[string_style].search(text, match.end() - 1).end()
return Token(text, 'string', start, end)
else:
            # Identifier ("a word or number"). Find out whether it is a keyword.
identifier = match.group(4)
tokenArgs = match.start(), match.end()
if identifier in KEYWORDS:
return Token(text, 'keyword', *tokenArgs)
elif identifier[0] in '0123456789':
return Token(text, 'number', *tokenArgs)
else:
return Token(text, 'identifier', *tokenArgs)
def translate(self):
""" Translate the code by applying fixes to the tokens. Returns
the new code as a string.
"""
        # Collect fixers. Sort by name, so at least it's consistent.
fixers = []
for name in sorted(dir(self)):
if name.startswith('fix_'):
fixers.append(getattr(self, name))
# Apply fixers
new_tokens = []
for i, token in enumerate(self.tokens):
for fixer in fixers:
new_token = fixer(token)
if isinstance(new_token, Token):
assert new_token.start == new_token.end
if new_token.start <= token.start:
new_tokens.append((i, new_token))
else:
new_tokens.append((i+1, new_token))
# Insert new tokens
for i, new_token in reversed(new_tokens):
self._tokens.insert(i, new_token)
return self.dumps()
def dumps(self):
""" Return a string with the translated code.
"""
text = self._text
pos = len(self._text)
pieces = []
for t in reversed(self.tokens):
pieces.append(text[t.end:pos])
pieces.append(t.fix if t.fix is not None else t.text)
pos = t.start
pieces.append(text[:pos])
return ''.join(reversed(pieces))
@classmethod
def translate_dir(cls, dirname, skip=()):
""" Classmethod to translate all .py files in the given
directory and its subdirectories. Skips files that match names
in skip (which can be full file names, absolute paths, and paths
        relative to dirname). Files whose translation raises
        CancelTranslation (e.g. because they already import the handled
        names from __future__, see fix_cancel) are left untouched.
"""
dirname = os.path.normpath(dirname)
skip = [os.path.normpath(p) for p in skip]
for root, dirs, files in os.walk(dirname):
for fname in files:
if fname.endswith('.py'):
filename = os.path.join(root, fname)
relpath = os.path.relpath(filename, dirname)
if fname in skip or relpath in skip or filename in skip:
print('%s skipped: %r' % (cls.__name__, relpath))
continue
code = open(filename, 'rb').read().decode('utf-8')
try:
new_code = cls(code).translate()
except CancelTranslation:
print('%s cancelled: %r' % (cls.__name__, relpath))
else:
with open(filename, 'wb') as f:
f.write(new_code.encode('utf-8'))
print('%s translated: %r' % (cls.__name__, relpath))
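# Fixers are discovered by name: translate() calls every method starting with
# 'fix_' for each token, and a fixer reacts by setting token.fix (replacing
# the token's text) or by returning a new zero-width Token (inserting text).
# A minimal sketch of a custom translator, with a made-up rule:
#
#     class MyTranslator(BaseTranslator):
#
#         def fix_foo(self, token):
#             if token.type == 'identifier' and token.text == 'foo':
#                 token.fix = 'bar'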
class LegacyPythonTranslator(BaseTranslator):
""" A Translator to translate Python 3 to Python 2.7.
"""
FUTURES = ('print_function', 'absolute_import', 'with_statement',
'unicode_literals', 'division')
def dumps(self):
return '# -*- coding: utf-8 -*-\n' + BaseTranslator.dumps(self)
def fix_cancel(self, token):
""" Cancel translation if using `from __future__ import xxx`
"""
if token.type == 'keyword' and (token.text == 'from' and
token.next_token.text == '__future__'):
for future in self.FUTURES:
if any([t.text == future for t in token.line_tokens]):
# Assume this module is already Python 2.7 compatible
raise CancelTranslation()
def fix_future(self, token):
""" Fix print_function, absolute_import, with_statement.
"""
status = getattr(self, '_future_status', 0)
if status == 2:
return # Done
if status == 0 and token.type == 'string':
self._future_status = 1 # docstring
elif token.type != 'comment':
self._future_status = 2 # done
i = max(0, token.find_backward('\n'))
t = Token(token.total_text, '', i, i)
t.fix = '\nfrom __future__ import %s\n' % (', '.join(self.FUTURES))
return t
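    # For a module that starts with a docstring, this inserts a single
    # "from __future__ import ..." line (listing all names in FUTURES) just
    # before the first statement after that docstring and any leading comments.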
def fix_newstyle(self, token):
""" Fix to always use new style classes.
"""
if token.type == 'keyword' and token.text == 'class':
nametoken = token.next_token
if nametoken.next_char != '(':
nametoken.fix = '%s(object)' % nametoken.text
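    # e.g. "class Foo:" becomes "class Foo(object):"; classes that already
    # list a base class are left unchanged.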
def fix_super(self, token):
""" Fix super() -> super(Cls, self)
"""
# First keep track of the current class
if token.type == 'keyword':
if token.text == 'class':
self._current_class = token.indentation, token.next_token.text
elif token.text == 'def':
indent, name = getattr(self, '_current_class', (0, ''))
if token.indentation <= indent:
self._current_class = 0, ''
# Then check for super
if token.type == 'identifier' and token.text == 'super':
if token.prev_char != '.' and token.next_char == '(':
i = token.find_forward(')')
sub = token.total_text[token.end:i+1]
if re.sub(r"\s+", '', sub) == '()':
indent, name = getattr(self, '_current_class', (0, ''))
if name:
token.end = i + 1
token.fix = 'super(%s, self)' % name
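    # e.g. inside "class Foo(Base):", a bare "super().__init__()" becomes
    # "super(Foo, self).__init__()"; super() calls that already pass
    # arguments are left alone.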
# Note: we use "from __future__ import unicode_literals"
# def fix_unicode_literals(self, token):
# if token.type == 'string':
# if token.text.lstrip('r').startswith(('"', "'")): # i.e. no b/u
# token.fix = 'u' + token.text
def fix_unicode(self, token):
if token.type == 'identifier':
if token.text == 'chr' and token.next_char == '(':
# Calling chr
token.fix = 'unichr'
elif token.text == 'str' and token.next_char == '(':
# Calling str
token.fix = 'unicode'
elif token.text == 'str' and (token.next_char == ')' and
token.prev_char == '(' and
token.line_tokens[0].text == 'class'):
token.fix = 'unicode'
elif token.text == 'isinstance' and token.next_char == '(':
# Check for usage of str in isinstance
end = token.find_forward(')')
t = token.next_token
while t.next_token and t.next_token.start < end:
t = t.next_token
if t.text == 'str':
t.fix = 'basestring'
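    # e.g. "str(x)" -> "unicode(x)", "chr(i)" -> "unichr(i)", and
    # "isinstance(x, str)" -> "isinstance(x, basestring)".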
def fix_range(self, token):
if token.type == 'identifier' and token.text == 'range':
if token.next_char == '(' and token.prev_char != '.':
token.fix = 'xrange'
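    # e.g. "range(10)" becomes "xrange(10)"; attribute access such as a
    # hypothetical "mod.range(10)" is left alone because of the prev_char check.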
def fix_encode(self, token):
        if token.type == 'identifier' and token.text in ('encode', 'decode'):
if token.next_char == '(' and token.prev_char == '.':
end = token.find_forward(')')
if not (token.next_token and token.next_token.start < end):
token.fix = token.text + '("utf-8")'
token.end = end + 1
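    # e.g. "text.encode()" becomes 'text.encode("utf-8")'; calls that already
    # pass an argument are left unchanged.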
def fix_getcwd(self, token):
""" Fix os.getcwd -> os.getcwdu
"""
if token.type == 'identifier' and token.text == 'getcwd':
if token.next_char == '(':
token.fix = 'getcwdu'
def fix_imports(self, token):
""" import xx.yy -> import zz
"""
if token.type == 'keyword' and token.text == 'import':
tokens = token.line_tokens
# For each import case ...
for name, replacement in self.IMPORT_MAPPING.items():
parts = name.split('.')
# Walk over tokens to find start of match
for i in range(len(tokens)):
if (tokens[i].text == parts[0] and
len(tokens[i:]) >= len(parts)):
# Is it a complete match?
for j, part in enumerate(parts):
if tokens[i+j].text != part:
break
else:
                            # Match, merge tokens
tokens[i].end = tokens[i+len(parts)-1].end
tokens[i].fix = replacement
for j in range(1, len(parts)):
tokens[i+j].start = tokens[i].end
tokens[i+j].end = tokens[i].end
tokens[i+j].fix = ''
break # we have found the match
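    # e.g. "import queue" becomes "import Queue", and "import http.client"
    # becomes "import httplib", following IMPORT_MAPPING below.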
def fix_imports2(self, token):
""" from xx.yy import zz -> from vv import zz
"""
if token.type == 'keyword' and token.text == 'import':
tokens = token.line_tokens
            # We use the fact that all keys of IMPORT_MAPPING2 consist of two names
if tokens[0].text == 'from' and len(tokens) == 5:
if tokens[3].text == 'import':
xxyy = tokens[1].text + '.' + tokens[2].text
name = tokens[4].text
if xxyy in self.IMPORT_MAPPING2:
for possible_module in self.IMPORT_MAPPING2[xxyy]:
if name in self.PY2MODULES[possible_module]:
tokens[1].fix = possible_module
tokens[1].end = tokens[2].end
tokens[2].start = tokens[2].end
break
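    # e.g. "from urllib.request import urlopen" becomes
    # "from urllib2 import urlopen", because 'urlopen' is listed under
    # 'urllib2' in PY2MODULES below.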
    # Map Python 3 import paths to their legacy (Python 2) import paths
IMPORT_MAPPING = {
"reprlib": "repr",
"winreg": "_winreg",
"configparser": "ConfigParser",
"copyreg": "copy_reg",
"queue": "Queue",
"socketserver": "SocketServer",
"_markupbase": "markupbase",
"test.support": "test.test_support",
"dbm.bsd": "dbhash",
"dbm.ndbm": "dbm",
"dbm.dumb": "dumbdbm",
"dbm.gnu": "gdbm",
"html.parser": "HTMLParser",
"html.entities": "htmlentitydefs",
"http.client": "httplib",
"http.cookies": "Cookie",
"http.cookiejar": "cookielib",
"urllib.robotparser": "robotparser",
"xmlrpc.client": "xmlrpclib",
"builtins": "__builtin__",
}
    # Map Python 3 import paths to a set of possible legacy import paths;
    # the right one is chosen per imported name (see fix_imports2 and PY2MODULES)
IMPORT_MAPPING2 = {
'urllib.request': ('urllib2', 'urllib'),
'urllib.error': ('urllib2', 'urllib'),
'urllib.parse': ('urllib2', 'urllib', 'urlparse'),
'dbm.__init__': ('anydbm', 'whichdb'),
'http.server': ('CGIHTTPServer', 'SimpleHTTPServer', 'BaseHTTPServer'),
'xmlrpc.server': ('DocXMLRPCServer', 'SimpleXMLRPCServer'),
}
# This defines what names are in specific Python 2 modules
PY2MODULES = {
'urllib2' : (
'AbstractBasicAuthHandler', 'AbstractDigestAuthHandler',
'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler',
'FTPHandler', 'FileHandler', 'HTTPBasicAuthHandler',
'HTTPCookieProcessor', 'HTTPDefaultErrorHandler',
'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor',
'HTTPHandler', 'HTTPPasswordMgr',
'HTTPPasswordMgrWithDefaultRealm', 'HTTPRedirectHandler',
'HTTPSHandler', 'OpenerDirector', 'ProxyBasicAuthHandler',
'ProxyDigestAuthHandler', 'ProxyHandler', 'Request',
'StringIO', 'URLError', 'UnknownHandler', 'addinfourl',
'build_opener', 'install_opener', 'parse_http_list',
'parse_keqv_list', 'randombytes', 'request_host', 'urlopen'),
'urllib' : (
'ContentTooShortError', 'FancyURLopener', 'URLopener',
'basejoin', 'ftperrors', 'getproxies',
'getproxies_environment', 'localhost', 'pathname2url',
'quote', 'quote_plus', 'splitattr', 'splithost',
'splitnport', 'splitpasswd', 'splitport', 'splitquery',
'splittag', 'splittype', 'splituser', 'splitvalue',
'thishost', 'unquote', 'unquote_plus', 'unwrap',
'url2pathname', 'urlcleanup', 'urlencode', 'urlopen',
'urlretrieve',),
'urlparse' : (
'parse_qs', 'parse_qsl', 'urldefrag', 'urljoin',
'urlparse', 'urlsplit', 'urlunparse', 'urlunsplit'),
'dbm' : (
'ndbm', 'gnu', 'dumb'),
'anydbm' : (
'error', 'open'),
'whichdb' : (
'whichdb',),
'BaseHTTPServer' : (
'BaseHTTPRequestHandler', 'HTTPServer'),
'CGIHTTPServer' : (
'CGIHTTPRequestHandler',),
'SimpleHTTPServer' : (
'SimpleHTTPRequestHandler',),
'DocXMLRPCServer' : (
'DocCGIXMLRPCRequestHandler', 'DocXMLRPCRequestHandler',
'DocXMLRPCServer', 'ServerHTMLDoc', 'XMLRPCDocGenerator'),
}
if __name__ == '__main__':
# Awesome for testing
code = """
"""
t = LegacyPythonTranslator(code)
new_code = t.translate()
print(t.tokens)
print('---')
print(new_code)