# -*- coding: utf-8 -*-
# Source: https://github.com/almarklein/translate_to_legacy
# Copyright (c) 2016, Almar Klein - this code is subject to the BSD license
# The parser code and regexes are based on code by Rob Reilink from the
# IEP project.
"""
Single module to translate Python 3 code to Python 2.7. Write all your
code in Python 3, and convert it to Python 2.7 at install time.
"""
from __future__ import print_function
import os
import re
# List of fixers from lib3to2: absimport annotations bitlength bool
# bytes classdecorator collections dctsetcomp division except features
# fullargspec funcattrs getcwd imports imports2 input int intern
# itertools kwargs memoryview metaclass methodattrs newstyle next
# numliterals open print printfunction raise range reduce setliteral
# str super throw unittest unpacking with
ALPHANUM = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
KEYWORDS = set(['False', 'None', 'True', 'and', 'as', 'assert', 'break',
'class', 'continue', 'def', 'del', 'elif', 'else', 'except',
'finally', 'for', 'from', 'global', 'if', 'import', 'in', 'is',
'lambda', 'nonlocal', 'not', 'or', 'pass', 'raise', 'return',
'try', 'while', 'with', 'yield'])
# This regexp is used to find the tokens
tokenProg = re.compile(
    '(#)|' +                  # Comment (group 1), or
    '(' +                     # Begin of string group (group 2)
    '[bB]?[uU]?[rR]?' +       # Possibly bytes, unicode, raw
    '("""|\'\'\'|"|\')' +     # String start (triple quotes first, group 3)
    ')|' +                    # End of string group
    '([' + ALPHANUM + '_]+)'  # Identifiers/numbers (group 4)
)
# regexps to find the end of a comment or string
endProgs = {
"#": re.compile(r"\r?\n"),
"'": re.compile(r"([^\\])(\\\\)*'"),
'"': re.compile(r'([^\\])(\\\\)*"'),
"'''": re.compile(r"([^\\])(\\\\)*'''"),
'"""': re.compile(r'([^\\])(\\\\)*"""'),
}
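# For example, in the line  x = "a # b"  # real comment  the tokenizer
# yields an identifier token for x, one string token covering "a # b"
# (the end regexes above step over escaped quotes, and a '#' inside a
# string does not start a comment), and a comment token for the trailing
# part; whitespace and operators are not tokenized.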
class CancelTranslation(RuntimeError):
pass # to cancel a translation
class Token:
""" A token in the source code. The type of token can be a comment,
string, keyword, number or identifier. It has functionality to get
information on neighboring tokens and neighboring characters. This
should be enough to do all necessary translations.
If the ``fix`` attribute is set, that string will replace the
current string.
"""
def __init__(self, total_text, type, start, end):
self.total_text = total_text
self.type = type
self.start = start
self.end = end
self.fix = None
def __repr__(self):
return '<token %r>' % self.text
def find_forward(self, s):
""" Find the position of a character to the right.
"""
return self.total_text.find(s, self.end)
def find_backward(self, s):
""" Find the position of a character to the left.
"""
return self.total_text.rfind(s, 0, self.start)
@property
def text(self):
""" The original text of the token.
"""
return self.total_text[self.start:self.end]
@property
def prev_char(self):
""" The first non-whitespace char to the left of this token
that is still on the same line.
"""
i = self.find_backward('\n')
i = i if i >= 0 else 0
line = self.total_text[i:self.start]
line = re.sub(r"\s+", '', line) # remove whitespace
return line[-1:] # return single char or empty string
@property
def next_char(self):
""" Get the first non-whitespace char to the right of this token
that is still on the same line.
"""
i = self.find_forward('\n')
i = i if i >= 0 else len(self.total_text)
line = self.total_text[self.end:i]
line = re.sub(r"\s+", '', line) # remove whitespace
return line[:1] # return single char or empty string
@property
def indentation(self):
""" The number of chars that the current line uses for indentation.
"""
i = max(0, self.find_backward('\n'))
line1 = self.total_text[i+1:self.start]
line2 = line1.lstrip()
return len(line1) - len(line2)
@property
def line_tokens(self):
""" All (non-comment) tokens that are on the same line.
"""
i1, i2 = self.find_backward('\n'), self.find_forward('\n')
i1 = i1 if i1 >= 0 else 0
i2 = i2 if i2 >= 0 else len(self.total_text)
t = self
tokens = []
while t.prev_token and t.prev_token.start >= i1:
t = t.prev_token
tokens.append(t)
while (t.next_token and t.next_token.end <= i2 and
t.next_token.type != 'comment'):
t = t.next_token
tokens.append(t)
return tokens
class BaseTranslator:
""" Translate Python code. One translator instance is used to
translate one file.
"""
def __init__(self, text):
self._text = text
self._tokens = None
@property
def tokens(self):
""" The list of tokens.
"""
if self._tokens is None:
self._parse()
return self._tokens
def _parse(self):
""" Generate tokens by parsing the code.
"""
self._tokens = []
pos = 0
# Find tokens
while True:
token = self._find_next_token(pos)
if token is None:
break
self._tokens.append(token)
pos = token.end
# Link tokens
if self._tokens:
self._tokens[0].prev_token = None
self._tokens[len(self._tokens)-1].next_token = None
for i in range(0, len(self._tokens)-1):
self._tokens[i].next_token = self._tokens[i+1]
for i in range(1, len(self._tokens)):
self._tokens[i].prev_token = self._tokens[i-1]
def _find_next_token(self, pos):
""" Returns a token or None if no new tokens can be found.
"""
text = self._text
        # If pos is beyond the end of the text, we're done
if pos > len(text):
return None
# Find the start of the next string or comment
match = tokenProg.search(text, pos)
if not match:
return None
if match.group(1):
# Comment
start = match.start()
end_match = endProgs['#'].search(text, start+1)
end = end_match.start() if end_match else len(text)
return Token(text, 'comment', start, end)
elif match.group(2) is not None:
# String - we start the search for the end-char(s) at end-1,
# because our regexp has to allow for one char (which is
# not backslash) before the end char(s).
start = match.start()
string_style = match.group(3)
end = endProgs[string_style].search(text, match.end() - 1).end()
return Token(text, 'string', start, end)
else:
# Identifier ("a word or number") Find out whether it is a key word
identifier = match.group(4)
tokenArgs = match.start(), match.end()
if identifier in KEYWORDS:
return Token(text, 'keyword', *tokenArgs)
elif identifier[0] in '0123456789':
return Token(text, 'number', *tokenArgs)
else:
return Token(text, 'identifier', *tokenArgs)
def translate(self):
""" Translate the code by applying fixes to the tokens. Returns
the new code as a string.
"""
        # Collect fixers. Sort by name so that the order is at least consistent.
fixers = []
for name in sorted(dir(self)):
if name.startswith('fix_'):
fixers.append(getattr(self, name))
# Apply fixers
new_tokens = []
for i, token in enumerate(self.tokens):
for fixer in fixers:
new_token = fixer(token)
if isinstance(new_token, Token):
assert new_token.start == new_token.end
if new_token.start <= token.start:
new_tokens.append((i, new_token))
else:
new_tokens.append((i+1, new_token))
# Insert new tokens
for i, new_token in reversed(new_tokens):
self._tokens.insert(i, new_token)
return self.dumps()
def dumps(self):
""" Return a string with the translated code.
"""
text = self._text
pos = len(self._text)
pieces = []
for t in reversed(self.tokens):
pieces.append(text[t.end:pos])
pieces.append(t.fix if t.fix is not None else t.text)
pos = t.start
pieces.append(text[:pos])
return ''.join(reversed(pieces))
@classmethod
def translate_dir(cls, dirname, skip=()):
""" Classmethod to translate all .py files in the given
directory and its subdirectories. Skips files that match names
in skip (which can be full file names, absolute paths, and paths
        relative to dirname). Translation of a file is cancelled (the
        file is left untouched) when a fixer raises CancelTranslation,
        e.g. when the file already imports 'print_function' from
        __future__.
"""
dirname = os.path.normpath(dirname)
skip = [os.path.normpath(p) for p in skip]
for root, dirs, files in os.walk(dirname):
for fname in files:
if fname.endswith('.py'):
filename = os.path.join(root, fname)
relpath = os.path.relpath(filename, dirname)
if fname in skip or relpath in skip or filename in skip:
print('%s skipped: %r' % (cls.__name__, relpath))
continue
code = open(filename, 'rb').read().decode('utf-8')
try:
new_code = cls(code).translate()
except CancelTranslation:
print('%s cancelled: %r' % (cls.__name__, relpath))
else:
with open(filename, 'wb') as f:
f.write(new_code.encode('utf-8'))
print('%s translated: %r' % (cls.__name__, relpath))
class LegacyPythonTranslator(BaseTranslator):
""" A Translator to translate Python 3 to Python 2.7.
"""
FUTURES = ('print_function', 'absolute_import', 'with_statement',
'unicode_literals', 'division')
def dumps(self):
return '# -*- coding: utf-8 -*-\n' + BaseTranslator.dumps(self)
def fix_cancel(self, token):
""" Cancel translation if using `from __future__ import xxx`
"""
if token.type == 'keyword' and (token.text == 'from' and
token.next_token.text == '__future__'):
for future in self.FUTURES:
if any([t.text == future for t in token.line_tokens]):
# Assume this module is already Python 2.7 compatible
raise CancelTranslation()
def fix_future(self, token):
""" Fix print_function, absolute_import, with_statement.
"""
status = getattr(self, '_future_status', 0)
if status == 2:
return # Done
if status == 0 and token.type == 'string':
self._future_status = 1 # docstring
elif token.type != 'comment':
self._future_status = 2 # done
i = max(0, token.find_backward('\n'))
t = Token(token.total_text, '', i, i)
t.fix = '\nfrom __future__ import %s\n' % (', '.join(self.FUTURES))
return t
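    # For example, directly after the module docstring fix_future inserts:
    #
    #     from __future__ import print_function, absolute_import, with_statement, unicode_literals, division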
def fix_newstyle(self, token):
""" Fix to always use new style classes.
"""
if token.type == 'keyword' and token.text == 'class':
nametoken = token.next_token
if nametoken.next_char != '(':
nametoken.fix = '%s(object)' % nametoken.text
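    # For example, "class Foo:" becomes "class Foo(object):"; classes that
    # already list a base class are left alone.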
def fix_super(self, token):
""" Fix super() -> super(Cls, self)
"""
# First keep track of the current class
if token.type == 'keyword':
if token.text == 'class':
self._current_class = token.indentation, token.next_token.text
elif token.text == 'def':
indent, name = getattr(self, '_current_class', (0, ''))
if token.indentation <= indent:
self._current_class = 0, ''
# Then check for super
if token.type == 'identifier' and token.text == 'super':
if token.prev_char != '.' and token.next_char == '(':
i = token.find_forward(')')
sub = token.total_text[token.end:i+1]
if re.sub(r"\s+", '', sub) == '()':
indent, name = getattr(self, '_current_class', (0, ''))
if name:
token.end = i + 1
token.fix = 'super(%s, self)' % name
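    # For example, inside "class Foo", a bare "super().__init__()" becomes
    # "super(Foo, self).__init__()"; super() calls that already pass
    # arguments are left alone.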
# Note: we use "from __future__ import unicode_literals"
# def fix_unicode_literals(self, token):
# if token.type == 'string':
# if token.text.lstrip('r').startswith(('"', "'")): # i.e. no b/u
# token.fix = 'u' + token.text
def fix_unicode(self, token):
if token.type == 'identifier':
if token.text == 'chr' and token.next_char == '(':
# Calling chr
token.fix = 'unichr'
elif token.text == 'str' and token.next_char == '(':
# Calling str
token.fix = 'unicode'
elif token.text == 'str' and (token.next_char == ')' and
token.prev_char == '(' and
token.line_tokens[0].text == 'class'):
token.fix = 'unicode'
elif token.text == 'isinstance' and token.next_char == '(':
# Check for usage of str in isinstance
end = token.find_forward(')')
t = token.next_token
while t.next_token and t.next_token.start < end:
t = t.next_token
if t.text == 'str':
t.fix = 'basestring'
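    # For example, "str(x)" becomes "unicode(x)", "chr(65)" becomes
    # "unichr(65)", and "isinstance(x, str)" becomes
    # "isinstance(x, basestring)".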
def fix_range(self, token):
if token.type == 'identifier' and token.text == 'range':
if token.next_char == '(' and token.prev_char != '.':
token.fix = 'xrange'
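    # For example, "range(10)" becomes "xrange(10)"; attribute lookups such
    # as "obj.range(10)" are left alone because of the prev_char check.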
def fix_encode(self, token):
        if token.type == 'identifier' and token.text in ('encode', 'decode'):
if token.next_char == '(' and token.prev_char == '.':
end = token.find_forward(')')
if not (token.next_token and token.next_token.start < end):
token.fix = token.text + '("utf-8")'
token.end = end + 1
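    # For example, "s.encode()" becomes 's.encode("utf-8")'; calls that
    # already pass an argument are left alone.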
def fix_getcwd(self, token):
""" Fix os.getcwd -> os.getcwdu
"""
if token.type == 'identifier' and token.text == 'getcwd':
if token.next_char == '(':
token.fix = 'getcwdu'
def fix_imports(self, token):
""" import xx.yy -> import zz
"""
if token.type == 'keyword' and token.text == 'import':
tokens = token.line_tokens
# For each import case ...
for name, replacement in self.IMPORT_MAPPING.items():
parts = name.split('.')
# Walk over tokens to find start of match
for i in range(len(tokens)):
if (tokens[i].text == parts[0] and
len(tokens[i:]) >= len(parts)):
# Is it a complete match?
for j, part in enumerate(parts):
if tokens[i+j].text != part:
break
else:
                            # Match; merge the tokens
tokens[i].end = tokens[i+len(parts)-1].end
tokens[i].fix = replacement
for j in range(1, len(parts)):
tokens[i+j].start = tokens[i].end
tokens[i+j].end = tokens[i].end
tokens[i+j].fix = ''
break # we have found the match
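    # For example, "import configparser" becomes "import ConfigParser" and
    # "import html.parser" becomes "import HTMLParser", following the
    # IMPORT_MAPPING table below.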
def fix_imports2(self, token):
""" from xx.yy import zz -> from vv import zz
"""
if token.type == 'keyword' and token.text == 'import':
tokens = token.line_tokens
            # We use the fact that all IMPORT_MAPPING2 keys consist of two dotted names
if tokens[0].text == 'from' and len(tokens) == 5:
if tokens[3].text == 'import':
xxyy = tokens[1].text + '.' + tokens[2].text
name = tokens[4].text
if xxyy in self.IMPORT_MAPPING2:
for possible_module in self.IMPORT_MAPPING2[xxyy]:
if name in self.PY2MODULES[possible_module]:
tokens[1].fix = possible_module
tokens[1].end = tokens[2].end
tokens[2].start = tokens[2].end
break
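    # For example, "from urllib.request import urlopen" becomes
    # "from urllib2 import urlopen", because urlopen is listed under
    # urllib2 in PY2MODULES below.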
    # Map simple Python 3 import names to their Python 2 counterparts
IMPORT_MAPPING = {
"reprlib": "repr",
"winreg": "_winreg",
"configparser": "ConfigParser",
"copyreg": "copy_reg",
"queue": "Queue",
"socketserver": "SocketServer",
"_markupbase": "markupbase",
"test.support": "test.test_support",
"dbm.bsd": "dbhash",
"dbm.ndbm": "dbm",
"dbm.dumb": "dumbdbm",
"dbm.gnu": "gdbm",
"html.parser": "HTMLParser",
"html.entities": "htmlentitydefs",
"http.client": "httplib",
"http.cookies": "Cookie",
"http.cookiejar": "cookielib",
"urllib.robotparser": "robotparser",
"xmlrpc.client": "xmlrpclib",
"builtins": "__builtin__",
}
    # Map Python 3 import paths to a set of candidate Python 2 modules
IMPORT_MAPPING2 = {
'urllib.request': ('urllib2', 'urllib'),
'urllib.error': ('urllib2', 'urllib'),
'urllib.parse': ('urllib2', 'urllib', 'urlparse'),
'dbm.__init__': ('anydbm', 'whichdb'),
'http.server': ('CGIHTTPServer', 'SimpleHTTPServer', 'BaseHTTPServer'),
'xmlrpc.server': ('DocXMLRPCServer', 'SimpleXMLRPCServer'),
}
# This defines what names are in specific Python 2 modules
PY2MODULES = {
'urllib2' : (
'AbstractBasicAuthHandler', 'AbstractDigestAuthHandler',
'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler',
'FTPHandler', 'FileHandler', 'HTTPBasicAuthHandler',
'HTTPCookieProcessor', 'HTTPDefaultErrorHandler',
'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor',
'HTTPHandler', 'HTTPPasswordMgr',
'HTTPPasswordMgrWithDefaultRealm', 'HTTPRedirectHandler',
'HTTPSHandler', 'OpenerDirector', 'ProxyBasicAuthHandler',
'ProxyDigestAuthHandler', 'ProxyHandler', 'Request',
'StringIO', 'URLError', 'UnknownHandler', 'addinfourl',
'build_opener', 'install_opener', 'parse_http_list',
'parse_keqv_list', 'randombytes', 'request_host', 'urlopen'),
'urllib' : (
'ContentTooShortError', 'FancyURLopener', 'URLopener',
'basejoin', 'ftperrors', 'getproxies',
'getproxies_environment', 'localhost', 'pathname2url',
'quote', 'quote_plus', 'splitattr', 'splithost',
'splitnport', 'splitpasswd', 'splitport', 'splitquery',
'splittag', 'splittype', 'splituser', 'splitvalue',
'thishost', 'unquote', 'unquote_plus', 'unwrap',
'url2pathname', 'urlcleanup', 'urlencode', 'urlopen',
'urlretrieve',),
'urlparse' : (
'parse_qs', 'parse_qsl', 'urldefrag', 'urljoin',
'urlparse', 'urlsplit', 'urlunparse', 'urlunsplit'),
'dbm' : (
'ndbm', 'gnu', 'dumb'),
'anydbm' : (
'error', 'open'),
'whichdb' : (
'whichdb',),
'BaseHTTPServer' : (
'BaseHTTPRequestHandler', 'HTTPServer'),
'CGIHTTPServer' : (
'CGIHTTPRequestHandler',),
'SimpleHTTPServer' : (
'SimpleHTTPRequestHandler',),
'DocXMLRPCServer' : (
'DocCGIXMLRPCRequestHandler', 'DocXMLRPCRequestHandler',
'DocXMLRPCServer', 'ServerHTMLDoc', 'XMLRPCDocGenerator'),
}
if __name__ == '__main__':
# Awesome for testing
code = """
"""
t = LegacyPythonTranslator(code)
new_code = t.translate()
print(t.tokens)
print('---')
print(new_code)