## Convert Databricks Exports (.dbc) to Python Scripts (.py) & IPython Notebooks (.ipynb)

To download a .dbc file from Databricks, click the down arrow next to the root folder, hover the mouse over "Export", and click "DBC Archive". A single file can be exported the same way: click the down arrow next to the file, hover over "Export", and click "DBC Archive".

In [9]:
# Path to the .dbc archive to convert. Replace the placeholder below with the
# actual location of your file, using '/' as the path separator.
fileLocation = '.dbc'

In [10]:
# Extract dbc file
import os
import shutil
import zipfile

# Cleanup from prior run: drop whatever an earlier extraction left behind.
# An OSError (for example, the directory not existing) is ignored.
try:
    shutil.rmtree('tmp_dbc')
except OSError:
    pass

# Create the extraction directory; an OSError from mkdir is ignored.
try:
    os.mkdir('tmp_dbc')
except OSError:
    pass

# The .dbc file is opened as a zip archive and unpacked into tmp_dbc.
with zipfile.ZipFile(fileLocation, 'r') as archive:
    archive.extractall('tmp_dbc')

# Show the top-level entries that were extracted.
print('*** Contents from the .dbc file ***\n')
extractedEntries = os.listdir('tmp_dbc')
print(extractedEntries)

*** Contents from the .dbc file (usually one file or a directory) ***

['Spark Essentials']


In [14]:
# Find files to parse
import fnmatch

# Walk the extraction directory and collect a (directory, file name) pair for
# every file whose name matches '*.python', in os.walk order.
filesToParse = [
    (root, fileName)
    for root, dirNames, fileNames in os.walk('tmp_dbc')
    for fileName in fnmatch.filter(fileNames, '*.python')
]

def getIpynbName(path, fileName):
    """Return the output notebook path for an exported file.

    The first two components of the normalized ``path`` are dropped (for the
    files found under ``tmp_dbc`` these are presumably the extraction
    directory and the archive's top-level folder); whatever remains becomes
    the output directory, or ``'.'`` when nothing remains. The file name
    keeps its stem and gets the suffix ``_export.ipynb`` in place of its
    extension.
    """
    components = os.path.normpath(path).split(os.sep)
    remaining = components[2:]
    if remaining:
        targetDir = os.path.join(*remaining)
    else:
        targetDir = '.'
    stem, _extension = os.path.splitext(fileName)
    return os.path.join(targetDir, stem + '_export.ipynb')

# Preview the notebook paths that the conversion step will write.
print("*** Files to be created (relative to the current working directory) ***")
print("(Warning: files will be overwritten!)\n")
for entry in filesToParse:
    print(getIpynbName(*entry))

*** Files to be created (relative to your current working directory) ***

.\1.6 Introduction to Notebooks_export.ipynb
.\2.2 Working with Text Files_export.ipynb
.\2.3 Loading CSV Data into DataFrames_export.ipynb
.\2.4 Exploring Data in DataFrames_export.ipynb
.\2.5 Saving Your Results_export.ipynb
.\4.2 Preparing Data for Machine Learning_export.ipynb
.\4.3 Building a Linear Regression Model_export.ipynb
.\4.4 Evaluating a Linear Regression Model_export.ipynb
.\4.5 Visualizing a Linear Regression Model_export.ipynb
.\5.2 Setting up Streaming Context_export.ipynb
.\5.3 Querying Streaming Data_export.ipynb


In [35]:
# Create the IPython Notebooks
# Convert .python files to .ipynb files
import codecs
import nbformat
from nbformat.v3.nbpy import PyReader
import json
import re

# First lines of the intermediate file written by convertToIpynb: an encoding
# declaration followed by an nbformat 3.0 marker.
_header = u'# -*- coding: utf-8 -*-\n# <nbformat>3.0</nbformat>\n'
# Marker written before the content of each markdown cell.
_markdownCell = u'\n\n# <markdowncell>\n\n'
# Marker written before the content of each code cell.
_codeCell = u'\n\n# <codecell>\n\n'
# Source of the first code cell of every converted notebook. It defines
# displayHTML on top of IPython's display/HTML and a display() that does
# nothing.
_firstCell = u"""# Increase compatibility with Databricks
from IPython.display import display as idisplay, HTML
displayHTML = lambda x: idisplay(HTML(x))
def display(*args, **kargs): pass"""

def convertToIpynb(fileToParse):
    """Convert one exported Databricks ``.python`` file into an ``.ipynb`` file.

    The export is read as JSON. Its ``commands`` are ordered by ``position``
    and written to an intermediate ``tmp_ipynb.py`` file, which ``PyReader``
    then parses. The resulting notebook is converted with
    ``nbformat.convert(nb, 4.0)`` and written to the path returned by
    ``getIpynbName``.

    Args:
        fileToParse: ``(directory, fileName)`` pair locating the exported
            file; the two parts are joined with ``os.path.join``.

    Side effects:
        Creates the output directory when it does not exist, writes the
        notebook, prints ``Created: <path>``, and creates then removes
        ``tmp_ipynb.py`` in the current working directory. The intermediate
        file is removed even when writing or parsing it fails.
    """
    with codecs.open(os.path.join(*fileToParse), encoding="utf-8") as fp:
        jsonData = json.load(fp)

    # Sorting the (position, command) tuples orders the cells by position.
    commandList = sorted(
        (x['position'], x['command']) for x in jsonData['commands'])

    intermediateName = 'tmp_ipynb.py'
    try:
        with codecs.open(intermediateName, 'w', encoding="utf-8") as fp:
            fp.write(_header)
            fp.write(_codeCell)
            fp.write(_firstCell)

            for position, command in commandList:
                if re.match(r'\s*%md', command):
                    # Markdown command: strip the %md marker, then rewrite
                    # %( ... )% as $ ... $ and %[ ... ]% as $$ ... $$
                    # (presumably Databricks math delimiters; verify).
                    command = re.sub(r'^\s*%md', '', command,
                                     flags=re.MULTILINE)
                    command = re.sub(r'(%\(|\)%)', '$', command)
                    command = re.sub(r'(%\[|\]%)', '$$', command)

                    fp.write(_markdownCell)
                    # Every line of a markdown cell is written as a comment.
                    command = '# ' + '\n# '.join(command.split('\n'))
                else:
                    # Code command: any line assigning baseDir is replaced
                    # with baseDir = 'data'.
                    command = re.sub(r'^\s*baseDir\s*=.*$', 'baseDir = \'data\'',
                                     command, flags=re.MULTILINE)
                    fp.write(_codeCell)

                fp.write(command)

        with codecs.open(intermediateName, 'r', encoding="utf-8") as intermediate:
            nb = PyReader().read(intermediate)
    finally:
        # Clean up on failure as well, so an aborted conversion does not
        # leave the intermediate file behind.
        if os.path.exists(intermediateName):
            os.remove(intermediateName)

    outputName = getIpynbName(fileToParse[0], fileToParse[1])
    baseDirectory = os.path.split(outputName)[0]

    if not os.path.isdir(baseDirectory):
        os.makedirs(baseDirectory)

    with codecs.open(outputName, 'w', encoding="utf-8") as output:
        nbformat.write(nbformat.convert(nb, 4.0), output)

    # Report only after the output file has been written and closed.
    print('Created: {0}'.format(outputName))

# Convert every file collected in filesToParse; each call prints the path of
# the notebook it created.
for fileToParse in filesToParse:
    convertToIpynb(fileToParse)

Created: .\1.6 Introduction to Notebooks_export.ipynb
Created: .\2.2 Working with Text Files_export.ipynb
Created: .\2.3 Loading CSV Data into DataFrames_export.ipynb
Created: .\2.4 Exploring Data in DataFrames_export.ipynb
Created: .\2.5 Saving Your Results_export.ipynb
Created: .\4.2 Preparing Data for Machine Learning_export.ipynb
Created: .\4.3 Building a Linear Regression Model_export.ipynb
Created: .\4.4 Evaluating a Linear Regression Model_export.ipynb
Created: .\4.5 Visualizing a Linear Regression Model_export.ipynb
Created: .\5.2 Setting up Streaming Context_export.ipynb
Created: .\5.3 Querying Streaming Data_export.ipynb


In [36]:
# Cleanup
import shutil

# Remove the temporary extraction directory. An OSError (for example, the
# directory no longer existing) is ignored.
try:
    shutil.rmtree('tmp_dbc')
except OSError:
    pass