In [1]:
import os
from hdfs.ext.kerberos import KerberosClient
import getpass
import uuid
import time

In [2]:
# Default namenode port, used when a caller does not supply `namenode_port`.
_NAMENODE_PORT = 50070

In [3]:
def get_client(namenode_url: str, namenode_port: int = None):
    """
    Build a KerberosClient pointed at the given namenode.

    Parameters
    ----------
    namenode_url: The url of the namenode (the port is appended by this function)
    namenode_port: (optional) The port that the namenode service is running on. Defaults to
        value of module-level variable _NAMENODE_PORT.

    Returns
    -------
    KerberosClient: A client constructed with the url '{namenode_url}:{port}'
    """
    # Only fall back to the module default when no port was supplied. The previous
    # chained assignment (`port = namenode_port = _NAMENODE_PORT`) overwrote the
    # caller's value, so a custom port was silently ignored.
    port = _NAMENODE_PORT if namenode_port is None else namenode_port
    return KerberosClient('%s:%s' % (namenode_url, port))


In [4]:
def upload(local_file: str,
           namenode_url: str,
           namenode_port: int = None,
           hdfs_dir: str = None,) -> str:
    """
    Upload a local file to HDFS and return the HDFS directory it was uploaded to.

    Parameters
    ----------
    local_file: The local file to upload
    namenode_url: The url of the namenode
    namenode_port: (optional) The port that the namenode service is running on. Defaults to
        value of module-level variable _NAMENODE_PORT.
    hdfs_dir: (optional) The folder (or full path) to upload `local_file` to. If none is provided,
        then the file will be uploaded to the following directory:
        /tmp/livy_{current user name}_{First six characters of uuid4}_{current time in seconds}/

    Returns
    -------
    str: The directory where `local_file` was uploaded so you can use it to upload the rest of your
        files needed for your Spark job
    """
    port = namenode_port or _NAMENODE_PORT
    client = get_client(namenode_url, port)

    if hdfs_dir is None:
        # User name + short uuid + timestamp keeps separate uploads in separate directories.
        hdfs_dir = f'/tmp/livy_{getpass.getuser()}_{str(uuid.uuid4())[:6]}_{round(time.time())}/'

    # Create the target directory before uploading into it.
    client.makedirs(hdfs_dir)

    # The upload's return value is not needed; callers get the directory back instead.
    client.upload(hdfs_dir, local_file, overwrite=True)

    return hdfs_dir
    
    
    


---

Do some manual QA on the HDFS client functions defined above (`get_client` and `upload`).

In [None]:
# Placeholder: set this to the url of your namenode before running the QA cells below.
namenode_url = None

In [114]:
# Namenode address without a port; get_client appends the port when building the client url.
namenode_url = 'http://ip-172-31-20-241.ec2.internal'

In [115]:
# No hdfs_dir is passed, so upload() generates a /tmp/livy_... directory and returns it.
upload('../QA/pi.py', namenode_url)

http://ip-172-31-20-241.ec2.internal:50070
/tmp/livy_ec2-user_dfe804_1545062025/
/tmp/livy_ec2-user_dfe804_1545062025/pi.py


'/tmp/livy_ec2-user_dfe804_1545062025/'

In [125]:
client.delete('/tmp/livy_ec2-user_dfe804_1545062025/', recursive=True)

True

In [116]:
client = get_client(namenode_url)

http://ip-172-31-20-241.ec2.internal:50070


In [118]:
client.list('/tmp/livy_ec2-user_dfe804_1545062025/')

['pi.py']

In [57]:
# IPython `??` introspection: show the source of the client's _get_home_directory handler.
client._get_home_directory??

[0;31mSignature:[0m [0mclient[0m[0;34m.[0m[0m_get_home_directory[0m[0;34m([0m[0mhdfs_path[0m[0;34m,[0m [0mdata[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mstrict[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0;34m**[0m[0mparams[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Cf. http://hadoop.apache.org/docs/r1.0.4/webhdfs.html#GETHOMEDIRECTORY
[0;31mSource:[0m   
    [0;32mdef[0m [0mapi_handler[0m[0;34m([0m[0mclient[0m[0;34m,[0m [0mhdfs_path[0m[0;34m,[0m [0mdata[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mstrict[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0;34m**[0m[0mparams[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m      [0;34m"""Wrapper function."""[0m[0;34m[0m
[0;34m[0m      [0mparams[0m[0;34m[[0m[0;34m'op'[0m[0;34m][0m [0;34m=[0m [0moperation[0m[0;34m[0m
[0;34m[0m      [0;32mif[0m [0mclient[0m[0;34m.[0m[0m_proxy[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m
[0;

In [26]:
client.upload?a

[0;31mSignature:[0m
[0mclient[0m[0;34m.[0m[0mupload[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m[[0m[0;34m'hdfs_path'[0m[0;34m,[0m [0;34m'local_path'[0m[0;34m,[0m [0;34m'n_threads=1'[0m[0;34m,[0m [0;34m'temp_dir=None'[0m[0;34m,[0m [0;34m'chunk_size=65536'[0m[0;34m,[0m [0;34m'progress=None'[0m[0;34m,[0m [0;34m'cleanup=True'[0m[0;34m,[0m [0;34m'**kwargs'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Upload a file or directory to HDFS.

:param hdfs_path: Target HDFS path. If it already exists and is a
  directory, files will be uploaded inside.
:param local_path: Local path to file or folder. If a folder, all the files
  inside of it will be uploaded (note that this implies that folders empty
  of files will not be created remotely).
:param n_threads: Number of threads to use for parallelization. A value of
  `0` (or negative) uses as many threads as there are files.
:param temp_dir:

In [130]:
import sys
# Put the parent directory on the import path so `livy_submit` can be imported from this notebook.
sys.path.insert(0, '../')

In [131]:
from livy_submit import hdfs_api

In [132]:
# For the hdfs_api run the port is included in the url itself.
namenode_url = 'http://ip-172-31-20-241.ec2.internal:50070'

In [133]:
# Local archive to upload through hdfs_api.upload.
local_file = '../test/data/pi_runner.zip'

In [142]:
# HDFS directory passed to hdfs_api.upload as the upload target.
hdfs_dirname = '/user/edill/livy-submit-files/19d579_1545492416'

In [143]:
# NOTE(review): the third positional argument here is the HDFS directory, whereas the
# notebook-level upload() above takes namenode_port third — confirm hdfs_api.upload's signature.
hdfs_api.upload(local_file, namenode_url, hdfs_dirname)

file uploaded to /user/edill/livy-submit-files/19d579_1545492416/pi_runner.zip


'/user/edill/livy-submit-files/19d579_1545492416/pi_runner.zip'