# Import

In [1]:
import logging
import os
import shutil
import sys
from typing import List

import boto3
from botocore.exceptions import ClientError

## Add configuration file

In [2]:
sys.path.append("/home/jovyan/core/config/")
sys.path.append("/home/jovyan/core/util/")

In [3]:
from ALL import config 
from util import *

In [36]:
s3_bucket_name =  "text-classification-nakayama-bucket"
object_name = "RawData/reuters/"
file_path = "./test.txt"
root_path = "/home/jovyan"

In [5]:
class S3Manager:
    def __init__(self):
        self.files = {data_type: {} for data_type in ["upload", "download"]}

    def upload(
        self, file_path, object_name=None, bucket=s3_bucket_name, save_file=False
    ):
        """Upload a file to an S3 bucket

        :param file_path: File to upload
        :param object_name: S3 object name. If not specified then file_path is used
        :param bucket: Bucket to upload to. If not specified then default s3_bucket_name is used
        :param save_file: determine save file or not. If not specified then the uploaded file will not be saved.
        :return: file_name if file was uploaded, else False
        """

        def _upload_dir(_dir_path, _object_name, _bucket):
            for _root, _, _files in os.walk(_dir_path, topdown=False):
                if _files:
                    _root_path = _root.replace(_dir_path, _object_name)
                    _file_paths = [os.path.join(_root, _file) for _file in _files]
                    _objects = [os.path.join(_root_path, _file) for _file in _files]
                    for _object, _file_path in zip(_objects, _file_paths):
                        _s3_client.upload_file(_file_path, bucket, _object)

        # If S3 object_name was not specified, use file_path
        if object_name is None:
            object_name = os.path.basename(file_path)

        # Upload the file
        _s3_client = boto3.client("s3")
        try:
            if os.path.isfile(file_path):
                _response = _s3_client.upload_file(file_path, bucket, object_name)
            else:
                _upload_dir(file_path, object_name, bucket)
        except ClientError as e:
            logging.error(e)
            return False
        self.files["upload"][file_path] = save_file
        return file_path

    def download(
        self,
        object_name,
        file_path=None,
        s3_bucket_name=s3_bucket_name,
        save_file=False,
    ):
        """Download a file to an S3 bucket

        :param bucket: Bucket to download to
        :param file_path: File to download
        :param object_name: S3 object name. If not specified then file_path is used
        :param save_file: determine save file or not. If not specified then the uploaded file will not be saved.
        :return: True if file was uploaded, else False
        """
        _s3 = boto3.client("s3")
        _objects = ls(s3_bucket_name, object_name, recursive=True)
        print(_objects)
        
        try:
            _objects = ls(s3_bucket_name, object_name, recursive=True)
            print(_objects)
            if file_path is None:
                for _object in _objects:
                    _file_path = make_filepath(f"{root_path}/temporary/{_object}")
                    _s3.download_file(s3_bucket_name, _object, _file_path)
            else:
                for _object in _objects:
                    _object = _object.replace(object_name, file_path)
                    _file_path = make_filepath(_object)
                    _s3.download_file(s3_bucket_name, _object, _file_path)
        except ClientError as e:
            logging.error(e)
            return False
        self.files["download"][file_path] = save_file
        return file_path

    def ls(bucket: str, prefix: str, recursive: bool = False) -> List[str]:
        """S3上のファイルリスト取得

        Args:
            bucket (str): バケット名
            prefix (str): バケット以降のパス
            recursive (bool): 再帰的にパスを取得するかどうか

        """
        paths: List[str] = []
        paths = __get_all_keys(bucket, prefix, recursive=recursive)
        return paths

    def __get_all_keys(
        bucket: str,
        prefix: str,
        keys: List = None,
        marker: str = "",
        recursive: bool = False,
    ) -> List[str]:
        """指定した prefix のすべての key の配列を返す

        Args:
            bucket (str): バケット名
            prefix (str): バケット以降のパス
            keys (List): 全パス取得用に用いる
            marker (str): 全パス取得用に用いる
            recursive (bool): 再帰的にパスを取得するかどうか

        """
        s3 = boto3.client("s3")
        if recursive:
            response = s3.list_objects(Bucket=bucket, Prefix=prefix, Marker=marker)
        else:
            response = s3.list_objects(
                Bucket=bucket, Prefix=prefix, Marker=marker, Delimiter="/"
            )

        # keyがNoneのときは初期化
        if keys is None:
            keys = []

        if "CommonPrefixes" in response:
            # Delimiterが'/'のときはフォルダがKeyに含まれない
            keys.extend([content["Prefix"] for content in response["CommonPrefixes"]])
        if "Contents" in response:  # 該当する key がないと response に 'Contents' が含まれない
            keys.extend([content["Key"] for content in response["Contents"]])
            if "IsTruncated" in response:
                return __get_all_keys(
                    bucket=bucket,
                    prefix=prefix,
                    keys=keys,
                    marker=keys[-1],
                    recursive=recursive,
                )
        return keys

    def delete_local_all(self):
        def _delete_file_or_folder(_path):
            if os.path.exists(_path):
                if os.path.isfile(_path):
                    os.remove(_path)
                else:
                    shutil.rmtree(_path)
                return True
            else:
                return False

        for _type, _file_dict in self.files.items():
            if _file_dict:
                for _file, _save in _file_dict.items():
                    if not _save:
                        _delete_file_or_folder(_file)
                        print(_file)

In [53]:
def download(
        self,
        object_name,
        file_path=None,
        s3_bucket_name=s3_bucket_name,
        save_file=False,
    ):
        """Download a file to an S3 bucket

        :param bucket: Bucket to download to
        :param file_path: File to download
        :param object_name: S3 object name. If not specified then file_path is used
        :param save_file: determine save file or not. If not specified then the uploaded file will not be saved.
        :return: True if file was uploaded, else False
        """
        _s3 = boto3.client("s3")
        _objects = ls(s3_bucket_name, object_name, recursive=True)
        print(_objects)
        
        try:
            _objects = ls(s3_bucket_name, object_name, recursive=True)
            print(_objects)
            if file_path is None:
                for _object in _objects:
                    _file_path = make_filepath(f"{root_path}/temporary/{_object}")
                    _s3.download_file(s3_bucket_name, _object, _file_path)
            else:
                for _object in _objects:
                    _object = _object.replace(object_name, file_path)
                    _file_path = make_filepath(_object)
                    _s3.download_file(s3_bucket_name, _object, _file_path)
        except ClientError as e:
            logging.error(e)
            return False
        self.files["download"][file_path] = save_file
        return file_path

In [54]:
S3 = S3Manager()

In [55]:
download(save_file=False, object_name=object_name)

['RawData/reuters/.DS_Store', 'RawData/reuters/README', 'RawData/reuters/cats.txt', 'RawData/reuters/stopwords', 'RawData/reuters/test/14826', 'RawData/reuters/test/14828', 'RawData/reuters/test/14829', 'RawData/reuters/test/14832', 'RawData/reuters/test/14833', 'RawData/reuters/test/14839', 'RawData/reuters/test/14840', 'RawData/reuters/test/14841', 'RawData/reuters/test/14842', 'RawData/reuters/test/14843', 'RawData/reuters/test/14844', 'RawData/reuters/test/14849', 'RawData/reuters/test/14852', 'RawData/reuters/test/14854', 'RawData/reuters/test/14858', 'RawData/reuters/test/14859', 'RawData/reuters/test/14860', 'RawData/reuters/test/14861', 'RawData/reuters/test/14862', 'RawData/reuters/test/14863', 'RawData/reuters/test/14865', 'RawData/reuters/test/14867', 'RawData/reuters/test/14872', 'RawData/reuters/test/14873', 'RawData/reuters/test/14875', 'RawData/reuters/test/14876', 'RawData/reuters/test/14877', 'RawData/reuters/test/14881', 'RawData/reuters/test/14882', 'RawData/reuters/

['RawData/reuters/.DS_Store', 'RawData/reuters/README', 'RawData/reuters/cats.txt', 'RawData/reuters/stopwords', 'RawData/reuters/test/14826', 'RawData/reuters/test/14828', 'RawData/reuters/test/14829', 'RawData/reuters/test/14832', 'RawData/reuters/test/14833', 'RawData/reuters/test/14839', 'RawData/reuters/test/14840', 'RawData/reuters/test/14841', 'RawData/reuters/test/14842', 'RawData/reuters/test/14843', 'RawData/reuters/test/14844', 'RawData/reuters/test/14849', 'RawData/reuters/test/14852', 'RawData/reuters/test/14854', 'RawData/reuters/test/14858', 'RawData/reuters/test/14859', 'RawData/reuters/test/14860', 'RawData/reuters/test/14861', 'RawData/reuters/test/14862', 'RawData/reuters/test/14863', 'RawData/reuters/test/14865', 'RawData/reuters/test/14867', 'RawData/reuters/test/14872', 'RawData/reuters/test/14873', 'RawData/reuters/test/14875', 'RawData/reuters/test/14876', 'RawData/reuters/test/14877', 'RawData/reuters/test/14881', 'RawData/reuters/test/14882', 'RawData/reuters/

KeyboardInterrupt: 

In [40]:
S3.upload(save_file=False, file_path=file_path)

FileNotFoundError: [Errno 2] No such file or directory: './test.txt'

In [10]:
S3.delete_local_all()

/home/jovyan/temporary/RawData/reuters/cats.txt


# S3のフォルダ内のオブジェクトを取得する関数

In [11]:
def ls(bucket: str, prefix: str, recursive: bool = False) -> List[str]:
    """S3上のファイルリスト取得

    Args:
        bucket (str): バケット名
        prefix (str): バケット以降のパス
        recursive (bool): 再帰的にパスを取得するかどうか

    """
    paths: List[str] = []
    paths = __get_all_keys(
        bucket, prefix, recursive=recursive)
    return paths


def __get_all_keys(bucket: str, prefix: str, keys: List = None, marker: str = '', recursive: bool = False) -> List[str]:
    """指定した prefix のすべての key の配列を返す

    Args:
        bucket (str): バケット名
        prefix (str): バケット以降のパス
        keys (List): 全パス取得用に用いる
        marker (str): 全パス取得用に用いる
        recursive (bool): 再帰的にパスを取得するかどうか

    """
    s3 = boto3.client('s3')
    if recursive:
        response = s3.list_objects(
            Bucket=bucket, Prefix=prefix, Marker=marker)
    else:
        response = s3.list_objects(
            Bucket=bucket, Prefix=prefix, Marker=marker, Delimiter='/')

    # keyがNoneのときは初期化
    if keys is None:
        keys = []

    if 'CommonPrefixes' in response:
        # Delimiterが'/'のときはフォルダがKeyに含まれない
        keys.extend([content['Prefix']
                    for content in response['CommonPrefixes']])
    if 'Contents' in response:  # 該当する key がないと response に 'Contents' が含まれない
        keys.extend([content['Key'] for content in response['Contents']])
        if 'IsTruncated' in response:
            return __get_all_keys(bucket=bucket, prefix=prefix, keys=keys, marker=keys[-1], recursive=recursive)
    return keys

In [20]:
keys = ls(s3_bucket_name, 'Clustering/AgNews/LDA/', recursive=False)
print(keys) # ['folder1/', 'folder1/sample.txt', 'folder1/html/']

['Clustering/AgNews/LDA/model/', 'Clustering/AgNews/LDA/pred/', 'Clustering/AgNews/LDA/prob/', 'Clustering/AgNews/LDA/corpus.sav', 'Clustering/AgNews/LDA/dictionary.sav', 'Clustering/AgNews/LDA/model/', 'Clustering/AgNews/LDA/pred/', 'Clustering/AgNews/LDA/prob/']


In [21]:
keys = ls(s3_bucket_name, 'Clustering/AgNews/LDA/', recursive=True)
print(keys) # ['folder1/', 'folder1/sample.txt', 'folder1/html/']

['Clustering/AgNews/LDA/corpus.sav', 'Clustering/AgNews/LDA/dictionary.sav', 'Clustering/AgNews/LDA/model/0', 'Clustering/AgNews/LDA/model/0.expElogbeta.npy', 'Clustering/AgNews/LDA/model/0.id2word', 'Clustering/AgNews/LDA/model/0.state', 'Clustering/AgNews/LDA/model/1', 'Clustering/AgNews/LDA/model/1.expElogbeta.npy', 'Clustering/AgNews/LDA/model/1.id2word', 'Clustering/AgNews/LDA/model/1.state', 'Clustering/AgNews/LDA/model/10', 'Clustering/AgNews/LDA/model/10.expElogbeta.npy', 'Clustering/AgNews/LDA/model/10.id2word', 'Clustering/AgNews/LDA/model/10.state', 'Clustering/AgNews/LDA/model/11', 'Clustering/AgNews/LDA/model/11.expElogbeta.npy', 'Clustering/AgNews/LDA/model/11.id2word', 'Clustering/AgNews/LDA/model/11.state', 'Clustering/AgNews/LDA/model/12', 'Clustering/AgNews/LDA/model/12.expElogbeta.npy', 'Clustering/AgNews/LDA/model/12.id2word', 'Clustering/AgNews/LDA/model/12.state', 'Clustering/AgNews/LDA/model/13', 'Clustering/AgNews/LDA/model/13.expElogbeta.npy', 'Clustering/AgNew

In [60]:
def func():
    for key in keys:
        yield key

In [65]:
for a in  func():
    print(a)

Clustering/AgNews/LDA/corpus.sav
Clustering/AgNews/LDA/dictionary.sav
Clustering/AgNews/LDA/model/0
Clustering/AgNews/LDA/model/0.expElogbeta.npy
Clustering/AgNews/LDA/model/0.id2word
Clustering/AgNews/LDA/model/0.state
Clustering/AgNews/LDA/model/1
Clustering/AgNews/LDA/model/1.expElogbeta.npy
Clustering/AgNews/LDA/model/1.id2word
Clustering/AgNews/LDA/model/1.state
Clustering/AgNews/LDA/model/10
Clustering/AgNews/LDA/model/10.expElogbeta.npy
Clustering/AgNews/LDA/model/10.id2word
Clustering/AgNews/LDA/model/10.state
Clustering/AgNews/LDA/model/11
Clustering/AgNews/LDA/model/11.expElogbeta.npy
Clustering/AgNews/LDA/model/11.id2word
Clustering/AgNews/LDA/model/11.state
Clustering/AgNews/LDA/model/12
Clustering/AgNews/LDA/model/12.expElogbeta.npy
Clustering/AgNews/LDA/model/12.id2word
Clustering/AgNews/LDA/model/12.state
Clustering/AgNews/LDA/model/13
Clustering/AgNews/LDA/model/13.expElogbeta.npy
Clustering/AgNews/LDA/model/13.id2word
Clustering/AgNews/LDA/model/13.state
Clustering/AgN