In [None]:
import os
import boto3
import pandas as pd
import smart_open
import json

In [None]:
# Resource-level boto3 handle for S3; the bucket object used below is created from it.
s3 = boto3.resource('s3')

## Show all files in bucket

In [None]:
my_bucket = s3.Bucket('legalthings-datalake')

# Ask S3 for keys under 'mongo/' only, instead of paging through every object
# in the bucket and discarding the non-matching ones locally.
for file in my_bucket.objects.filter(Prefix='mongo/'):
    print(file.key, file.size)

## Read files directly from bucket 

In [None]:
# Read inside a with-block so the S3 stream is closed once pandas has consumed it.
with smart_open.open('s3://legalthings-datalake/mongo/company_sbi.csv') as company_sbi_stream:
    company_sbi_df = pd.read_csv(company_sbi_stream)

In [None]:
# Show the first rows of the company SBI table.
company_sbi_df.head()

In [None]:
# Read inside a with-block so the S3 stream is closed once pandas has consumed it.
with smart_open.open('s3://legalthings-datalake/mongo/users.json') as users_stream:
    users_df = pd.read_json(users_stream, encoding="utf8", lines = True)

In [None]:
# Show the last rows of the users table.
users_df.tail()

## Problems with one-line JSON files
* ValueError: Unexpected character found when decoding array value (2)

In [None]:
# licenses_df = pd.read_json(smart_open.open('s3://legalthings-datalake/mongo/licenses.json'), encoding="utf8", lines = True)

### Potential fix: reformat json

In [None]:
from risk_model.storage import make_folder

In [None]:
def make_readable_json(
    unprocessed_filename: str ='../data/mongo/licenses.json',
    preprocessed_dir: str ='../data/preprocess/licenses/',
    split_variabele: str = '{"_id":',
    chunk_size = 4096
):
    """Rewrite a single-line JSON dump as one JSON document per line.

    The input is streamed with ``each_chunk`` and cut at every occurrence of
    ``split_variabele``. The marker is put back in front of each piece and the
    result is parsed with ``json.loads``. Pieces that parse are written, one
    per line, to a file with the same base name inside ``preprocessed_dir``;
    pieces that do not parse are reported and skipped.

    Args:
        unprocessed_filename: Path of the one-line JSON file to read.
        preprocessed_dir: Output directory; passed to ``make_folder`` first.
        split_variabele: Text that marks the start of every document.
        chunk_size: Number of characters read from the input per step.

    Note:
        A document that itself contains ``split_variabele`` (for example in a
        nested sub-document) is cut in the middle, so its pieces are unlikely
        to parse and will be reported as failures.
    """
    make_folder(preprocessed_dir)
    processed_filename = os.path.join(
        preprocessed_dir, os.path.basename(unprocessed_filename)
    )

    failed = 0
    # Explicit UTF-8 so the result does not depend on the platform default;
    # the notebook reads the output back with encoding="utf8".
    with open(unprocessed_filename, encoding='utf-8') as f_read, \
            open(processed_filename, 'w', encoding='utf-8') as f_write:
        pieces = each_chunk(f_read, chunk_size, split_variabele)
        for index, chunk in enumerate(pieces):
            if not chunk:
                # Nothing precedes the first marker (or the input is empty).
                continue
            try:
                document = json.loads(split_variabele + chunk)
            except json.JSONDecodeError as error:
                failed += 1
                print('Fail to parse piece {}: {}'.format(index, error))
                continue
            f_write.write('{}\n'.format(json.dumps(document)))

    if failed:
        print('{} piece(s) could not be parsed and were skipped'.format(failed))

def each_chunk(stream, chunk_size, separator):
    """Yield the pieces of ``stream`` that lie between occurrences of ``separator``.

    The stream is read ``chunk_size`` characters at a time, so only the
    unfinished piece plus the latest read is buffered. The separator itself is
    not part of the yielded pieces. The text before the first separator and the
    text after the last one are both yielded, even when they are empty.

    Args:
        stream: Object whose ``read(size)`` returns ``str`` and '' at end of input.
        chunk_size: Number of characters requested per ``read`` call.
        separator: Non-empty string that delimits the pieces.

    Yields:
        str: The next piece of text, without the separator.

    Raises:
        ValueError: If ``separator`` is empty (raised when iteration starts).
    """
    if not separator:
        # Previously the empty-separator error from str.split was swallowed,
        # which silently buffered the whole stream and yielded it as one piece.
        raise ValueError('separator must be a non-empty string')

    # A separator may straddle two reads; at most len(separator) - 1 of its
    # characters can already sit at the end of the buffer.
    overlap = len(separator) - 1
    buffer = ''
    search_from = 0  # everything before this index is known to hold no match

    while True:  # until EOF
        chunk = stream.read(chunk_size)
        if not chunk:  # EOF: whatever is left is the final piece
            yield buffer
            break

        buffer += chunk
        consumed = 0
        hit = buffer.find(separator, search_from)
        while hit != -1:
            yield buffer[consumed:hit]
            consumed = hit + len(separator)
            hit = buffer.find(separator, consumed)

        # Keep only the unfinished tail and skip the already-scanned part on
        # the next round, so long pieces are not rescanned after every read.
        buffer = buffer[consumed:]
        search_from = max(0, len(buffer) - overlap)

### Fixed: licenses.json --> parse OK

In [None]:
# Reformat the licenses dump; the result is written to ../data/preprocess/licenses/.
make_readable_json(
    unprocessed_filename ='../data/mongo/licenses.json',
    preprocessed_dir ='../data/preprocess/licenses/',
    split_variabele = '{"_id":'
)

In [None]:
# Read inside a with-block so the file handle is closed once pandas has consumed it.
with smart_open.open('../data/preprocess/licenses/licenses.json') as licenses_stream:
    licenses_df = pd.read_json(licenses_stream, encoding="utf8", lines = True)

In [None]:
# Show the first rows of the reformatted licenses data.
licenses_df.head()

### Biggest file: incorporation-processes.json

#### Step 1: Parse into readable JSON format

In [None]:
# Reformat the incorporation-processes dump; the result is written to
# ../data/preprocess/incorporation_processes/.
make_readable_json(
    unprocessed_filename ='../data/mongo/incorporation-processes.json',
    preprocessed_dir ='../data/preprocess/incorporation_processes/',
    split_variabele = '{"_id":'
)

#### Number of lines in the file --> too many to load into memory at once
* !wc -l ../data/preprocess/incorporation_processes/incorporation-processes.json

In [None]:
# Directory that holds the reformatted incorporation-processes file.
base_path = '../data/preprocess/incorporation_processes'

In [None]:
# Count inside a with-block so the file handle is closed afterwards;
# the bare name on the last line keeps the count as the cell output.
with open(os.path.join(base_path, 'incorporation-processes.json')) as processed_file:
    processed_line_count = sum(1 for line in processed_file)
processed_line_count

In [None]:
# Print the first two lines of the reformatted file.
!head -2 ../data/preprocess/incorporation_processes/incorporation-processes.json

#### Step 2: Split it into smaller files that each fit in local memory

In [None]:
from itertools import chain, islice

In [None]:
def chunks(iterable, n):
    """Group ``iterable`` into consecutive runs of up to ``n`` items.

    chunks(ABCDE, 2) => AB CD E

    Each run is a lazy iterator over the shared source, so a run has to be
    consumed before the next one is requested.
    """
    source = iter(iterable)
    # Pull one item eagerly so an exhausted source simply ends the loop, then
    # hand out that item together with a view on the next n - 1 items.
    for first in source:
        yield chain((first,), islice(source, n - 1))

In [None]:
def split_file_into_multiple_files(
    directory = '../data/preprocess/incorporation_processes',
    file_to_split = 'incorporation-processes.json',
    new_sub_file_name = 'processes',
    n_files = 35,
    file_type = 'json'
):
    """Split a line-based file into ``n_files`` consecutive parts.

    The parts are written next to the source as
    ``<new_sub_file_name>_<index>.<file_type>`` with ``index`` starting at 0.
    Lines are spread as evenly as possible: part sizes differ by at most one
    line, and the larger parts come first. When the source has fewer lines
    than ``n_files``, one part per line is written and no empty files are
    created.

    Args:
        directory: Folder containing the source file; parts are written here too.
        file_to_split: Name of the source file inside ``directory``.
        new_sub_file_name: Prefix of the part file names.
        n_files: Number of parts to produce; must be at least 1.
        file_type: Extension of the part files, without the dot.

    Raises:
        ValueError: If ``n_files`` is smaller than 1.
    """
    if n_files < 1:
        raise ValueError('n_files must be at least 1')

    source_path = os.path.join(directory, file_to_split)
    with open(source_path) as source:
        num_lines = sum(1 for _ in source)

    # Exact distribution instead of round(num_lines / n_files), which could
    # produce more or fewer parts than requested, or a part size of zero.
    base_size, extra = divmod(num_lines, n_files)

    with open(source_path) as source:
        for index in range(n_files):
            part_size = base_size + (1 if index < extra else 0)
            if part_size == 0:
                break  # fewer lines than parts: nothing left to write
            part_path = os.path.join(
                directory, '{}_{}.{}'.format(new_sub_file_name, index, file_type)
            )
            with open(part_path, 'w') as part:
                part.writelines(islice(source, part_size))

In [None]:
# Use the helper defined above instead of repeating its body inline.
n_files = 35
split_file_into_multiple_files(
    directory='../data/preprocess/incorporation_processes',
    file_to_split='incorporation-processes.json',
    new_sub_file_name='processes',
    n_files=n_files,
    file_type='json',
)

In [None]:
import shutil

# Opt-in cleanup. Running this unconditionally removes the directory that the
# cells below read from, so a top-to-bottom run of the notebook would fail.
# Set the flag to True only to wipe the preprocessed output and start over.
RESET_PREPROCESSED = False

if RESET_PREPROCESSED:
    shutil.rmtree('../data/preprocess/incorporation_processes/')

#### Check that the line counts of the split files sum to the original

In [None]:
base_path = '../data/preprocess/incorporation_processes'

# Count inside a with-block so the file handle is closed afterwards.
# The variable name is kept as-is because the next cell reads it.
with open(os.path.join(base_path, 'incorporation-processes.json')) as original_file:
    line_count_orginal = sum(1 for line in original_file)

In [None]:
# Sum the lines of every split file actually present instead of assuming that
# exactly 35 of them exist. 'incorporation-processes.json' does not match the
# 'processes_' prefix, so the original file is not counted.
split_names = sorted(
    name for name in os.listdir(base_path)
    if name.startswith('processes_') and name.endswith('.json')
)

line_count = 0

for split_name in split_names:
    file_name = os.path.join(base_path, split_name)
    with open(file_name) as split_file:
        line_count += sum(1 for line in split_file)

assert line_count_orginal == line_count, (
    'split files hold {} lines, original has {}'.format(line_count, line_count_orginal)
)

### Read subfile of incorporation_processes

In [None]:
# Read inside a with-block so the file handle is closed once pandas has consumed it.
with smart_open.open('../data/preprocess/incorporation_processes/processes_34.json') as processes_stream:
    incorporation_processes_df = pd.read_json(
        processes_stream,
        encoding="utf8", lines = True)

## Appendix

### Corrupt: emails.json --> does not parse correctly

In [None]:
# Reformat the emails dump; the result is written to ../data/preprocess/email/.
make_readable_json(
    unprocessed_filename ='../data/mongo/emails.json',
    preprocessed_dir ='../data/preprocess/email/',
    split_variabele = '{"_id":'
)

In [None]:
# Read inside a with-block so the file handle is closed even if parsing fails.
with smart_open.open('../data/preprocess/email/emails.json') as emails_stream:
    emails_df = pd.read_json(emails_stream, encoding="utf8", lines = True)

### Look at json chunks

In [None]:
# Walk through the raw licenses dump piece by piece and print every parsed
# document; pieces are printed one at a time rather than collected in memory.
with open('../data/mongo/licenses.json') as licenses_file:
    pieces = each_chunk(licenses_file, chunk_size=4000, separator='{"_id":')
    for chunk in pieces:
        format_json = '{"_id": ' + chunk
        if not chunk:
            # Empty piece: there is nothing after the restored marker to parse.
            continue
        print(json.loads(format_json))