# Purpose
This notebook automates the extraction of data from `.zip` archives. We assume the dataset has already been downloaded manually as one or more `.zip` files. The relevant datasets are then converted and stored as `CSV` files.

#### Library imports

In [None]:
from zipfile import ZipFile
from alive_progress import alive_bar
import os
import shutil
import glob
import numpy as np
import pandas as pd
import re

#### Unzipping the `.zip` files
This cell unzips, in one pass, every `.zip` file present in `data_extraction` that has not been extracted yet.

In [None]:
# Every archive sitting next to this notebook.
ZIPS = glob.glob("./*.zip")
# Archive name without directory or extension, e.g. "./foo.zip" -> "foo".
# os.path is used instead of a regex so that the result does not depend on the
# path separator glob returns on the current platform.
UNZIP_TARGETS = [os.path.splitext(os.path.basename(zip_file))[0] for zip_file in ZIPS]
# An archive counts as extracted when a directory with its name already exists.
# NOTE(review): this assumes each archive unpacks into a top-level folder named
# after the archive, since extractall() below extracts into the current directory.
extracted_names = [target for target in UNZIP_TARGETS if os.path.isdir(target)]
to_extract = [zip_file for zip_file, target in zip(ZIPS, UNZIP_TARGETS)
              if target not in extracted_names]

print(to_extract)

if len(ZIPS)==0:
    print("no zip files found")
elif len(to_extract)==0:
    print("zip files were already extracted")
elif len(ZIPS)>len(to_extract):
    # Only some archives have been extracted: refuse to continue from a
    # half-populated state and ask for a clean start instead.
    raise ValueError("Error:\tYou cannot have more zip files than unzip ones. Clean /data_extraction and /data")
else:
    with alive_bar(total=len(to_extract), title='Unizipping', force_tty=True) as pbar:
        for target in to_extract:
            with ZipFile(target,'r') as handle:
                handle.extractall()
            pbar()

#### Moving `.npz` files into a `data` directory
This cell checks whether the `data` directory exists and creates it if necessary. Files already present in `data` under the same name are overwritten when the directory is repopulated.

In [None]:
# Create the destination directory if it is missing; exist_ok avoids the need
# for a catch-all except that would also hide unrelated errors (e.g. permissions).
os.makedirs('./../data/', exist_ok=True)
target_dir = os.listdir('./../data/')

for target in UNZIP_TARGETS:
    # .npz files found directly inside the extracted folder for this archive.
    Files = glob.glob("./{}/*.npz".format(target))

    if len(Files)==0:
        print('npz files were already moved for {}'.format(target))
    else:
        with alive_bar(total=len(Files), title='Moving npz files from {}/ to /data'.format(target), force_tty=True) as pbar:
            for file in Files:
                file_name = os.path.basename(file)
                # The destination name is prefixed with the archive name so files
                # from different archives cannot collide. shutil.copy overwrites an
                # existing destination, so no separate "already there" branch is needed.
                shutil.copy(file, "./../data/{}_{}".format(target,file_name))
                pbar()

#### Conversion to `CSV` files
We only convert `.npz` files, which are read with `np.load`.
We first check that every `.npz` file contains the same entries as the others; we then extract those values and create a corresponding `.csv` file for each `.npz` file.

In [None]:
### Checking the entries of each .npz file
npz_paths_to_check = glob.glob('./../data/*.npz')
if not npz_paths_to_check:
    # Without this guard the final print would fail with a NameError on `checker`.
    raise FileNotFoundError("No .npz files found in ./../data/ - nothing to check")

# Entry names of the first archive; every other archive must list the same
# names in the same order.
checker_memory = None

for file in npz_paths_to_check:
    # np.load on an .npz returns a lazily-read archive that keeps the file open;
    # the context manager closes it as soon as the entry names have been read.
    with np.load(file) as archive:
        checker = list(archive.files)
    if checker_memory is not None and checker != checker_memory:
        raise ValueError("Error: Data inconsistency!")
    checker_memory = checker
print("Data is consistent with entries:\n{}".format(checker))

**Careful, this next cell takes time to run**

In [None]:
from multiprocess import Pool
from datetime import datetime
import time

### Accelerating csv generation with parallelization
# Flag read by the launch step below to decide what happens to existing .csv files.
overwrite = False
# .csv files that are already present in the data directory.
already_processed_npz = glob.glob('./../data/*.csv')
# Every .npz archive currently present in the data directory.
npz_files_name = glob.glob('./../data/*.npz')

def pooled_csv_generation(npz_task):
    """Convert one `.npz` archive into a `.csv` file written to `./../data/`.

    Each entry of the archive becomes one or more columns of the CSV:
    an entry with at least two dimensions is split along its second axis into
    columns named `<entry>1`, `<entry>2`, ...; any other entry is stored as a
    single column named after the entry.

    Parameters
    ----------
    npz_task : str
        Path of the `.npz` file to convert. The CSV is named after the file
        (extension replaced by `.csv`) and written to `./../data/`.

    Returns
    -------
    None
        The result is the CSV file on disk; progress is printed to stdout.
    """
    npz_file = npz_task
    # Derive the output name from the path itself rather than from a regex that
    # requires a literal "data/" in it, so any path separator is accepted.
    csv_name = os.path.splitext(os.path.basename(npz_file))[0] + '.csv'
    file_path = './../data/' + csv_name
    print("[{}]\tGenerating {}".format(datetime.utcnow(),
                                        csv_name))
    df = pd.DataFrame() # 'df' stands for 'dataframe'
    # The context manager closes the archive once every entry has been read.
    with np.load(npz_file) as data:
        for data_entry in data.files:
            print("[{}]\t{}:\tprocessing '{}'".format(datetime.utcnow(),
                                                       csv_name,
                                                       data_entry))
            array_shaped_data = data[data_entry]
            # Decide on the number of dimensions explicitly instead of relying
            # on a catch-all except, which would also hide genuine errors.
            if array_shaped_data.ndim >= 2:
                for i in range(array_shaped_data.shape[1]):
                    column_name = data_entry+str(i+1)
                    df[column_name] = array_shaped_data[:,i]
            else:
                df[data_entry] = array_shaped_data
    df.to_csv(path_or_buf=file_path)
    print("[{}]\tFinished generating {}!".format(datetime.utcnow(),
                                                 csv_name))


### Launching processes
if __name__ == '__main__':
    if overwrite:
        # Remove the previous outputs, then regenerate everything.
        for file in already_processed_npz:
            try:
                os.remove(file)
            except OSError as error:
                # Best effort: report the problem and keep going.
                print("Could not remove {}: {}".format(file, error))
        pending_npz = list(npz_files_name)
    else:
        # Without overwrite, only convert archives that have no .csv yet, so
        # that re-running this slow cell does not redo finished work.
        processed_stems = {os.path.splitext(os.path.basename(csv_file))[0]
                           for csv_file in already_processed_npz}
        pending_npz = [npz_file for npz_file in npz_files_name
                       if os.path.splitext(os.path.basename(npz_file))[0] not in processed_stems]

    if len(pending_npz)==0:
        print("Nothing to do: every npz file already has a csv file")
    else:
        # pool.map blocks until every task is done and re-raises the first
        # worker exception here, instead of losing it as a discarded
        # map_async result would.
        with Pool(processes=4) as pool:
            pool.map(pooled_csv_generation, pending_npz)