### Import libraries

In [None]:
import os, glob, shutil
import zipfile as zf
import pandas as pd
import numpy as np

### Import raw data from Web-Robot-ZIP-Archives

In [None]:
def load_zip_archives(zip_files, unzip_dir):
    """Read every CSV contained in the given zip archives into one DataFrame.

    Each archive is extracted into `unzip_dir`, every `*.csv` file found
    directly in that folder is read, and the folder is removed again before
    the next archive is processed. Rows are de-duplicated on the `id` column,
    keeping the first occurrence in (archive, csv) order.

    Parameters
    ----------
    zip_files : list of str
        Paths of the zip archives to import, processed in the given order.
    unzip_dir : str
        Scratch folder used for extraction. It is deleted before and after
        each archive, so it must not contain anything worth keeping.

    Returns
    -------
    df : pandas.DataFrame
        All imported rows without duplicate ids (empty if nothing was read).
    df_import : pandas.DataFrame
        One row per imported CSV with the columns 'zip', 'csv' and 'samples'.
    """
    df = None
    import_stats = []

    for i, zip_file in enumerate(zip_files):
        frames = []

        # start from a clean folder so CSVs left behind by an interrupted run
        # are not imported again and attributed to this archive
        shutil.rmtree(unzip_dir, ignore_errors=True)
        try:
            # the context manager closes the archive even if extraction fails
            with zf.ZipFile(zip_file) as zip_ref:
                zip_ref.extractall(unzip_dir)

            # loop all extracted csv files
            csv_files = sorted(glob.glob(os.path.join(unzip_dir, '*.csv')))
            for j, csv_file in enumerate(csv_files):
                print(f'zipfile {i+1}/{len(zip_files)} - csv {j+1}/{len(csv_files)}', end = "\r")

                # import raw data
                df_raw = pd.read_csv(csv_file)

                # save stats about this csv/zip file
                import_stats.append({'zip': zip_file, 'csv': csv_file, 'samples': df_raw.shape[0]})
                frames.append(df_raw)
        finally:
            # delete folder containing unzipped data, also when an error occurred
            # or when the archive was empty and the folder was never created
            shutil.rmtree(unzip_dir, ignore_errors=True)

        if frames:
            # concat once per archive (instead of once per csv) and drop the
            # duplicates right away to keep the accumulated frame small
            if df is not None:
                frames.insert(0, df)
            df = pd.concat(frames, ignore_index=True)
            df.drop_duplicates(subset='id', inplace=True)

    if df is None:
        df = pd.DataFrame()
    df_import = pd.DataFrame(import_stats, columns=['zip', 'csv', 'samples'])
    return df, df_import


zip_dir = 'data/web_robot_zip'
unzip_dir = 'data/unzipped'

zip_files = sorted(glob.glob(os.path.join(zip_dir, '*.zip')))
df, df_import = load_zip_archives(zip_files, unzip_dir)

# finish the progress line that was printed with "\r"
print(end='\n')

if df.empty:
    # nothing was imported: there is no 'created_at' column to sort by
    print(f"no csv data found in the zip archives of '{zip_dir}'")
else:
    # sort by date of creation
    df.sort_values(by='created_at', inplace=True, ignore_index=True)

# print some information about the dataframe
print(f"in total, there were '{df_import.samples.sum()}' instances")
print(f"after removing duplicates, '{df.shape[0]}' instances are left")
print(f"there are '{df.shape[1]}' features")

#### Split DataFrame and save as zip

In [None]:
# Split `df` row-wise into `parts` consecutive chunks of (almost) equal size
# and write each chunk as a zip-compressed csv to data/zip/KS_part_XXX.zip.

# create folder (portable replacement for the `!mkdir -p` shell magic, which
# only works inside IPython on systems that provide `mkdir -p`)
out_dir = os.path.join('data', 'zip')
os.makedirs(out_dir, exist_ok=True)

# split df into that many parts
parts = 10
# parts+1 evenly spaced row positions; the first is 0 and the last is
# df.shape[0], so consecutive pairs cover every row exactly once
split_at = np.linspace(0, df.shape[0], parts+1).astype(int)

# loop parts
for s, (start, stop) in enumerate(zip(split_at[:-1], split_at[1:])):
    # split df
    df_split = df.iloc[start:stop, :]

    if df_split.shape[0] == 0:
        # happens when df has fewer rows than parts; there is no first/last
        # index to report, but the file is still written so that all parts exist
        print(f'part {s} of {parts-1} is empty')
    else:
        print(f'part {s} of {parts-1} splits at indices: {df_split.index[0]} - {df_split.index[-1]}')

    ## version 1
    zip_name = os.path.join(out_dir, f'KS_part_{s:03d}.zip')
    df_split.to_csv(zip_name, compression='zip', index=False)

    ## version 2
    # zip_name = os.path.join('data', 'zip', f'KS_part_{s:03d}.zip')
    # csv_name = os.path.join('data', 'zip', f'KS_part_{s:03d}.csv')
    # df_split.to_csv(csv_name, index=False)
    # with zf.ZipFile(zip_name, 'w') as myzip:
    #     myzip.write(csv_name, os.path.basename(csv_name))
    # os.remove(csv_name)

    ## version 3
    # csv_name = os.path.join('data', 'zip', f'KS_part_{s:03d}.csv')
    # df_split.to_csv(csv_name, index=False)
