In [1]:
import pandas as pd
from zipfile import ZipFile
from tqdm import tqdm

In [3]:
def create_dataframe(zip_file_path):
    """
    Read all CSV files from a zip file and concatenate them into one dataframe.

    Only archive members whose name ends with ``.csv`` are read. Members whose
    path starts with ``__MACOSX`` are skipped (presumably macOS archive
    metadata rather than real data files). Files are read in the order the
    archive lists them.

    Parameters:
        zip_file_path (str): Path to the zip file.

    Returns:
        pd.DataFrame: Concatenated dataframe of all CSV files, with the row
        index renumbered from 0 across the combined result.

    Raises:
        ValueError: If the archive contains no matching CSV files.
    """
    with ZipFile(zip_file_path, 'r') as zip_file:
        # Get a list of CSV file names inside the zip folder
        csv_file_names = [
            name
            for name in zip_file.namelist()
            if name.endswith('.csv') and not name.startswith('__MACOSX')
        ]

        # Fail early with a message that names the archive; otherwise
        # pd.concat([]) raises an opaque "No objects to concatenate".
        if not csv_file_names:
            raise ValueError(f'No CSV files found in {zip_file_path}')

        dfs = []  # One dataframe per CSV member, in archive order
        for csv_file_name in tqdm(csv_file_names, desc=f'Processing CSV files in {zip_file_path}'):
            with zip_file.open(csv_file_name) as csv_file:
                dfs.append(pd.read_csv(csv_file))

    # Concatenate once at the end (cheaper than growing a frame in the loop)
    return pd.concat(dfs, ignore_index=True)


### Getting data for Q1

In [None]:
# Combine every CSV in the Q1 2019 archive into a single dataframe.
Q1 = create_dataframe('data/data_Q1_2019.zip')

In [None]:
# Save the combined Q1 frame as one CSV; index=False leaves the row index out of the file.
Q1.to_csv('Q1_2019.csv', index=False)

### Getting data for Q2

In [4]:
# Combine every CSV in the Q2 2019 archive into a single dataframe.
Q2 = create_dataframe('data/data_Q2_2019.zip')

Processing CSV files in data/data_Q2_2019.zip: 100%|███████████████████████████████████| 91/91 [01:29<00:00,  1.02it/s]


In [5]:
# Save the combined Q2 frame as one CSV; index=False leaves the row index out of the file.
Q2.to_csv('Q2_2019.csv', index=False)

In [6]:
# Display the combined Q2 frame as the cell output (the notebook renders a truncated preview).
Q2

Unnamed: 0,date,serial_number,model,capacity_bytes,failure,smart_1_normalized,smart_1_raw,smart_2_normalized,smart_2_raw,smart_3_normalized,...,smart_250_normalized,smart_250_raw,smart_251_normalized,smart_251_raw,smart_252_normalized,smart_252_raw,smart_254_normalized,smart_254_raw,smart_255_normalized,smart_255_raw
0,2019-04-01,Z305B2QN,ST4000DM000,4000787030016,0,119.0,222017240.0,,,91.0,...,,,,,,,,,,
1,2019-04-01,ZJV0XJQ4,ST12000NM0007,12000138625024,0,81.0,139084944.0,,,98.0,...,,,,,,,,,,
2,2019-04-01,ZJV0XJQ3,ST12000NM0007,12000138625024,0,84.0,241585648.0,,,99.0,...,,,,,,,,,,
3,2019-04-01,ZJV0XJQ0,ST12000NM0007,12000138625024,0,82.0,162835752.0,,,93.0,...,,,,,,,,,,
4,2019-04-01,PL1331LAHG1S4H,HGST HMS5C4040ALE640,4000787030016,0,100.0,0.0,134.0,103.0,100.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9831133,2019-06-30,ZA10MCEQ,ST8000DM002,8001563222016,0,82.0,148774448.0,,,94.0,...,,,,,,,,,,
9831134,2019-06-30,ZCH0CRTK,ST12000NM0007,12000138625024,0,78.0,70356696.0,,,97.0,...,,,,,,,,,,
9831135,2019-06-30,AAGA7W2H,HGST HUH721212ALN604,12000138625024,0,100.0,0.0,132.0,96.0,100.0,...,,,,,,,,,,
9831136,2019-06-30,PL1331LAHGD9NH,HGST HMS5C4040BLE640,4000787030016,0,100.0,0.0,134.0,100.0,100.0,...,,,,,,,,,,


### Getting data for Q3

In [None]:
# Combine every CSV in the Q3 2019 archive into a single dataframe.
Q3 = create_dataframe('data/data_Q3_2019.zip')

In [None]:
# Save the combined Q3 frame as one CSV; index=False leaves the row index out of the file.
Q3.to_csv('Q3_2019.csv', index=False)

### Getting data for Q4

In [None]:
# Combine every CSV in the Q4 2019 archive into a single dataframe.
Q4 = create_dataframe('data/data_Q4_2019.zip')

In [None]:
# Save the combined Q4 frame as one CSV; index=False leaves the row index out of the file.
Q4.to_csv('Q4_2019.csv', index=False)