In [1]:
import pandas as pd
import gzip

def convert_csv_gz_to_parquet(input_gz_file, output_parquet_file, raise_on_error=False):
    """
    Converts a gzip-compressed CSV file (.csv.gz) to a Parquet file.

    The whole CSV is loaded into memory as a single DataFrame and then written
    out with the pyarrow engine; the DataFrame index is not written.

    :param input_gz_file: Path to the input .csv.gz file
    :param output_parquet_file: Path to the output .parquet file
    :param raise_on_error: If False (default), a failure is only reported with
        a printed message and the function returns normally. If True, the
        message is still printed and the original exception is re-raised so
        the caller (or the notebook) sees the failure and its traceback.
    :raises Exception: Only when ``raise_on_error`` is True; whatever was
        raised while reading the CSV or writing the Parquet file.
    """
    try:
        # Load CSV from GZ directly into Pandas
        df = pd.read_csv(input_gz_file, compression='gzip')

        # Save as Parquet
        df.to_parquet(output_parquet_file, engine='pyarrow', index=False)

    except Exception as e:
        print(f"Error during conversion: {e}")
        # Opt-in propagation: a bare `raise` keeps the original traceback so
        # batch callers can stop on (or collect) failed conversions.
        if raise_on_error:
            raise

    else:
        print(f"Conversion successful: {output_parquet_file}")

# # Example usage
# if __name__ == "__main__":
#     input_gz = "data.csv.gz"  # Change this to your actual file path
#     output_parquet = "data.parquet"
#     convert_csv_gz_to_parquet(input_gz, output_parquet)

In [3]:
# Source and destination for a single day's file; both names are reused by later cells.
input_gz = "~/workspace/data/us_stocks/2015-02-17.csv.gz"  # Change this to your actual file path
output_parquet = "~/workspace/data/us_stocks/parquet/2015-02-17.parquet"
# The converter prints its outcome (success or error) instead of raising by default,
# so check the printed message below.
convert_csv_gz_to_parquet(input_gz, output_parquet)

Conversion successful: ~/workspace/data/us_stocks/parquet/2015-02-17.parquet


In [4]:
%%timeit
# Measure how long it takes to load the converted Parquet file back into a DataFrame.
pd.read_parquet(output_parquet)

97.2 ms ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Load the original gzip-compressed CSV into memory for inspection.
input_df = pd.read_csv(input_gz, compression='gzip')

In [9]:
# Preview the loaded frame. NOTE(review): the leading "Unnamed: 0" column suggests
# the CSV was written with its index included — confirm before relying on it.
input_df

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions
0,A,31928,40.0100,39.9700,40.0400,39.9300,1424183400000000000,44
1,A,4707,39.9800,40.0400,40.0600,39.9800,1424183460000000000,49
2,A,3393,40.0450,40.0900,40.1100,40.0450,1424183520000000000,31
3,A,7573,40.1000,40.1000,40.1300,40.0601,1424183580000000000,39
4,A,3548,40.1200,40.0800,40.1200,40.0800,1424183640000000000,41
...,...,...,...,...,...,...,...,...
1270098,ZX,1700,1.3268,1.3268,1.3268,1.3268,1424206140000000000,3
1270099,ZX,2000,1.3268,1.3268,1.3268,1.3268,1424206260000000000,1
1270100,ZX,1300,1.3268,1.3268,1.3268,1.3268,1424206320000000000,1
1270101,ZX,2900,1.3268,1.3299,1.3300,1.3268,1424206380000000000,7


In [13]:
# window_start holds integer epoch timestamps; to_datetime reads integers as
# nanoseconds by default, giving timezone-naive datetime64[ns] values.
pd.to_datetime(input_df["window_start"])

0         2015-02-17 14:30:00
1         2015-02-17 14:31:00
2         2015-02-17 14:32:00
3         2015-02-17 14:33:00
4         2015-02-17 14:34:00
                  ...        
1270098   2015-02-17 20:49:00
1270099   2015-02-17 20:51:00
1270100   2015-02-17 20:52:00
1270101   2015-02-17 20:53:00
1270102   2015-02-17 20:54:00
Name: window_start, Length: 1270103, dtype: datetime64[ns]