In [2]:
import pandas as pd
import dask.dataframe as dd

In [3]:
# Input CSV export and the two Parquet files this notebook writes
# (the full dataset and the client-sampled subset).
csv_file_path = "data/transactions_dataset.csv"
parquet_file_path = "data/transactions_dataset.parquet"
reduced_parquet_file_path = "data/transactions_dataset_reduced.parquet"

# Stream the semicolon-separated CSV 10,000 rows at a time, then stitch the
# pieces into one DataFrame with a fresh 0..n-1 index.
chunks = pd.read_csv(
    csv_file_path,
    sep=";",
    chunksize=10000,
)
concatenated_df = pd.concat(
    chunks,
    ignore_index=True,
)

In [4]:
# Write the full concatenated DataFrame to parquet_file_path in Parquet format.
concatenated_df.to_parquet(parquet_file_path)

In [5]:
# Print a summary of the full frame: row count, per-column dtypes and memory usage.
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63319315 entries, 0 to 63319314
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   date_order     object 
 1   date_invoice   object 
 2   product_id     int64  
 3   client_id      int64  
 4   sales_net      float64
 5   quantity       int64  
 6   order_channel  object 
 7   branch_id      int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 3.8+ GB


In [6]:
# Collect the distinct client_id values present in the full dataset.
unique_clients = pd.unique(concatenated_df['client_id'])

In [7]:
# Draw a reproducible random subset of the distinct client ids:
# 10% of them (rounded down), using a fixed seed.
sample_fraction = 0.1
sample_seed = 42
sample_size = int(sample_fraction * len(unique_clients))
selected_clients = pd.Series(unique_clients).sample(
    n=sample_size,
    random_state=sample_seed,
)

In [8]:
# Flag the rows whose client_id is one of the sampled clients ...
client_mask = concatenated_df['client_id'].isin(selected_clients)

# ... and keep only those rows, so reduced_df holds every transaction of the
# randomly selected 10% of clients.
reduced_df = concatenated_df.loc[client_mask]

In [11]:
# Write the reduced DataFrame (rows of the sampled clients only) to
# reduced_parquet_file_path in Parquet format.
# Fix: this previously wrote concatenated_df, so the "reduced" file was a copy
# of the full dataset and the sampled subset was never persisted.
reduced_df.to_parquet(reduced_parquet_file_path)