In [12]:
import pandas as pd
import dask.dataframe as dd

In [13]:
# Locations of the raw CSV export and of the two Parquet files derived from it.
csv_file_path = "data/transactions_dataset.csv"
parquet_file_path = "data/transactions_dataset.parquet"
reduced_parquet_file_path = "data/transactions_dataset_reduced.parquet"

# Stream the semicolon-delimited CSV in 10,000-row pieces, then stitch the
# pieces back together into one DataFrame carrying a fresh 0..n-1 index.
chunks = pd.read_csv(csv_file_path, delimiter=";", chunksize=10_000)
concatenated_df = pd.concat(objs=chunks, ignore_index=True)

In [4]:
# Persist the full concatenated table as a Parquet file at parquet_file_path.
concatenated_df.to_parquet(path=parquet_file_path)

In [5]:
# Print a structural summary of the loaded frame: index range, column names,
# dtypes and reported memory usage.
# NOTE(review): the recorded output lists date_order / date_invoice as `object`,
# i.e. they were not parsed as datetimes on load — confirm this is intended.
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63319315 entries, 0 to 63319314
Data columns (total 8 columns):
 #   Column         Dtype  
---  ------         -----  
 0   date_order     object 
 1   date_invoice   object 
 2   product_id     int64  
 3   client_id      int64  
 4   sales_net      float64
 5   quantity       int64  
 6   order_channel  object 
 7   branch_id      int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 3.8+ GB


In [15]:
# Collect each distinct client_id once; this is the population we sample from.
unique_clients = pd.unique(concatenated_df["client_id"])

In [16]:
# Draw a reproducible random subset holding 10% of the distinct client ids.
# The count is truncated to an integer and the seed is pinned to 42 so that
# re-running the cell picks the same clients.
sample_size = int(len(unique_clients) * 0.1)
selected_clients = pd.Series(unique_clients).sample(sample_size, random_state=42)

In [17]:
# Keep only the transactions whose client_id belongs to the sampled clients.
reduced_df = concatenated_df.loc[
    concatenated_df["client_id"].isin(selected_clients)
]

# reduced_df now holds every row of the randomly chosen 10% of clients

In [18]:
# Display the filtered frame; as the cell's last expression it is rendered by
# the notebook as its output.
reduced_df

Unnamed: 0,date_order,date_invoice,product_id,client_id,sales_net,quantity,order_channel,branch_id
12,2017-09-25,2017-09-25,1329933,234582,253.566720,11,online,7203
21,2017-09-25,2017-09-25,3220467,1028002,4.140000,3,at the store,5086
23,2017-09-25,2017-09-25,123753,810674,101.714118,35,by phone,2732
37,2017-09-26,2017-09-26,627820,1658951,39.882000,5,at the store,1304
46,2017-09-26,2017-09-26,1968449,875202,75.251400,21,at the store,4280
...,...,...,...,...,...,...,...,...
63319277,2019-09-02,2020-04-29,1403422,1314676,-18.850800,3,at the store,10742
63319278,2019-08-14,2020-01-16,3064003,1150574,3.201600,3,online,9028
63319289,2019-09-06,2020-03-13,1898305,2003839,-17.826288,201,online,10006
63319295,2019-08-30,2020-04-01,2039247,561879,-19.623600,3,at the store,5440


In [19]:
# Persist the client-sampled subset as Parquet at reduced_parquet_file_path.
# NOTE(review): reduced_df still carries the row labels of the full frame, so
# they are written along with the data — pass index=False if that is unwanted.
reduced_df.to_parquet(path=reduced_parquet_file_path)