In [9]:
import pandas as pd
from typing import Dict, Callable, List
import ray
ray.init(num_cpus=4)

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

# figuring out groupby

In [None]:
# Defined based on the logic you are doing
def grouped_mean_chunk(df: pd.DataFrame, groupCols: List, selectCol: str, **kwargs) -> pd.DataFrame:
    return df.groupby(groupCols)[selectCol].agg(['count', 'sum'])

# Defined based on the logic you are doing
def grouped_mean_agg(df: pd.DataFrame, rename: str, groupCols: List, **kwargs) -> pd.DataFrame:
    df = df.reset_index()
    agg_df = df.groupby(groupCols).agg(total_sum=('sum', 'sum'), total_count=('count', 'sum'))
    agg_df[rename] = agg_df.apply(lambda row: row['total_sum'] / row['total_count'], axis=1)
    
    return agg_df[rename].reset_index()

# Will not be changed
@ray.remote
def process_chunk(chunk: pd.DataFrame, dfMethod: Callable, **kwargs) -> pd.DataFrame:
    """
    Calculate the mean of specified columns, grouped by a specified category, for a given DataFrame chunk.

    Args:
    - chunk (pd.DataFrame): A chunk of the DataFrame.
    - dfMethod (Callable): A function that takes in a dataframe as an input and returns a dataframe
    - **kwargs: Additional keyword arguments to pass to dfMethod.

    Returns:
    - pd.DataFrame
    """
    return dfMethod(chunk, **kwargs)

# Will not be changed
def process_file(file_path: str, dfMethod: Callable, aggMethod: Callable, separator: str = '\t', save: bool = False, **kwargs) -> None:
    """
    Process a large CSV file to compute grouped means for specified columns and save the results.

    Args:
    - file_path (str): The path to the CSV file.
    - dfMethod (Callable): A function that takes in a dataframe (chunk) as an input and returns a dataframe
    - aggMethod (Callable): A function that aggregates the concatenated results from all chunks
    - separator (str): delimiter for the input file
    - **kwargs: Additional keyword arguments to pass to both dfMethod and aggMethod.
    """
    chunk_size = 1000  # Define chunk size based on system's memory.

    results = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, sep=separator, low_memory=False):
        result = process_chunk.remote(chunk, dfMethod, **kwargs)
        results.append(result)

    # Retrieve and combine results from all chunks.
    combined_results = pd.concat(ray.get(results))

    # Uncomment for debugging purposes. 
    # combined_results.to_csv("data/grouped_means_combined.csv", index=True)

    # Final aggregation to ensure accurate mean calculation across all chunks.
    final_result = aggMethod(combined_results, **kwargs)

    if save:
        final_result.to_csv("data/grouped_means_results.csv", index=False)
        
    return final_result


In [None]:
groupCols = ['food_groups_en']
selectCol = 'ecoscore_score'
rename = 'mean_ecoscore_score'

process_file(
    file_path='data/small_subset.csv',
    dfMethod=grouped_mean_chunk,
    aggMethod=grouped_mean_agg,
    separator=',',
    save=False,
    groupCols=groupCols,
    selectCol=selectCol,
    rename=rename
)

  agg_df[rename] = agg_df.apply(lambda row: row['total_sum'] / row['total_count'], axis=1)
