In [4]:
import pandas as pd
from typing import Dict, Callable, List
import ray
ray.init(num_cpus=4)

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

# figuring out groupby

In [17]:
# Defined based on the logic you are doing
def grouped_mean_chunk(df: pd.DataFrame, groupCols: List, selectCols: List[str], **kwargs) -> pd.DataFrame:
    df1 = df.groupby(groupCols)[selectCols].agg(['count', 'sum'])
    return df1

# Defined based on the logic you are doing
def grouped_mean_agg(df: pd.DataFrame, rename: str, groupCols: List, selectCols: List[str], **kwargs) -> pd.DataFrame:
    df = df.reset_index()

    if len(selectCols) == 1:
        agg_df = df.groupby(groupCols).agg(total_sum=('sum', 'sum'), total_count=('count', 'sum'))
        agg_df[rename] = agg_df.apply(lambda row: row['total_sum'] / row['total_count'], axis=1)
        return agg_df[rename].reset_index()
    else:
        results_df = pd.DataFrame()
        
        # Loop through each column in selectCols to calculate mean
        for col in selectCols:
            agg_df = df.groupby(groupCols)[[(col, 'sum'), (col, 'count')]].agg('sum')
            agg_df[f'mean_{col}'] = agg_df.apply(lambda row: row[(col, 'sum')] / row[(col, 'count')], axis=1)
            
            # Drop unnecessary multi-level in columns
            agg_df = agg_df.drop(columns=[(col, 'sum'), (col, 'count')])
            
            # Concatenate results to the final DataFrame
            if results_df.empty:
                results_df = agg_df
            else:
                # Join on groupCols and ensure no duplicate columns
                results_df = results_df.join(agg_df, on=groupCols)
        
        # Reset index to include groupCols in the result
        results_df = results_df.reset_index()

        return results_df

# Will not be changed
@ray.remote
def process_chunk(chunk: pd.DataFrame, dfMethod: Callable, **kwargs) -> pd.DataFrame:
    """
    Calculate the mean of specified columns, grouped by a specified category, for a given DataFrame chunk.

    Args:
    - chunk (pd.DataFrame): A chunk of the DataFrame.
    - dfMethod (Callable): A function that takes in a dataframe as an input and returns a dataframe
    - **kwargs: Additional keyword arguments to pass to dfMethod.

    Returns:
    - pd.DataFrame
    """
    return dfMethod(chunk, **kwargs)

# Will not be changed
def process_file(file_path: str, dfMethod: Callable, aggMethod: Callable, separator: str = '\t', save: bool = False, **kwargs) -> None:
    """
    Process a large CSV file to compute grouped means for specified columns and save the results.

    Args:
    - file_path (str): The path to the CSV file.
    - dfMethod (Callable): A function that takes in a dataframe (chunk) as an input and returns a dataframe
    - aggMethod (Callable): A function that aggregates the concatenated results from all chunks
    - separator (str): delimiter for the input file
    - **kwargs: Additional keyword arguments to pass to both dfMethod and aggMethod.
    """
    chunk_size = 1000  # Define chunk size based on system's memory.

    results = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, sep=separator, low_memory=False):
        result = process_chunk.remote(chunk, dfMethod, **kwargs)
        results.append(result)

    # Retrieve and combine results from all chunks.
    combined_results = pd.concat(ray.get(results))

    # Uncomment for debugging purposes. 
    # combined_results.to_csv("data/grouped_means_combined.csv", index=True)

    # Final aggregation to ensure accurate mean calculation across all chunks.
    final_result = aggMethod(combined_results, **kwargs)

    if save:
        final_result.to_csv("data/grouped_means_results.csv", index=False)
        
    return final_result


In [18]:
groupCols = ['food_groups_en']
selectCols = ['ecoscore_score', 'nova_group']
rename = 'mean_ecoscore_score'

process_file(
    file_path='data/small_subset.csv',
    dfMethod=grouped_mean_chunk,
    aggMethod=grouped_mean_agg,
    separator=',',
    save=False,
    groupCols=groupCols,
    selectCols=selectCols,
    rename=rename
)

  agg_df[f'mean_{col}'] = agg_df.apply(lambda row: row[(col, 'sum')] / row[(col, 'count')], axis=1)
  agg_df[f'mean_{col}'] = agg_df.apply(lambda row: row[(col, 'sum')] / row[(col, 'count')], axis=1)


Unnamed: 0,food_groups_en,mean_ecoscore_score,mean_nova_group
,,,
0.0,Alcoholic beverages,45.005263,3.169811
1.0,"Beverages,Artificially sweetened beverages",38.0,4.0
2.0,"Beverages,Fruit juices",29.026667,1.0
3.0,"Beverages,Fruit nectars",29.818182,4.0
4.0,"Beverages,Plant-based milk substitutes",67.333333,3.181818
5.0,"Beverages,Sweetened beverages",61.730159,3.716418
6.0,"Beverages,Teas and herbal teas and coffees",57.4,4.0
7.0,"Beverages,Unsweetened beverages",34.336066,1.938776
8.0,"Beverages,Waters and flavored waters",,1.0
