In [9]:
import pandas as pd
from typing import Dict, Callable, List
import ray
ray.init(num_cpus=32)

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.

# figuring out groupby

In [None]:
# Defined based on the logic you are doing
def grouped_mean_chunk(df: pd.DataFrame, groupCols: List, selectCol: str, **kwargs) -> pd.DataFrame:
    return df.groupby(groupCols)[selectCol].agg(['count', 'sum'])

# Defined based on the logic you are doing
def grouped_mean_agg(df: pd.DataFrame, rename: str, groupCols: List, **kwargs) -> pd.DataFrame:
    df = df.reset_index()
    agg_df = df.groupby(groupCols).agg(total_sum=('sum', 'sum'), total_count=('count', 'sum'))
    agg_df[rename] = agg_df.apply(lambda row: row['total_sum'] / row['total_count'], axis=1)
    
    return agg_df[rename].reset_index()

# Will not be changed
@ray.remote
def process_chunk(chunk: pd.DataFrame, dfMethod: Callable, **kwargs) -> pd.DataFrame:
    """
    Calculate the mean of specified columns, grouped by a specified category, for a given DataFrame chunk.

    Args:
    - chunk (pd.DataFrame): A chunk of the DataFrame.
    - dfMethod (Callable): A function that takes in a dataframe as an input and returns a dataframe
    - **kwargs: Additional keyword arguments to pass to dfMethod.

    Returns:
    - pd.DataFrame
    """
    return dfMethod(chunk, **kwargs)

# Will not be changed
def process_file(file_path: str, dfMethod: Callable, aggMethod: Callable, separator: str = '\t', **kwargs) -> None:
    """
    Process a large CSV file to compute grouped means for specified columns and save the results.

    Args:
    - file_path (str): The path to the CSV file.
    - dfMethod (Callable): A function that takes in a dataframe (chunk) as an input and returns a dataframe
    - aggMethod (Callable): A function that aggregates the concatenated results from all chunks
    - separator (str): delimiter for the input file
    - **kwargs: Additional keyword arguments to pass to both dfMethod and aggMethod.
    """
    chunk_size = 1000  # Define chunk size based on system's memory.

    results = []
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, sep=separator, low_memory=False):
        result = process_chunk.remote(chunk, dfMethod, **kwargs)
        results.append(result)

    # Retrieve and combine results from all chunks.
    combined_results = pd.concat(ray.get(results))

    # Uncomment for debugging purposes. 
    # combined_results.to_csv("data/grouped_means_combined.csv", index=True)

    # Final aggregation to ensure accurate mean calculation across all chunks.
    final_result = aggMethod(combined_results, **kwargs)

    final_result.to_csv("data/grouped_means_results.csv", index=False)


In [None]:
groupCols = ['food_groups_en']
selectCol = 'ecoscore_score'
rename = 'mean_ecoscore_score'

process_file(
    file_path='data/small_subset.csv',
    dfMethod=grouped_mean_chunk,
    aggMethod=grouped_mean_agg,
    separator=',',
    groupCols=groupCols,
    selectCol=selectCol,
    rename=rename
)

  agg_df[rename] = agg_df.apply(lambda row: row['total_sum'] / row['total_count'], axis=1)


# old

In [None]:
# @ray.remote
# def calculate_nan_percentage(chunk: pd.DataFrame) -> Dict[str, float]:
#     """
#     Calculate the percentage of NaN values in each column of a given DataFrame chunk.

#     Args:
#     - chunk (pd.DataFrame): A chunk of the DataFrame.

#     Returns:
#     - Dict[str, float]: A dictionary with column names as keys and percentages of NaN values as values.
#     """
#     nan_counts = chunk.isna().sum()
#     return (nan_counts / len(chunk) * 100).to_dict()

# def process_large_csv(file_path: str) -> None:
#     """
#     Process a large CSV file to compute the percentage of NaN rows in each column and save the results.

#     Args:
#     - file_path (str): The path to the CSV file.
#     """
#     # Define chunk size - you might need to adjust this based on your system's memory.
#     chunk_size = 50000  

#     # Read the CSV in chunks and process each chunk in parallel, specifying the delimiter as '\t' for tab.
#     results = []
#     for chunk in pd.read_csv(file_path, chunksize=chunk_size, sep='\t', low_memory=False):
#         result = calculate_nan_percentage.remote(chunk)
#         results.append(result)

#     # Combine results from all chunks.
#     combined_results = ray.get(results)
#     final_result = pd.DataFrame(combined_results).mean().to_dict()

#     # Save the final result to a file.
#     with open("data/nan_percentage_results.txt", "w") as f:
#         for column, percentage in final_result.items():
#             f.write(f"{column}\t{percentage:.2f}%\n")

#     print("Completed processing. The NaN percentages have been saved to nan_percentage_results.txt.")

In [None]:
# process_large_csv('data-testing/raw.csv')

# results

In [None]:
# df = pd.read_csv('data/nan_percentage_results.txt', sep='\t', header=None)
# df.columns = ['column', 'percentage']

In [None]:
# df['percentage'] = df['percentage'].str.replace('%', '').astype(float)
# df = df[df['percentage'] < 95]

In [None]:
# # Initialize a new column 'duplicated' with empty strings
# df['duplicated'] = ''

# # Process only the rows with non-zero percentages
# non_zero_df = df[df['percentage'] != 0].copy()

# # Group by 'percentage' and aggregate the columns, skipping the first one
# for _, group in non_zero_df.groupby('percentage'):
#     if len(group) > 1:
#         duplicated_columns = group['column'].iloc[1:].tolist()
#         first_index = group.index[0]
#         df.at[first_index, 'duplicated'] = ', '.join(duplicated_columns)

# # Remove the duplicated rows except for the first occurrence
# df = df.drop(non_zero_df[non_zero_df.duplicated('percentage', keep='first')].index).reset_index(drop=True)
# df.to_csv('data/nan_percentage_results_filtered.csv', index=False)
