In [None]:
import arkouda as ak
import numpy as np
import pandas as pd
import inspect

from arkouda import Series, DataFrame
from typing import List, Optional, Tuple, Union
from pandas.testing import assert_frame_equal

# Open the Arkouda connection using ak.connect()'s default arguments.
# Every later cell that calls into `ak` runs after this one.
ak.connect()

# Example 1

In [None]:
# Example 1 input: three string columns and two integer columns, four rows each.
data = {
    "key1": ["valuew", "valuex", "valuew", "valuex"],
    "key2": ["valueA", "valueB", "valueA", "valueB"],
    "key3": ["value1", "value2", "value3", "value4"],
    "count": [34, 25, 11, 4],
    "nums": [1, 2, 5, 21],
}

# Build the same table in both libraries; each Python list is converted with
# ak.array before being handed to ak.DataFrame.
pd_df = pd.DataFrame(data)
ak_df = ak.DataFrame({name: ak.array(values) for name, values in data.items()})
# A Spark variant (spark.createDataFrame(pd_df)) was left disabled here.

print("\nPandas DF\n")
display(pd_df)

print("\nArkouda DF\n")
display(ak_df)


In [None]:
# Group both frames on ["key1", "key2"] with as_index=False and call
# sum("count") on each library's groupby object.
# NOTE(review): pandas documents the first positional parameter of
# DataFrameGroupBy.sum as `numeric_only`, so "count" may not act as a column
# selector on the pandas side — confirm this is the intended comparison.
pd_result1 = pd_df.groupby(["key1", "key2"], as_index=False).sum("count")
ak_result1 = ak_df.groupby(["key1", "key2"], as_index=False).sum("count")

print("\nPandas Result\n")
print(f"Type: {type(pd_result1)}\n")
display(pd_result1)

print("\nArkouda Result\n")
print(f"Type: {type(ak_result1)}\n")
display(ak_result1)

# Cell output: whether the Arkouda result's exact type is ak.dataframe.DataFrame.
type(ak_result1) == ak.dataframe.DataFrame


In [None]:
# Same grouping as the "count" cell, but passing "nums" to sum().
# NOTE(review): pandas documents the first positional parameter of
# DataFrameGroupBy.sum as `numeric_only`, so "nums" may not act as a column
# selector on the pandas side — confirm this is the intended comparison.
pd_result2 = pd_df.groupby(["key1", "key2"], as_index=False).sum("nums")
ak_result2 = ak_df.groupby(["key1", "key2"], as_index=False).sum("nums")

print("\nPandas Result\n")
print(f"Type: {type(pd_result2)}\n")
display(pd_result2)

print("\nArkouda Result\n")
print(f"Type: {type(ak_result2)}\n")
display(ak_result2)

In [None]:
# Argument-free sum() on the same (key1, key2) grouping. The pandas result is
# computed and shown first, then the Arkouda one, so the pandas output is
# already on screen when the Arkouda call runs.
pd_result3 = pd_df.groupby(["key1", "key2"], as_index=False).sum()

print("\nPandas Result\n")
print(f"Type: {type(pd_result3)}\n")
display(pd_result3)

ak_result3 = ak_df.groupby(["key1", "key2"], as_index=False).sum()

print("\nArkouda Result\n")
print(f"Type: {type(ak_result3)}\n")
display(ak_result3)

# Example 2

In [None]:
# Example 2 input: two string columns and two integer columns, four rows each
# (the Example 1 table without "key3").
data2 = {
    "key1": ["valuew", "valuex", "valuew", "valuex"],
    "key2": ["valueA", "valueB", "valueA", "valueB"],
    "count": [34, 25, 11, 4],
    "nums": [1, 2, 5, 21],
}

# Mirror the table in pandas and in Arkouda (lists converted via ak.array).
pd_df2 = pd.DataFrame(data2)
ak_df2 = ak.DataFrame({name: ak.array(values) for name, values in data2.items()})
# A Spark variant (spark.createDataFrame(pd_df)) was left disabled here.

print("\nPandas DF\n")
display(pd_df2)

print("\nArkouda DF\n")
display(ak_df2)

In [None]:
# Grouped sum with each library's default groupby arguments (no as_index given).
# pandas is evaluated and displayed before the Arkouda call is made.
pd_result = pd_df2.groupby(["key1", "key2"]).sum()

print("\nPandas Result\n")
print(f"Type: {type(pd_result)}\n")
display(pd_result)

ak_result = ak_df2.groupby(["key1", "key2"]).sum()

print("\nArkouda Result\n")
print(f"Type: {type(ak_result)}\n")
display(ak_result)

# Example 3

In [None]:
# Example 3 input: 20 rows drawn with ak.randint, one column per dtype plus
# "gb_id", which the later cells group on.
# No seed is passed to ak.randint, so the values are presumably different on
# each run. This assignment also rebinds `ak_df` (previously the Example 1 frame).
ak_df = DataFrame(
    {
        "gb_id": ak.randint(0, 5, 20, dtype=ak.int64),
        "float64": ak.randint(0, 1, 20, dtype=ak.float64),
        "int64": ak.randint(0, 10, 20, dtype=ak.int64),
        "uint64": ak.randint(0, 10, 20, dtype=ak.uint64),
        # uint64 draws shifted up by 2**200, i.e. past the 64-bit range
        "bigint": ak.randint(0, 10, 20, dtype=ak.uint64) + 2**200,
    }
)
display(ak_df)

In [None]:
# Convert the Arkouda frame with to_pandas() so the pandas side of the
# comparison is built from the same values.
# This rebinds `pd_df`, which earlier held the Example 1 pandas frame.
pd_df = ak_df.to_pandas()
display(pd_df)

In [None]:
# Grouping by a single column name (string form): the Arkouda sum, converted
# to pandas with retain_index=True, must equal the pandas grouped sum.
assert_frame_equal(
    ak_df.groupby('gb_id').sum().to_pandas(retain_index=True),
    pd_df.groupby('gb_id').sum(),
)

In [None]:
# Same check as the string-key form, but with the key given as a
# one-element list.
assert_frame_equal(
    ak_df.groupby(['gb_id']).sum().to_pandas(retain_index=True),
    pd_df.groupby(['gb_id']).sum(),
)

In [None]:
# Compare only the column labels of the two grouped sums; wrapping both in
# set() makes the check independent of column order.
assert (
    set(ak_df.groupby('gb_id').sum().columns)
    == set(pd_df.groupby('gb_id').sum().columns)
)

In [None]:
# Cell output: the pandas grouped sum, shown for visual comparison.
(
    pd_df
    .groupby(['gb_id'])
    .sum()
)

In [None]:
# Cell output: the Arkouda grouped sum, shown for visual comparison.
(
    ak_df
    .groupby(['gb_id'])
    .sum()
)

# Code Inspection

In [None]:
# Print the source text of pandas' DataFrameGroupBy.sum.
# Despite its name, `lines` is the single string returned by inspect.getsource.
lines = inspect.getsource(
    pd.core.groupby.generic.DataFrameGroupBy.sum
)
print(lines)

In [None]:
# Print the source text of ak.dataframe.GroupBy.sum.
# `lines` is rebound here to the string returned by inspect.getsource.
lines = inspect.getsource(
    ak.dataframe.GroupBy.sum
)
print(lines)

# Working

In [None]:
def build_ak_df_example2():
    """
    Build a small, fixed Arkouda DataFrame of string and integer columns.

    The frame has four rows and five columns, in this order: "key1", "key2",
    "key3" (strings) and "count", "nums" (integers). Each column is passed
    through ``ak.array`` before the frame is constructed.

    Returns
    -------
    The ``ak.DataFrame`` built from those columns.
    """
    raw_columns = {
        "key1": ["valuew", "valuex", "valuew", "valuex"],
        "key2": ["valueA", "valueB", "valueA", "valueB"],
        "key3": ["value1", "value2", "value3", "value4"],
        "count": [34, 25, 11, 4],
        "nums": [1, 2, 5, 21],
    }

    # Convert column by column, preserving the insertion order above.
    converted = {}
    for name, values in raw_columns.items():
        converted[name] = ak.array(values)

    return ak.DataFrame(converted)


def build_ak_df_example3(size: int = 20, seed: Optional[int] = None):
    """
    Build an Arkouda DataFrame of random columns, one per dtype, plus "gb_id".

    Columns, in order: "gb_id" (int64 draws from randint(0, 5)), "float64"
    (float64, randint(0, 1)), "int64", "uint64" (randint(0, 10)) and "bigint"
    (uint64 randint(0, 10) with 2 ** 200 added).

    Parameters
    ----------
    size : int, default 20
        Number of values requested for every column.
    seed : int, optional
        When None (the default) no seed is passed to ``ak.randint``, exactly
        as before this parameter existed. Otherwise the column at position
        ``i`` is drawn with ``seed + i``, which makes the frame reproducible
        while keeping same-range columns from sharing one seed.

    Returns
    -------
    The ``ak.DataFrame`` built from the generated columns.
    """
    # (column name, low, high, dtype, constant added to the draws or None)
    column_specs = (
        ("gb_id", 0, 5, ak.int64, None),
        ("float64", 0, 1, ak.float64, None),
        ("int64", 0, 10, ak.int64, None),
        ("uint64", 0, 10, ak.uint64, None),
        ("bigint", 0, 10, ak.uint64, 2 ** 200),
    )

    columns = {}
    for position, (name, low, high, dtype, offset) in enumerate(column_specs):
        # Only forward `seed` when the caller asked for one, so the default
        # call path issues the same ak.randint calls as the original code.
        seed_kwargs = {} if seed is None else {"seed": seed + position}
        values = ak.randint(low, high, size, dtype=dtype, **seed_kwargs)
        columns[name] = values if offset is None else values + offset

    return ak.DataFrame(columns)