In [None]:
import arkouda as ak
import numpy as np
import pandas as pd


from arkouda import Series, DataFrame, Index
from typing import List, Optional, Tuple, Union
from pandas.testing import assert_frame_equal, assert_series_equal

ak.connect()

In [None]:
#  https://github.com/joaopalmeiro/pandas-sphinx/blob/main/pandas_sphinx/__init__.py

def to_sphinx(df, show_index=True, number_rows=1):
    """Render a frame as a grid-format text table for pasting into Sphinx docs.

    Parameters
    ----------
    df : object
        Either an object that exposes ``to_markdown`` directly (for example a
        pandas DataFrame or Series), or an object that exposes ``to_pandas``
        (for example an Arkouda DataFrame or Series), which is converted first.
    show_index : bool, default True
        Forwarded to ``to_markdown`` as its ``index`` argument.
    number_rows : int, default 1
        Not used by this function; kept so existing calls that pass it still work.

    Returns
    -------
    str
        The table text produced by ``to_markdown(tablefmt="grid")``.
    """
    # Convert anything that knows how to become a pandas object, not only
    # ak.DataFrame, so Arkouda Series can be passed in without a manual to_pandas().
    if hasattr(df, "to_pandas"):
        df = df.to_pandas()

    return str(df.to_markdown(tablefmt="grid", index=show_index))

# dataframe.DataFrame.columns

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
df

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
df.columns

# dataframe.DataFrame.index

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
df

In [None]:
df.index

# dataframe.DataFrame.shape

In [None]:
df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
df

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
df.shape

In [None]:
type(df.shape)

# dataframe.DataFrame.empty

In [None]:
df = ak.DataFrame({})
df

In [None]:
df.empty

# dataframe.DataFrame.dtypes

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': ["a", "b"]})
df

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
df.dtypes

In [None]:
type(df.dtypes)

In [None]:
print(to_sphinx(pd.DataFrame(df.dtypes)))

In [None]:
pd.DataFrame(df.dtypes)

In [None]:
df.dtypes

# dataframe.DataFrame.info

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': ["a", "b"]})
df

In [None]:
df.info

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': ["a", "b"]})
df

In [None]:
df.to_pandas()

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
df.info

# dataframe.DataFrame.size

In [None]:
df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
df

In [None]:
df.size

In [None]:
df.to_pandas().size

# dataframe.GroupBy()

In [None]:
df = ak.DataFrame({'col1': [1.0, 1.0, 2.0, np.nan], 'col2': [4, 5, 6, 7]})

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
df.GroupBy("col1")

In [None]:
df.GroupBy("col1").size()

In [None]:
df.GroupBy("col1",use_series=True)

In [None]:
df.GroupBy("col1",use_series=True).size()

In [None]:
df.GroupBy("col1",use_series=True, as_index = False).size()

In [None]:
print(to_sphinx(df.GroupBy("col1",use_series=True, as_index = False).size().to_pandas()))

# DataFrame.append()

In [None]:
df1 = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

In [None]:
print(to_sphinx(df1.to_pandas()))

In [None]:
df2 = ak.DataFrame({'col1': [3], 'col2': [5]})

In [None]:
print(to_sphinx(df2.to_pandas()))

In [None]:
df1.append(df2)

In [None]:
df1

In [None]:
print(to_sphinx(df1.to_pandas()))

# dataframe.DataFrame.apply_permutation()

In [None]:
df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
perm_arry = ak.array([0, 2, 1])

In [None]:
df.apply_permutation(perm_arry)

In [None]:
display(df)

In [None]:
print(to_sphinx(df.to_pandas()))

In [None]:
df['col1']

# dataframe.DataFrame.attach() and dataframe.DataFrame.register()

In [None]:
df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})

In [None]:
df.register("my_table_name")

In [None]:
df.attach("my_table_name")

In [None]:
df.is_registered()

In [None]:
df.unregister()

In [None]:
df.is_registered()

# dataframe.DataFrame.unregister_dataframe_by_name

In [None]:
df = ak.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})

In [None]:
df.register("my_table_name")

In [None]:
df.attach("my_table_name")

In [None]:
df.is_registered()

In [None]:
df.unregister_dataframe_by_name("my_table_name")

In [None]:
df.is_registered()

# dataframe.DataFrame.argsort

In [None]:
df = ak.DataFrame({'col1': [1.1, 3.1, 2.1], 'col2': [6, 5, 4]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.argsort('col1')

In [None]:
sorted_df1 = df[df.argsort('col1')]

In [None]:
display(sorted_df1)

In [None]:
print(to_sphinx(sorted_df1))

In [None]:
df.argsort('col2')

In [None]:
sorted_df2 = df[df.argsort('col2')]

In [None]:
display(sorted_df2)

In [None]:
print(to_sphinx(sorted_df2))

# dataframe.DataFrame.coargsort

In [None]:
df = ak.DataFrame({'col1': [2, 2, 1], 'col2': [3, 4, 3], 'col3':[5, 6, 7]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.coargsort(['col1', 'col2'])

In [None]:
sorted_df2 = df[df.coargsort(['col1', 'col2'])]

In [None]:
display(sorted_df2)

In [None]:
# Table for the coargsort-ordered frame built in this section (sorted_df2).
print(to_sphinx(sorted_df2))

# DataFrame.sort_index()

In [None]:
df = ak.DataFrame({'col1': [1.1, 3.1, 2.1], 'col2': [6, 5, 4]}, index = Index(ak.array([2,0,1]), name="idx"))

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.sort_index()

In [None]:
print(to_sphinx(df.sort_index()))

# DataFrame.sort_values()

In [None]:
df = ak.DataFrame({'col1': [2, 2, 1], 'col2': [3, 4, 3], 'col3':[5, 6, 7]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.sort_values()

In [None]:
print(to_sphinx(df.sort_values()))

In [None]:
df.sort_values("col3")

In [None]:
print(to_sphinx(df.sort_values("col3")))

In [None]:
df.sort_values(by=["col1","col2"])

In [None]:
print(to_sphinx(df.sort_values(by=["col1","col2"])))

# DataFrame.concat()

In [None]:
df1 = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

In [None]:
display(df1)

In [None]:
df2 = ak.DataFrame({'col1': [3], 'col2': [5]})

In [None]:
display(df2)

In [None]:
#df1.append(df2)

In [None]:
df1.concat([df2])

In [None]:
print(to_sphinx(df1.concat([df2])))

In [None]:
pd_df1 = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

In [None]:
display(pd_df1)

In [None]:
pd_df2 = pd.DataFrame({'col1': [3], 'col2': [5]})

In [None]:
pd.concat([pd_df1, pd_df2])

# DataFrame.copy()

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df_deep = df.copy(deep=True)

In [None]:
df_deep['col1'] +=1

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df_shallow = df.copy(deep=False)

In [None]:
df_shallow['col1'] +=1

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

# DataFrame.drop()

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': [3, 4]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df2 = df.drop('col1', axis = 1)

In [None]:
display(df2)

In [None]:
print(to_sphinx(df2))

# DataFrame.corr()

In [None]:
df = ak.DataFrame({'col1': [1, 2], 'col2': [-1, -2]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
corr = df.corr()

In [None]:
display(corr)

In [None]:
print(to_sphinx(corr))

# DataFrame.drop_duplicates()

In [None]:
df = ak.DataFrame({'col1': [1, 2, 2, 3], 'col2': [4, 5, 5, 6]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.drop_duplicates()

In [None]:
print(to_sphinx(df.drop_duplicates()))

# DataFrame.filter_by_range

In [None]:
df = ak.DataFrame({'col1': [1, 2, 2, 2, 3, 3], 'col2': [4, 5, 6, 7, 8, 9]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.filter_by_range("col1", low=1, high=2)

In [None]:
filtered_df = df[df.filter_by_range("col1", low=1, high=2)]

In [None]:
display(filtered_df)

In [None]:
print(to_sphinx(filtered_df))

# DataFrame.head

In [None]:
df = ak.DataFrame({'col1': ak.arange(10), 'col2': -1 * ak.arange(10)})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.head()

In [None]:
print(to_sphinx(df.head()))

In [None]:
df.head(n=2)

In [None]:
print(to_sphinx(df.head(n=2)))

# DataFrame.tail

In [None]:
df = ak.DataFrame({'col1': ak.arange(10), 'col2': -1 * ak.arange(10)})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.tail()

In [None]:
print(to_sphinx(df.tail()))

In [None]:
df.tail(n=2)

In [None]:
print(to_sphinx(df.tail(n=2)))

# DataFrame.isin()

In [None]:
df = ak.DataFrame({'col_A': ak.array([7, 3]), 'col_B':ak.array([1, 9])})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.isin(ak.array([0, 1]))

In [None]:
print(to_sphinx(df.isin(ak.array([0, 1]))))

In [None]:
df.isin({'col_A': ak.array([0, 3])})

In [None]:
print(to_sphinx(df.isin({'col_A': ak.array([0, 3])})))

In [None]:
i = ak.Index(ak.arange(2))

In [None]:
s = ak.Series(data=ak.array([3, 9]), index=i)

In [None]:
df.isin(s)

In [None]:
print(to_sphinx(df.isin(s)))

In [None]:
other_df = ak.DataFrame({'col_A':ak.array([7, 3]), 'col_C':ak.array([0, 9])})

In [None]:
df.isin(other_df)

In [None]:
print(to_sphinx(df.isin(other_df)))

# DataFrame.memory_usage

In [None]:
df = ak.DataFrame({'col1': ak.arange(1000), 'col2': ak.arange(1000)})

In [None]:
df.memory_usage()

In [None]:
df.memory_usage(unit="KB")

# DataFrame.merge

In [None]:
left_df = ak.DataFrame({'col1': ak.arange(5), 'col2': -1 * ak.arange(5)})

In [None]:
display(left_df)

In [None]:
print(to_sphinx(left_df))

In [None]:
right_df = ak.DataFrame({'col1': 2 * ak.arange(5), 'col2': 2 * ak.arange(5)})

In [None]:
display(right_df)

In [None]:
print(to_sphinx(right_df))

In [None]:
left_df.merge(right_df, on = "col1")

In [None]:
print(to_sphinx(left_df.merge(right_df, on = "col1")))

In [None]:
left_df.merge(right_df, on = "col1", how = "left")

In [None]:
print(to_sphinx(left_df.merge(right_df, on = "col1", how = "left")))

In [None]:
left_df.merge(right_df, on = "col1", how = "right")

In [None]:
print(to_sphinx(left_df.merge(right_df, on = "col1", how = "right")))

# DataFrame.rename

In [None]:
df = ak.DataFrame({"A": ak.array([1, 2, 3]), "B": ak.array([4, 5, 6])})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.rename(column={'A':'a', 'B':'c'})

In [None]:
print(to_sphinx(df.rename(column={'A':'a', 'B':'c'})))

In [None]:
df.rename(index={0:99, 2:11})

In [None]:
print(to_sphinx(df.rename(index={0:99, 2:11})))

In [None]:
df.rename(str.lower, axis='column')

In [None]:
print(to_sphinx(df.rename(str.lower, axis='column')))

# DataFrame.reset_index

In [None]:
df = ak.DataFrame({"A": ak.array([1, 2, 3]), "B": ak.array([4, 5, 6])})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
perm_df = df[ak.array([0,2,1])]

In [None]:
display(perm_df)

In [None]:
print(to_sphinx(perm_df))

In [None]:
perm_df.reset_index()

In [None]:
print(to_sphinx(perm_df.reset_index()))

In [None]:
perm_df.reset_index(size=3)

# DataFrame.sample

In [None]:
df = ak.DataFrame({"A": ak.arange(5), "B": -1 * ak.arange(5)})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.sample(n=3)

In [None]:
print(to_sphinx(df.sample(n=3)))

# DataFrame.save

In [None]:
df = ak.DataFrame({"A": ak.arange(5), "B": -1 * ak.arange(5)})

In [None]:
import os.path
from pathlib import Path

# Output location under the current working directory, created if missing.
# Kept as a plain string: the following cells pass it to df.save / df.load.
my_path = str(Path(os.getcwd()) / 'hdf5_output' / 'my_data')
os.makedirs(my_path, exist_ok=True)


In [None]:
df.save(my_path, file_type="distribute")

In [None]:
df.save(my_path)

In [None]:
df.load(my_path)

In [None]:
df2 = df.load(my_path)

In [None]:
df2

In [None]:
print(to_sphinx(df2))

# DataFrame.to_pandas

In [None]:
ak_df = ak.DataFrame({"A": ak.arange(2), "B": -1 * ak.arange(2)})

In [None]:
display(ak_df)

In [None]:
print(to_sphinx(ak_df))

In [None]:
type(ak_df)

In [None]:
pd_df = ak_df.to_pandas()

In [None]:
display(pd_df)

In [None]:
type(pd_df)

# 

In [None]:
display(pd_df)

# DataFrame.from_pandas

In [None]:
pd_df = pd.DataFrame({"A":[1,2],"B":[3,4]})

In [None]:
type(pd_df)

In [None]:
display(pd_df)

In [None]:
ak_df = DataFrame.from_pandas(pd_df)

In [None]:
type(ak_df)

In [None]:
display(ak_df)

In [None]:
print(to_sphinx(ak_df))

# DataFrame.to_csv

In [None]:
import os.path
from pathlib import Path

# CSV output directory under the current working directory, created if missing.
# Kept as a plain string: later cells build file names from it with `+`.
my_path = str(Path(os.getcwd()) / 'csv_output')
os.makedirs(my_path, exist_ok=True)

In [None]:
df = ak.DataFrame({"A":[1,2],"B":[3,4]})

In [None]:
df.to_csv(my_path + "/my_data")

In [None]:
print(my_path)

In [None]:
df2 = DataFrame.read_csv(my_path + "/my_data" + "_LOCALE0000")

In [None]:
display(df2)

In [None]:
print(to_sphinx(df2))

In [None]:
df2

# DataFrame.to_hdf

In [None]:
import os.path
from pathlib import Path

# HDF5 output directory under the current working directory, created if missing.
# Kept as a plain string: later cells build file names from it with `+`.
my_path = str(Path(os.getcwd()) / 'hdf_output')
os.makedirs(my_path, exist_ok=True)

In [None]:
df = ak.DataFrame({"A":[1,2],"B":[3,4]})

In [None]:
df.to_hdf(my_path + "/my_data")

In [None]:
df.load(my_path + "/my_data")

In [None]:
print(to_sphinx(df.load(my_path + "/my_data")))

# DataFrame.to_parquet

In [None]:
import os.path
from pathlib import Path

# Parquet output directory under the current working directory, created if missing.
# Kept as a plain string: later cells build file names from it with `+`.
my_path = str(Path(os.getcwd()) / 'parquet_output')
os.makedirs(my_path, exist_ok=True)

In [None]:
df = ak.DataFrame({"A":[1,2],"B":[3,4]})

In [None]:
df.to_parquet(my_path + "/my_data")

In [None]:
df.load(my_path + "/my_data")

In [None]:
print(to_sphinx(df.load(my_path + "/my_data")))

# DataFrame.update_hdf

In [None]:
import os.path
from pathlib import Path

# HDF5 output directory under the current working directory, created if missing.
# Kept as a plain string: later cells build file names from it with `+`.
my_path = str(Path(os.getcwd()) / 'hdf_output')
os.makedirs(my_path, exist_ok=True)

In [None]:
df = ak.DataFrame({"A":[1,2],"B":[3,4]})

In [None]:
df.to_hdf(my_path + "/my_data")

In [None]:
df.load(my_path + "/my_data")

In [None]:
print(to_sphinx(df.load(my_path + "/my_data")))

In [None]:
df2 = ak.DataFrame({"A":[5,6],"B":[7,8]})

In [None]:
df2.update_hdf(my_path + "/my_data")

In [None]:
df.load(my_path + "/my_data")

In [None]:
print(to_sphinx(df.load(my_path + "/my_data")))

# GroupBy.broadcast

In [None]:
from arkouda.dataframe import GroupBy

In [None]:
df = ak.DataFrame({"A":[1,2,2,3],"B":[3,4,5,6]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
gb = df.groupby("A")

In [None]:
x = ak.array([10,11,12])

In [None]:
s = GroupBy.broadcast(gb, x)

In [None]:
print(s)

In [None]:
df["C"] = s.values

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

# GroupBy.count

In [None]:
df = ak.DataFrame({"A":[1,2,2,3],"B":[3,4,5,6]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.groupby("A").count(as_series = False)

In [None]:
print(to_sphinx(df.groupby("A").count(as_series = False)))

In [None]:
df.groupby("A").count(as_series = True).to_pandas().to_markdown(tablefmt="grid")

# GroupBy.size

In [None]:
df = ak.DataFrame({"A":[1,2,2,3],"B":[3,4,5,6]})

In [None]:
display(df)

In [None]:
print(to_sphinx(df))

In [None]:
df.groupby("A").size(as_series = False)

In [None]:
print(to_sphinx(df.groupby("A").size(as_series = False)))

In [None]:
# This section documents GroupBy.size, so render the Series returned by size(), not count().
df.groupby("A").size(as_series = True).to_pandas().to_markdown(tablefmt="grid")

# Example 1

In [None]:
# Example 1 input: three string key columns and two integer columns, four rows.
data = {
    "key1": ["valuew", "valuex", "valuew", "valuex"],
    "key2": ["valueA", "valueB", "valueA", "valueB"],
    "key3": ["value1", "value2", "value3", "value4"],
    "count": [34, 25, 11, 4],
    "nums": [1, 2, 5, 21],
}

# The same data as a pandas frame and as an Arkouda frame (each column sent through ak.array).
pd_df = pd.DataFrame(data)
ak_df = ak.DataFrame({name: ak.array(values) for name, values in data.items()})

print("\nPandas DF\n")
display(pd_df)

print("\nArkouda DF\n")
display(ak_df)


In [None]:
pd_result1 = pd_df.groupby(["key1","key2"],as_index=False).size()

ak_result1 = ak_df.groupby(["key1","key2"],as_index=False).size()


print("\nPandas Result\n")
print("Type: " + str(type(pd_result1)) + "\n")
display(pd_result1)

print("\nArkouda Result\n")
print("Type: " + str(type(ak_result1)) + "\n")
display(ak_result1)

isinstance(ak_result1, ak.dataframe.DataFrame)


In [None]:
assert_frame_equal(pd_result1, ak_result1.to_pandas(retain_index=True))
assert isinstance(ak_result1, ak.dataframe.DataFrame)

In [None]:
print(type(pd_df.groupby(["key1","key2"],as_index=False).size()))
print(type(pd_df.groupby(["key1"],as_index=False).size()))
print(type(pd_df.groupby("key1",as_index=False).size()))
print(type(pd_df.groupby("key1").size()))

In [None]:
print(type(ak_df.groupby(["key1","key2"],as_index=False).size()))
print(type(ak_df.groupby(["key1"],as_index=False).size()))
print(type(ak_df.groupby("key1",as_index=False).size()))
print(type(ak_df.groupby("key1").size(as_series=True)))

In [None]:
assert_frame_equal(ak_df.groupby(["key1","key2"],as_index=False).size().to_pandas(retain_index=True),
                           pd_df.groupby(["key1","key2"],as_index=False).size())   

In [None]:
assert_frame_equal(ak_df.groupby(["key1"],as_index=False).size().to_pandas(retain_index=True),
                           pd_df.groupby(["key1"],as_index=False).size())   

In [None]:
assert_frame_equal(ak_df.groupby("key1",as_index=False).size().to_pandas(retain_index=True),
                           pd_df.groupby("key1",as_index=False).size())   

In [None]:
assert_series_equal(ak_df.groupby("key1").size(as_series=True).to_pandas(),
                           pd_df.groupby("key1").size())

In [None]:
ak_df.groupby("key1").size().to_pandas()

In [None]:
pd_df.groupby("key1").size()

In [None]:
pd1 = pd_df.groupby(["key1","key2"],as_index=False).size()
ak1 = ak_df.groupby(["key1","key2"],as_index=False).size()

print(type(pd1 ))
display(pd1)

print(type(ak1))
display(ak1)

assert_frame_equal(pd1, ak1.to_pandas())


In [None]:
print(ak_df.groupby(["key1","key2"],as_index=True).as_index)

pd2 = pd_df.groupby(["key1","key2"],as_index=True).size()
ak2 = ak_df.groupby(["key1","key2"],as_index=True).size()



print(type(pd2 ))
display(pd2)

print(type(ak2))
display(ak2)

# NOTE:  Arkouda does not name index columns, so check_names has to be False to pass the test
assert_series_equal(pd2, ak2.to_pandas(),check_names=False)

In [None]:
pd3 = pd_df.groupby(["key1"],as_index=False).size()
ak3 = ak_df.groupby(["key1"],as_index=False).size()

print(type(pd3 ))
display(pd3)

print(type(ak3))
display(ak3)

assert_frame_equal(pd3, ak3.to_pandas())

In [None]:
pd4 = pd_df.groupby(["key1"],as_index=True).size()
ak4 = ak_df.groupby(["key1"],as_index=True).size()

print(type(pd4))
display(pd4)

print(type(ak4))
display(ak4)

assert_series_equal(pd4, ak4.to_pandas())

In [None]:
pd5 = pd_df.groupby("key1",as_index=False).size()
ak5 = ak_df.groupby("key1",as_index=False).size()

print(type(pd5))
display(pd5)

print(type(ak5))
display(ak5)

assert_frame_equal(pd5, ak5.to_pandas())

In [None]:
pd6 = pd_df.groupby("key1",as_index=True).size()
ak6 = ak_df.groupby("key1",as_index=True).size()

print(type(pd6))
display(pd6)

print(type(ak6))
display(ak6)

assert_series_equal(pd6, ak6.to_pandas())

In [None]:
# Compare GroupBy.size() between Arkouda and pandas for every combination of
# as_index / dropna and for several key selections (single key and key lists).
for as_index in [True, False]:
    for dropna in [True, False]:
        for gb_keys in ['key1', 'key2', ['key1', 'key2'], ['count', 'key1', 'key2']]:
            ak_result = ak_df.groupby(gb_keys, as_index=as_index, dropna=dropna).size()
            pd_result = pd_df.groupby(gb_keys, as_index=as_index, dropna=dropna).size()

            if isinstance(ak_result, ak.dataframe.DataFrame):
                # The index is retained exactly when the grouping used as_index=True,
                # so pass the flag through instead of branching on `as_index == True`.
                assert_frame_equal(ak_result.to_pandas(retain_index=as_index), pd_result)
            else:
                assert_series_equal(ak_result.to_pandas(), pd_result)

# Example 2

In [None]:
# Example 2 frame: a group-id column plus one 20-row column per dtype under test.
# NOTE(review): no seed is passed to ak.randint, so the values presumably differ
# from run to run — confirm whether a fixed seed is wanted here.
ak_df = DataFrame(
    dict(
        gb_id=ak.randint(0, 5, 20, dtype=ak.int64),
        float64=ak.randint(0, 1, 20, dtype=ak.float64),
        int64=ak.randint(0, 10, 20, dtype=ak.int64),
        uint64=ak.randint(0, 10, 20, dtype=ak.uint64),
        bigint=ak.randint(0, 10, 20, dtype=ak.uint64) + 2**200,
        string=ak.array(
            [
                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j",
                "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
            ]
        ),
    )
)
display(ak_df)

In [None]:
pd_df = ak_df.to_pandas()
display(pd_df)

In [None]:
display(pd_df.groupby(['gb_id']).size())
print("\n")
display(ak_df.groupby('gb_id').size())

In [None]:
display(pd_df.groupby(['gb_id'],as_index = False).size())
print("\n")
display(ak_df.groupby('gb_id', as_index  = False).size())

In [None]:
display(ak_df.groupby('gb_id',as_index=False).size().to_pandas(retain_index=True))
display(pd_df.groupby('gb_id',as_index=False).size())

assert_frame_equal(ak_df.groupby('gb_id',as_index=False).size().to_pandas(retain_index=True),
                           pd_df.groupby('gb_id',as_index=False).size())        

In [None]:
assert_series_equal(ak_df.groupby('gb_id',as_index=True).size(as_series=True).to_pandas(),
                           pd_df.groupby('gb_id',as_index=True).size()) 

In [None]:
assert_frame_equal(ak_df.groupby(['gb_id'], as_index=False).size().to_pandas(retain_index=True),
                           pd_df.groupby(['gb_id'], as_index=False).size())

# Example 3

In [None]:
def build_ak_df_example3():
    """Build the Example 3 Arkouda frame.

    The frame has two string key columns and three numeric columns; ``nums1``
    and ``nums2`` contain NaN entries.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one once that cell has run.
    """
    columns = dict(
        key1=["valuew", "valuex", "valuew", "valuex"],
        key2=["valueA", "valueB", "valueA", "valueB"],
        nums1=[1, np.nan, 3, 4],
        nums2=[1, np.nan, np.nan, 7],
        nums3=[10, 8, 9, 7],
    )
    # Each Python list goes through ak.array before the frame is assembled.
    return ak.DataFrame({name: ak.array(values) for name, values in columns.items()})

In [None]:
ak_df = build_ak_df_example3()
print("arkouda")
display(ak_df)
pd_df = ak_df.to_pandas()
print("pandas")
display(pd_df)

In [None]:
ak_df.groupby('nums1',as_index=True).size(as_series=True)

In [None]:
ak_df.groupby('nums1',as_index=True).size(as_series=True)

In [None]:
# Compare GroupBy.size() between Arkouda and pandas on the NaN-containing frame
# for every as_index / dropna combination and several key selections.
for as_index in (True, False):
    for dropna in (True, False):
        for gb_keys in [
            'nums1',
            'nums2',
            ['nums1', 'nums2'],
            ['nums1', 'nums3'],
            ['nums3', 'nums1'],
            ['nums1', 'nums2', 'nums3'],
        ]:
            ak_result = ak_df.groupby(gb_keys, as_index=as_index, dropna=dropna).size()
            pd_result = pd_df.groupby(gb_keys, as_index=as_index, dropna=dropna).size()

            # Series results are compared as Series; everything else as frames.
            if not isinstance(ak_result, ak.dataframe.DataFrame):
                assert_series_equal(ak_result.to_pandas(), pd_result)
                continue
            assert_frame_equal(ak_result.to_pandas(retain_index=True), pd_result)
      
                

In [None]:
# Build the inputs explicitly instead of relying on variables left over from the
# final iteration of the loop in the previous cell (as_index=False, dropna=False,
# all three numeric keys), so this cell gives the same result however that loop ended.
gb_keys = ['nums1', 'nums2', 'nums3']
pd_result = pd_df.groupby(gb_keys, as_index=False, dropna=False).size()
ak_result = ak_df.groupby(gb_keys, as_index=False, dropna=False).size()

# Sort both sides by the grouping keys before comparing.
assert_frame_equal(pd_result.sort_values(by=gb_keys), ak_result.sort_values(by=gb_keys).to_pandas())

In [None]:
ak_df.groupby(['nums1','nums2','nums3'],as_index=True, dropna=True).size()

In [None]:
ak_df.groupby(['nums1','nums2','nums3'],as_index=True, dropna=True).size().sort_values()

In [None]:
pd_df.groupby(['nums1','nums2','nums3'],as_index=True, dropna=True).size().sort_index()

In [None]:
ak_df.groupby(['nums1'],as_index=True, dropna=True).size().to_pandas()

In [None]:
s = ak_df.groupby(['nums1'],as_index=True, dropna=True).size()

In [None]:
type(s)

In [None]:
from arkouda.sorting import argsort, coargsort
argsort(s.index.values)

# Code Inspection

In [None]:
# `inspect` is not imported anywhere earlier in the notebook; import it here so
# this cell runs on a fresh kernel.
import inspect

# Print the pandas implementation of DataFrameGroupBy.size for reference.
lines = inspect.getsource(pd.core.groupby.generic.DataFrameGroupBy.size)
print(lines)

In [None]:
# Print the source of Arkouda's GroupBy.size.
# NOTE(review): this cell uses `inspect` but does not import it itself — confirm
# that an earlier cell brings it into scope.
lines = inspect.getsource(ak.dataframe.GroupBy.size)
print(lines)

# More Examples

In [None]:
def build_ak_df_example2():
    """Build a four-row Arkouda frame with three string key columns
    (``key1``..``key3``) and two integer columns (``count``, ``nums``)."""
    columns = {
        "key1": ["valuew", "valuex", "valuew", "valuex"],
        "key2": ["valueA", "valueB", "valueA", "valueB"],
        "key3": ["value1", "value2", "value3", "value4"],
        "count": [34, 25, 11, 4],
        "nums": [1, 2, 5, 21],
    }
    # Each Python list goes through ak.array before the frame is assembled.
    return ak.DataFrame({name: ak.array(values) for name, values in columns.items()})


def build_ak_df_example3():
    """Build a 20-row Arkouda frame with a group-id column and one column each
    for float64, int64, uint64 and a uint64-plus-2**200 ("bigint") column.

    NOTE(review): an earlier cell already defines a function with this name that
    builds a different (NaN-containing) frame; running this cell replaces it.
    Confirm which definition is intended to survive.
    """
    # Keyword order fixes both the column order and the order of the ak.randint calls.
    return ak.DataFrame(
        dict(
            gb_id=ak.randint(0, 5, 20, dtype=ak.int64),
            float64=ak.randint(0, 1, 20, dtype=ak.float64),
            int64=ak.randint(0, 10, 20, dtype=ak.int64),
            uint64=ak.randint(0, 10, 20, dtype=ak.uint64),
            bigint=ak.randint(0, 10, 20, dtype=ak.uint64) + 2 ** 200,
        )
    )

In [None]:
    def build_pd_df():
        """Build the six-row pandas frame used as a comparison fixture.

        Returns
        -------
        pd.DataFrame
            Columns ``userName``, ``userID``, ``item``, ``day``, ``amount``,
            ``bi`` (Python ints starting at 2**200) and ``ui`` (uint64 values
            starting at 2**63).
        """
        big_base = 2 ** 200
        return pd.DataFrame(
            {
                "userName": ["Alice", "Bob", "Alice", "Carol", "Bob", "Alice"],
                "userID": [111, 222, 111, 333, 222, 111],
                "item": [0, 0, 1, 1, 2, 0],
                "day": [5, 5, 6, 5, 6, 6],
                "amount": [0.5, 0.6, 1.1, 1.2, 4.3, 0.6],
                # Integers far beyond 64 bits: 2**200 .. 2**200 + 5.
                "bi": [big_base + offset for offset in range(6)],
                # Pure NumPy expression, so use NumPy's own uint64 (not the Arkouda
                # alias) and add the offset as a uint64 scalar so the column stays uint64.
                "ui": np.arange(6, dtype=np.uint64) + np.uint64(2 ** 63),
            }
        )

In [None]:

    def build_ak_df():
        """Return the frame from ``build_pd_df()`` wrapped in an Arkouda DataFrame."""
        pandas_frame = build_pd_df()
        return ak.DataFrame(pandas_frame)

In [None]:
# 20-row frame with a group-id column and one column per numeric dtype under test.
# Keyword order fixes both the column order and the order of the ak.randint calls.
ak_df = DataFrame(
    dict(
        gb_id=ak.randint(0, 5, 20, dtype=ak.int64),
        float64=ak.randint(0, 1, 20, dtype=ak.float64),
        int64=ak.randint(0, 10, 20, dtype=ak.int64),
        uint64=ak.randint(0, 10, 20, dtype=ak.uint64),
        bigint=ak.randint(0, 10, 20, dtype=ak.uint64) + 2**200,
    )
)
display(ak_df)