# Reimplementing pandas, by accident

https://gitlab.com/v.gruzauskas/inostart-model-training/-/blob/d0e86e6ded52f53b90dd534fbf588603ac38300b/LightGBM/Functions/Interpolate_data_monthly.py

In [2]:
import pandas as pd

# Toy sales ledger: one record per sale, with products deliberately repeated
# so there is something to aggregate.
data = [
    {'Product': name, 'Sales': amount}
    for name, amount in (
        ('Orange', 300),
        ('Banana', 200),
        ('Banana', 250),
        ('Apples', 100),
        ('Apples', 150),
        ('Apples', 120),
    )
]
df = pd.DataFrame(data)
df

Unnamed: 0,Product,Sales
0,Orange,300
1,Banana,200
2,Banana,250
3,Apples,100
4,Apples,150
5,Apples,120


In [12]:
# Naive first pass: for every distinct product, pull out its sales by hand
# and add them up with the built-in sum().
result = {}
products = df['Product']
for product in products.unique():
    product_sales = df.loc[products == product, 'Sales']
    result[product] = sum(product_sales.tolist())

result_df = pd.Series(result)
result_df

Orange    300
Banana    450
Apples    370
dtype: int64

In [13]:
# Let's make a function out of it.
def do(df):
    """Total the 'Sales' column for each distinct value of 'Product'.

    This is a deliberately hand-rolled loop over the distinct products.

    Rows whose 'Product' is missing (NaN/None) are skipped, which is also
    what ``DataFrame.groupby`` does by default. An equality filter can never
    match a missing value, so such a key would otherwise show up with a
    bogus total of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Product' and 'Sales'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Product' and 'Total Sales', one row per product, in order
        of first appearance in ``df``.
    """
    result = {}
    for product in df['Product'].dropna().unique():
        result[product] = df.loc[df['Product'] == product, 'Sales'].sum()
    return pd.DataFrame(list(result.items()), columns=['Product', 'Total Sales'])
do(df)

Unnamed: 0,Product,Total Sales
0,Orange,300
1,Banana,450
2,Apples,370


In [14]:
# Let's add args.
def do(df, name1, name2):
    """Sum column ``name2`` for each distinct value of column ``name1``.

    Rows whose ``name1`` value is missing (NaN/None) are skipped, which is
    also what ``DataFrame.groupby`` does by default. An equality filter can
    never match a missing value, so such a key would otherwise show up with
    a bogus total of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    name1 : str
        Column whose distinct values define the groups.
    name2 : str
        Column that is summed within each group.

    Returns
    -------
    pandas.DataFrame
        Columns ``[name1, name2]``, one row per group, in order of first
        appearance in ``df``.
    """
    result = {}
    for key in df[name1].dropna().unique():
        result[key] = df.loc[df[name1] == key, name2].sum()
    return pd.DataFrame(list(result.items()), columns=[name1, name2])
do(df, "Product", "Sales")

Unnamed: 0,Product,Sales
0,Orange,300
1,Banana,450
2,Apples,370


In [15]:
# Let's name the args.
def do(df, group_column, sum_column):
    """Sum ``sum_column`` for each distinct value of ``group_column``.

    Rows whose ``group_column`` value is missing (NaN/None) are skipped,
    which is also what ``DataFrame.groupby`` does by default. An equality
    filter can never match a missing value, so such a key would otherwise
    show up with a bogus total of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    group_column : str
        Column whose distinct values define the groups.
    sum_column : str
        Column that is summed within each group.

    Returns
    -------
    pandas.DataFrame
        Columns ``[group_column, sum_column]``, one row per group, in order
        of first appearance in ``df``.
    """
    result = {}
    for key in df[group_column].dropna().unique():
        result[key] = df.loc[df[group_column] == key, sum_column].sum()
    return pd.DataFrame(list(result.items()), columns=[group_column, sum_column])
do(df, "Product", "Sales")

Unnamed: 0,Product,Sales
0,Orange,300
1,Banana,450
2,Apples,370


In [16]:
# We are now only doing sum, let's make it customizable.
def do(df, group_column, sum_column, agg="sum"):
    """Aggregate ``sum_column`` for each distinct value of ``group_column``.

    Rows whose ``group_column`` value is missing (NaN/None) are skipped,
    which is also what ``DataFrame.groupby`` does by default. An equality
    filter can never match a missing value, so such a key would otherwise be
    reported with the aggregate of an empty selection (e.g. 0 for "sum").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    group_column : str
        Column whose distinct values define the groups.
    sum_column : str
        Column that is aggregated within each group. The name is historical:
        any aggregation can be applied, not just a sum.
    agg : str or callable, default "sum"
        Passed straight to ``Series.agg``. Defaults to "sum" so that the
        earlier three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``[group_column, sum_column]``, one row per group, in order
        of first appearance in ``df``.
    """
    result = {}
    for key in df[group_column].dropna().unique():
        result[key] = df.loc[df[group_column] == key, sum_column].agg(agg)
    return pd.DataFrame(list(result.items()), columns=[group_column, sum_column])
do(df, "Product", "Sales", "sum")

Unnamed: 0,Product,Sales
0,Orange,300
1,Banana,450
2,Apples,370


# ...

In [17]:
# The built-in way: split the rows by "Product", then sum "Sales" within each group.
# The result is a Series indexed by product; as the output shows, the keys come
# back sorted rather than in order of first appearance.
result_df = df.groupby("Product")["Sales"].sum()
result_df

Product
Apples    370
Banana    450
Orange    300
Name: Sales, dtype: int64

## Learning groupby

In [9]:
# Keep the intermediate GroupBy object so the following cells can inspect it.
groupby = df.groupby("Product")

In [10]:
# Mapping from each group key to the row labels that belong to that group.
groupby.groups

{'Apples': [3, 4, 5], 'Banana': [1, 2], 'Orange': [0]}

In [11]:
# The sub-frame holding only the rows of one group, with original row labels kept.
groupby.get_group("Apples")

Unnamed: 0,Product,Sales
3,Apples,100
4,Apples,150
5,Apples,120


In [13]:
# Stitch every group's sub-frame back together, then restore the original row order.
pd.concat([groupby.get_group(key) for key in groupby.groups]).sort_index()

Unnamed: 0,Product,Sales
0,Orange,300
1,Banana,200
2,Banana,250
3,Apples,100
4,Apples,150
5,Apples,120


## Other things to reimplement

In [83]:
import pandas as pd

# Hand-rolled trailing-window sum. NOTE: this rebinds `data` and `df` from the
# earlier sales example.
data = {'values': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
df = pd.DataFrame(data)
window_size = 3
values = df['values']
# `end` is the exclusive end of each window; positions that do not yet have a
# full window behind them get None.
rolling_sums = [
    None if end < window_size else values[end - window_size:end].sum()
    for end in range(1, len(df) + 1)
]
rolling_sums

[None, None, 6, 9, 12, 15, 18, 21, 24, 27]

## Notes

https://gitlab.com/v.gruzauskas/inostart-model-training/-/blob/d0e86e6ded52f53b90dd534fbf588603ac38300b/LightGBM/Functions/Interpolate_data_monthly.py