In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.float_format", "{:,.2f}".format)
pd.set_option("display.max_columns", None)
from classes import Paths

In [2]:
paths = Paths()
path = paths.transform_practice_sample_rides
df = pd.read_csv(path)
df.head()

Unnamed: 0,car_id,ride_id,user_id,rating,ride_cost,true_car_ride_count,true_user_rating_max,true_user_ride_cost_mean
0,D17587481H,E1V,Q20675647n,1.96,282,3,1.96,282.0
1,w13176062d,V1h,O11803938a,4.45,308,4,4.45,308.0
2,s-2202835E,B1g,u20234217o,2.76,525,4,2.76,525.0
3,H-1213278K,i1U,c19828320U,7.12,6154,5,7.12,6154.0
4,H-2108091L,j1Z,l90945335K,1.68,539,4,1.68,539.0


## 1. Common way of getting new aggregated feature without loosing raw data:
**`groupby()` + `agg()` + `merge()`**

<img src="../imgs/03.1.05_1.png" width=800>

In [3]:
df_gr = df.groupby(by=["car_id"], as_index=False).agg(rating_mean = ("rating", "mean"))
display(df_gr.head())
df = df.merge(df_gr
            , on="car_id"
            , how="left"
            # , suffixes=["_left", "_right"]
            )
df

Unnamed: 0,car_id,rating_mean
0,B-6025126p,4.47
1,D17587481H,5.47
2,D28621238x,5.54
3,H-1213278K,4.18
4,H-2108091L,3.35


Unnamed: 0,car_id,ride_id,user_id,rating,ride_cost,true_car_ride_count,true_user_rating_max,true_user_ride_cost_mean,rating_mean
0,D17587481H,E1V,Q20675647n,1.96,282,3,1.96,282.00,5.47
1,w13176062d,V1h,O11803938a,4.45,308,4,4.45,308.00,3.91
2,s-2202835E,B1g,u20234217o,2.76,525,4,2.76,525.00,2.27
3,H-1213278K,i1U,c19828320U,7.12,6154,5,7.12,6154.00,4.18
4,H-2108091L,j1Z,l90945335K,1.68,539,4,1.68,539.00,3.35
...,...,...,...,...,...,...,...,...,...
95,I-1538597J,E1u,s62773754W,9.80,115,2,9.80,115.00,5.08
96,g21645452f,r1J,a28773047v,4.26,387,3,4.26,387.00,2.48
97,s20690018N,z1A,m35202170I,2.75,155,2,2.75,155.00,3.38
98,v11386552i,z1v,b62669559O,0.88,123261,2,0.88,123261.00,3.17


## 2. New aggregated feature using single line
**`groupby()` + `transform()`**

In [4]:
df['rating_mean'] = df.groupby(by=['car_id'])['rating'].transform('mean')
df.head(10)

Unnamed: 0,car_id,ride_id,user_id,rating,ride_cost,true_car_ride_count,true_user_rating_max,true_user_ride_cost_mean,rating_mean
0,D17587481H,E1V,Q20675647n,1.96,282,3,1.96,282.0,5.47
1,w13176062d,V1h,O11803938a,4.45,308,4,4.45,308.0,3.91
2,s-2202835E,B1g,u20234217o,2.76,525,4,2.76,525.0,2.27
3,H-1213278K,i1U,c19828320U,7.12,6154,5,7.12,6154.0,4.18
4,H-2108091L,j1Z,l90945335K,1.68,539,4,1.68,539.0,3.35
5,w13176062d,Z1u,B22315336q,4.22,5065,4,4.22,5065.0,3.91
6,P52156891U,p1H,z20298839s,7.29,78515,1,7.29,78515.0,7.29
7,q-1536726P,g1M,m20570785V,1.99,806,3,1.99,806.0,3.03
8,g-1247997Z,o1U,q12768126R,4.52,974,4,4.52,974.0,4.62
9,W-1942072H,O1Z,D16051881Q,5.68,204,2,5.68,204.0,5.71


**The main feature of the transform function is the ability to preserve the position of the original index/dataset. This is a unique feature of `transform`. Below is a visual explanation of how this function works:**

<img src="../imgs/03.1.05_2.png" width=800>