## 1. `.groupby()` + `count`/`mean`/`sum`/`median`

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.float_format", "{:,.2f}".format)
pd.set_option("display.max_columns", None)
from classes import Paths

In [2]:
# Resolve the rides file through the project's Paths helper, load it, and preview it.
paths = Paths()
path = paths.rides_info
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,user_id,car_id,ride_id,ride_date,rating,ride_duration,ride_cost,speed_avg,speed_max,stop_times,distance,refueling,user_ride_quality,deviation_normal
0,o52317055h,A-1049127W,b1v,2020-01-01,4.95,21,268,36,113.55,0,514.25,0,1.12,2.91
1,H41298704y,A-1049127W,T1U,2020-01-01,6.91,8,59,36,93.0,1,197.52,0,1.65,4.13
2,v88009926E,A-1049127W,g1p,2020-01-02,6.01,20,315,61,81.96,0,1276.33,0,2.6,2.46
3,t14229455i,A-1049127W,S1c,2020-01-02,0.26,19,205,32,128.0,0,535.68,0,3.22,0.91
4,W17067612E,A-1049127W,X1b,2020-01-03,1.21,56,554,38,90.0,1,1729.14,0,2.72,-1.82


In [3]:
# Numeric columns to average for every car.
cols = ["rating", "ride_duration", "ride_cost"]

# Group the rides by car; `as_index=False` keeps `car_id` as a regular column
# instead of moving it into the index.
rides_by_car = df.groupby("car_id", as_index=False)
df_gr = rides_by_car[cols].mean()

# Preview the per-car averages.
df_gr.head()

Unnamed: 0,car_id,rating,ride_duration,ride_cost
0,A-1049127W,4.26,1289.03,15201.44
1,A-1079539w,4.09,2148.81,27007.48
2,A-1162143G,4.66,479.02,4599.68
3,A-1228282M,4.23,2167.53,27379.52
4,A-1339912r,4.69,1436.61,16185.55


## 2. `.groupby()` + `.agg()`

In [4]:
# Columns to summarise; every column gets each of the listed aggregations.
cols = ["rating", "ride_cost"]
df_gr = df.groupby(by=["car_id"], as_index=False)[cols].agg(["count", "mean"])

# `.agg()` with a list of functions yields two-level column labels such as
# ("rating", "mean"). Flatten them to "rating_mean", skipping empty levels so
# that the key column ("car_id", "") stays "car_id" instead of becoming "car_id_".
df_gr.columns = ["_".join(part for part in c if part) for c in df_gr.columns]
df_gr.head()

Unnamed: 0,car_id_,rating_count,rating_mean,ride_cost_count,ride_cost_mean
0,A-1049127W,174,4.26,174,15201.44
1,A-1079539w,174,4.09,174,27007.48
2,A-1162143G,174,4.66,174,4599.68
3,A-1228282M,174,4.23,174,27379.52
4,A-1339912r,174,4.69,174,16185.55


## 3. `.groupby()` + `.agg()` + `tuples`

In [5]:
# Look at the first ten rides before aggregating them.
df.head(n=10)

Unnamed: 0,user_id,car_id,ride_id,ride_date,rating,ride_duration,ride_cost,speed_avg,speed_max,stop_times,distance,refueling,user_ride_quality,deviation_normal
0,o52317055h,A-1049127W,b1v,2020-01-01,4.95,21,268,36,113.55,0,514.25,0,1.12,2.91
1,H41298704y,A-1049127W,T1U,2020-01-01,6.91,8,59,36,93.0,1,197.52,0,1.65,4.13
2,v88009926E,A-1049127W,g1p,2020-01-02,6.01,20,315,61,81.96,0,1276.33,0,2.6,2.46
3,t14229455i,A-1049127W,S1c,2020-01-02,0.26,19,205,32,128.0,0,535.68,0,3.22,0.91
4,W17067612E,A-1049127W,X1b,2020-01-03,1.21,56,554,38,90.0,1,1729.14,0,2.72,-1.82
5,I45176130J,A-1049127W,j1v,2020-01-03,7.52,67,1068,28,36.0,2,363.21,0,0.5,-3.44
6,W11562554A,A-1049127W,A1g,2020-01-04,5.78,30,324,48,61.0,0,1314.26,0,1.46,-6.0
7,o13713369s,A-1049127W,B1n,2020-01-04,7.35,29,401,57,65.85,0,1753.89,0,0.5,-6.47
8,y62286141d,A-1049127W,h1a,2020-01-05,0.12,64,893,38,114.0,1,2022.13,0,-0.16,-5.12
9,V28486769l,A-1049127W,p1e,2020-01-05,3.32,43,424,31,51.3,1,1334.57,0,-3.76,-2.08


In [6]:
def my_func(x: pd.Series) -> int:
    """Return the spread (max - min) of a group's values, truncated to an integer.

    We can use our own functions in `.agg()`. Missing values are skipped by
    `Series.max()` / `Series.min()`; a group with no valid values yields 0.
    """
    spread = x.max() - x.min()
    return 0 if pd.isna(spread) else int(spread)


# Named aggregation: output_column=(input_column, aggregation).
# No blanket `fillna(0)` here: pandas aggregations already skip NaN, whereas
# filling with zeros would drag means down and force every minimum to 0.
df_gr = df.groupby("car_id", as_index=False).agg(
    rating_mean=("rating", "mean"),
    ride_duration_sum=("ride_duration", "sum"),
    ride_cost_max=("ride_cost", "max"),
    my_func_result=("speed_max", my_func),
)

# Results
df_gr.head(5)

Unnamed: 0,car_id,rating_mean,ride_duration_sum,ride_cost_max,my_func_result
0,A-1049127W,4.26,224292,330149,147
1,A-1079539w,4.09,373893,638746,144
2,A-1162143G,4.66,83349,361523,180
3,A-1228282M,4.23,377151,580732,148
4,A-1339912r,4.69,249971,478306,115


<div class="alert alert-warning">

## Task

Create a new dataframe with the following columns for each car (`car_id`):

- The `feature_1` column is the number of unique values of the number of stops (`stop_times`).
- The `feature_2` column is the second-largest value of the maximum speed column (`speed_max`).
- The `feature_3` column is the ratio of the minimum to the maximum `rating` value.
- The `feature_4` column is the most frequent value (the mode) of the number of stops (`stop_times`).
- The `feature_5` column is the standard deviation of the ride cost (`ride_cost`).

Store the solution in the `result` variable.

</div>

In [7]:
# Switch to the practice dataset; `path` and `df` now refer to the practice sample.
path = paths.groupby_practice_sample
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,car_id,rating,ride_duration,ride_cost,speed_max,stop_times
0,G-1056464Y,3.06,8,106,57.0,1
1,G-1056464Y,4.66,870,8694,96.0,3
2,G-1056464Y,7.37,16,202,58.69,1
3,G-1056464Y,7.0,28056,392777,84.0,0
4,G-1056464Y,5.1,24,283,103.0,0


In [8]:
def nuniq(x: pd.Series) -> int:
    """Count the distinct non-missing values in `x`."""
    distinct_values = x.dropna().unique()
    return len(distinct_values)

def second_max(series):
    """Return the second-largest distinct value of `series`.

    Missing values are ignored: `Series.unique()` keeps NaN, and sorting a
    sequence that contains NaN does not produce a real ordering, so NaN must be
    dropped before ranking.

    Returns:
        The second-largest distinct non-missing value, or None when fewer than
        two distinct non-missing values exist.
    """
    distinct_values = series.dropna().unique()
    if len(distinct_values) < 2:
        return None  # Not enough values to determine the second max
    return sorted(distinct_values)[-2]

def minmax_ratio(x: pd.Series) -> float:
    """Divide the smallest value of `x` by its largest value."""
    lowest, highest = x.min(), x.max()
    return lowest / highest

def calculate_mode(series):
    """Return the most frequent value of `series`.

    `Series.mode()` is evaluated once and reused. When several values tie, the
    first entry of the `mode()` result is returned.

    Returns:
        The first entry of `series.mode()`, or None when `mode()` is empty.
    """
    modes = series.mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Map every output feature to its (source column, aggregation) pair.
feature_spec = {
    "feature_1": ("stop_times", nuniq),
    "feature_2": ("speed_max", second_max),
    "feature_3": ("rating", minmax_ratio),
    "feature_4": ("stop_times", calculate_mode),
    "feature_5": ("ride_cost", "std"),
}

# One row per car; `as_index=False` keeps `car_id` as a regular column.
result = df.groupby("car_id", as_index=False).agg(**feature_spec)

result

Unnamed: 0,car_id,feature_1,feature_2,feature_3,feature_4,feature_5
0,G-1056464Y,5,96.0,0.3,1,125833.83
1,M15952724y,4,151.0,0.05,0,26887.71
2,M50269450O,8,115.6,0.24,0,136560.95
3,Z16920973J,4,94.17,0.22,0,65036.15
4,q-1127410X,3,137.91,0.11,0,90794.74
5,z-2033330g,4,120.15,0.01,0,460.27
6,z48466812V,4,139.0,0.09,0,16101.65
