In [1]:
import numpy as np
import pandas as pd
from classes import Paths

# Notebook-wide display settings: floats are rendered with two decimals and
# thousands separators, and no cap is placed on the number of columns shown.
pd.options.display.float_format = "{:,.2f}".format
pd.options.display.max_columns = None

In [2]:
# Look up the location of the sample rides CSV through the project's Paths helper.
paths = Paths()
path = paths.transform_practice_sample_rides_2

# Read the rides into a DataFrame and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=10)

Unnamed: 0,car_id,ride_id,user_id,rating,ride_duration,ride_cost,true_feature_1,true_feature_2,true_feature_3,true_feature_4
0,v13590850L,m1q,S54951221d,1.38,52,723,599.25,1.04,1,52
1,C-1092574s,q1w,h90416721m,8.36,33572,470001,470001.0,0.0,6,100329
2,i77678921h,i1i,A15424124M,7.03,49,385,385.0,0.0,3,169
3,U16614632F,p1m,Q22613814X,6.87,43,510,510.0,0.0,5,320
4,i55328849i,I1A,N22883211D,4.3,110,1536,1536.0,0.0,10,23108
5,C-1092574s,k1Q,r18933707Y,0.1,19,185,185.0,0.0,6,100329
6,Y97882055f,S1K,b61297238i,3.44,125,1119,836.0,1.88,1,125
7,i55328849i,V1L,Y16580421n,5.57,76,680,680.0,0.0,10,23108
8,s20690018N,w1s,P60842443a,3.9,16,138,629.25,0.19,0,0
9,d71004474f,M1o,d18276261h,7.59,128,1275,1275.0,0.0,1,128


## 1. New aggregated feature using `groupby()` + `transform()` + `defined_function()`

In [3]:
def transform_function(x):
    """Return the arithmetic mean of ``x`` truncated to an ``int``.

    ``x`` is the group of values that ``transform`` hands over; the scalar
    returned here is broadcast back to every row of that group.
    """
    return int(sum(x) / len(x))


# Per-car truncated mean rating, aligned row by row with the original frame.
# (`as_index` is not passed: it does not influence the result of `transform`.)
df['rating_mean'] = df.groupby('car_id')['rating'].transform(transform_function)

# Cars in ascending order; within each car the highest-rated rides come first.
# Rebinding instead of `inplace=True` keeps the cell a plain expression chain.
df = df.sort_values(by=['car_id', 'rating'], ascending=[True, False])
df.head(20)

Unnamed: 0,car_id,ride_id,user_id,rating,ride_duration,ride_cost,true_feature_1,true_feature_2,true_feature_3,true_feature_4,rating_mean
48,A-4064758I,o1s,b61297238i,7.76,49,436,836.0,1.88,1,49,7
36,B-3513567K,Z1f,f60386576X,6.97,7,344,344.0,0.0,4,17106,3
82,B-3513567K,F1l,V11413025j,5.6,34,1015,1015.0,0.0,4,17106,3
11,B-3513567K,D1Q,P88080738D,5.04,82,1225,1225.0,0.0,4,17106,3
62,B-3513567K,c1t,W17883470y,5.0,37,439,439.0,0.0,4,17106,3
50,B-3513567K,n1z,c12527114I,4.77,21,624,624.0,0.0,4,17106,3
49,B-3513567K,Y1d,x26345994h,3.39,36,1075,1075.0,0.0,4,17106,3
52,B-3513567K,t1V,p20363378k,2.69,52,620,620.0,0.0,4,17106,3
23,B-3513567K,n1S,A50139359D,2.57,16853,505587,505587.0,0.0,4,17106,3
68,B-3513567K,l1Y,v13925136y,1.97,119,1422,1422.0,0.0,4,17106,3


**Details:**

In [4]:
def defined_function_sum(x):
    """Total of the values in one group."""
    return sum(x)


def defined_function_len(x):
    """Number of values in one group."""
    return len(x)


def defined_function_mean(x):
    """Sum of the group divided by its length, truncated to an ``int``."""
    return int(sum(x) / len(x))


# Show the ingredients of the truncated mean side by side: each helper is
# applied to the ratings of every car and broadcast back to that car's rows.
rating_steps = {
    'rating_sum': defined_function_sum,
    'rating_len': defined_function_len,
    'rating_mean': defined_function_mean,
}
for column_name, step_function in rating_steps.items():
    ratings_by_car = df.groupby(by=['car_id'], as_index=False)['rating']
    df[column_name] = ratings_by_car.transform(step_function)

# Cars ascending, ratings descending within each car.
df.sort_values(by=['car_id', 'rating'], ascending=[True, False], inplace=True)
df.head(20)

Unnamed: 0,car_id,ride_id,user_id,rating,ride_duration,ride_cost,true_feature_1,true_feature_2,true_feature_3,true_feature_4,rating_mean,rating_sum,rating_len
48,A-4064758I,o1s,b61297238i,7.76,49,436,836.0,1.88,1,49,7,7.76,1
36,B-3513567K,Z1f,f60386576X,6.97,7,344,344.0,0.0,4,17106,3,39.97,11
82,B-3513567K,F1l,V11413025j,5.6,34,1015,1015.0,0.0,4,17106,3,39.97,11
11,B-3513567K,D1Q,P88080738D,5.04,82,1225,1225.0,0.0,4,17106,3,39.97,11
62,B-3513567K,c1t,W17883470y,5.0,37,439,439.0,0.0,4,17106,3,39.97,11
50,B-3513567K,n1z,c12527114I,4.77,21,624,624.0,0.0,4,17106,3,39.97,11
49,B-3513567K,Y1d,x26345994h,3.39,36,1075,1075.0,0.0,4,17106,3,39.97,11
52,B-3513567K,t1V,p20363378k,2.69,52,620,620.0,0.0,4,17106,3,39.97,11
23,B-3513567K,n1S,A50139359D,2.57,16853,505587,505587.0,0.0,4,17106,3,39.97,11
68,B-3513567K,l1Y,v13925136y,1.97,119,1422,1422.0,0.0,4,17106,3,39.97,11


<div class="alert alert-success">

<div class="alert alert-warning">

**Task:**


The `rides` dataset contains the columns: `car_id`, `ride_id`, `user_id`, `rating`, `ride_duration`, `ride_cost`

Enrich the `rides` dataset of trips with additional features, without using the `merge` method.
- Add the `feature_1` column — the 75th percentile of the `ride_cost` column for each user (`user_id`).
- Add the `feature_2` column — the difference between the maximum and minimum `rating` values divided by the mean of the `rating` column for each user (`user_id`).
- Add the `feature_3` column — the number of `ride_duration` values that are greater than `40` for each car (`car_id`).
- Add the `feature_4` column — the sum of the `ride_duration` values that are greater than `40` for each car (`car_id`).
- Store the resulting DataFrame in the `result` variable.

</div>
</div>

In [5]:
def qt_75(x):
    """75th percentile of the values in one group (pandas version).

    NumPy alternative: ``np.quantile(x, 0.75)``.
    """
    return x.quantile(0.75)


def scale(x):
    """Difference between the group's maximum and minimum, divided by its mean."""
    spread = x.max() - x.min()
    return spread / x.mean()


def dur_more_40(x):
    """Number of values in one group that are greater than 40."""
    return (x > 40).sum()


def dur_more_40_sum(x):
    """Sum of the values in one group that are greater than 40."""
    return x[x > 40].sum()


rides_by_user = df.groupby(by=['user_id'], as_index=False)
rides_by_car = df.groupby(by=['car_id'], as_index=False)

# feature_1: 75th percentile of `ride_cost` for each user (`user_id`).
df['ride_cost_75q'] = rides_by_user['ride_cost'].transform(qt_75)

# feature_2: (max - min) / mean of `rating` for each user (`user_id`).
df['rating_normalized'] = rides_by_user['rating'].transform(scale)

# feature_3: number of `ride_duration` values greater than 40 for each car (`car_id`).
df['dur_more_40'] = rides_by_car['ride_duration'].transform(dur_more_40)

# feature_4: sum of `ride_duration` values greater than 40 for each car (`car_id`).
df['dur_more_40_sum'] = rides_by_car['ride_duration'].transform(dur_more_40_sum)


df

Unnamed: 0,car_id,ride_id,user_id,rating,ride_duration,ride_cost,true_feature_1,true_feature_2,true_feature_3,true_feature_4,rating_mean,rating_sum,rating_len,ride_cost_75q,rating_normalized,dur_more_40,dur_more_40_sum
48,A-4064758I,o1s,b61297238i,7.76,49,436,836.00,1.88,1,49,7,7.76,1,836.00,1.88,1,49
36,B-3513567K,Z1f,f60386576X,6.97,7,344,344.00,0.00,4,17106,3,39.97,11,344.00,0.00,4,17106
82,B-3513567K,F1l,V11413025j,5.60,34,1015,1015.00,0.00,4,17106,3,39.97,11,1015.00,0.00,4,17106
11,B-3513567K,D1Q,P88080738D,5.04,82,1225,1225.00,0.00,4,17106,3,39.97,11,1225.00,0.00,4,17106
62,B-3513567K,c1t,W17883470y,5.00,37,439,439.00,0.00,4,17106,3,39.97,11,439.00,0.00,4,17106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,v13590850L,m1q,S54951221d,1.38,52,723,599.25,1.04,1,52,1,1.38,1,599.25,1.04,1,52
33,v22254568R,I1X,T68264833M,2.09,142,1272,1272.00,0.00,1,142,2,2.09,1,1272.00,0.00,1,142
53,w-1247066y,P1r,b11670922I,4.94,49,679,194797.25,1.40,1,49,4,4.94,1,194797.25,1.40,1,49
76,w-3920980N,Q1a,b11670922I,4.87,19084,572516,194797.25,1.40,1,19084,4,4.87,1,194797.25,1.40,1,19084


In [6]:
def f_1(x):
    """75th percentile of the values in one group."""
    return x.quantile(q=0.75)


def f_2(x):
    """Difference between the group's maximum and minimum, divided by its mean.

    All three reductions are pandas methods so that missing values are
    treated the same way in each of them; the built-in ``max``/``min`` are
    order-dependent once a NaN is present.
    """
    return (x.max() - x.min()) / x.mean()


def f_3(x):
    """Number of values in one group that are greater than 40."""
    # Vectorised count of the boolean mask instead of a Python-level loop.
    return (x > 40).sum()


def f_4(x):
    """Sum of the values in one group that are greater than 40."""
    return x[x > 40].sum()


# User-level features (grouped by `user_id`).
df['feature_1'] = df.groupby('user_id')['ride_cost'].transform(f_1)
df['feature_2'] = df.groupby('user_id')['rating'].transform(f_2)

# Car-level features (grouped by `car_id`).
df['feature_3'] = df.groupby('car_id')['ride_duration'].transform(f_3)
df['feature_4'] = df.groupby('car_id')['ride_duration'].transform(f_4)


# Independent copy, so later changes to `df` do not leak into the answer.
result = df.copy()
result.head()

Unnamed: 0,car_id,ride_id,user_id,rating,ride_duration,ride_cost,true_feature_1,true_feature_2,true_feature_3,true_feature_4,rating_mean,rating_sum,rating_len,ride_cost_75q,rating_normalized,dur_more_40,dur_more_40_sum,feature_1,feature_2,feature_3,feature_4
48,A-4064758I,o1s,b61297238i,7.76,49,436,836.0,1.88,1,49,7,7.76,1,836.0,1.88,1,49,836.0,1.88,1,49
36,B-3513567K,Z1f,f60386576X,6.97,7,344,344.0,0.0,4,17106,3,39.97,11,344.0,0.0,4,17106,344.0,0.0,4,17106
82,B-3513567K,F1l,V11413025j,5.6,34,1015,1015.0,0.0,4,17106,3,39.97,11,1015.0,0.0,4,17106,1015.0,0.0,4,17106
11,B-3513567K,D1Q,P88080738D,5.04,82,1225,1225.0,0.0,4,17106,3,39.97,11,1225.0,0.0,4,17106,1225.0,0.0,4,17106
62,B-3513567K,c1t,W17883470y,5.0,37,439,439.0,0.0,4,17106,3,39.97,11,439.0,0.0,4,17106,439.0,0.0,4,17106
