In [1]:
# Plotting defaults: seaborn theme with the "whitegrid" style.
import seaborn as sns
sns.set_theme(style="whitegrid")
import pandas as pd
# Render floats with a thousands separator and two decimal places.
pd.set_option("display.float_format", "{:,.2f}".format)
# None removes the column limit, so wide frames are displayed in full.
pd.set_option("display.max_columns", None)
# Project-local helper; used below to look up the path of the rides_info CSV.
from classes import Paths
import numpy as np

<div class="alert alert-info">

While doing EDA, you suddenly notice that in some time series, for certain classes, there is a pronounced difference between the values. But what if this pattern in the time series is an `important feature`? You won't know until you extract this pattern and use it as a feature in your model.

<img src="../imgs/03.2.09_1.png" width="1200">

<div class="alert alert-warning">

## **Task**:

<div class="alert alert-info">

The dataset is written to the `df` variable. Generate two new features for each car: `gold_feature_1` and `gold_feature_2`; 
- where the first is the `maximum value of the difference (deviation_normal) between adjacent dates` (dates are repeated in the dataset), 
- and the second is the `position of this difference`.

Since there may be no difference in the rows, `if gold_feature_1 is less than 5, replace it with zero` (gold_feature_2 does not need to be changed in this case). 
Write the result to the `result` variable.

</div>

<div class="alert alert-info">

- **Remark 1:** The position is the zero-based index of the day, counted separately for each machine, on which the change compared to the previous day is largest. In the tests the differences are increases only.  
- **Remark 2:** Consider that the difference value for the first date for each machine is -inf.
</div>

<div class="alert alert-info">

Please note that for one car there may be several trips in one day (or there may not be). Therefore, you can first calculate the average for each day and then drop the duplicates.

In [2]:
# Paths is a project-local helper (imported from `classes`); only its
# `rides_info` attribute is read here, and it is passed to pd.read_csv below.
paths = Paths()
path_rides_info = paths.rides_info

In [3]:
# Load the ride-level table from the CSV located at path_rides_info.
rides_info = pd.read_csv(path_rides_info)

# Quick sanity check: frame dimensions and the first ten rows.
print("rides_info", rides_info.shape)
display(rides_info.head(10))

rides_info (739500, 14)


Unnamed: 0,user_id,car_id,ride_id,ride_date,rating,ride_duration,ride_cost,speed_avg,speed_max,stop_times,distance,refueling,user_ride_quality,deviation_normal
0,o52317055h,A-1049127W,b1v,2020-01-01,4.95,21,268,36,113.55,0,514.25,0,1.12,2.91
1,H41298704y,A-1049127W,T1U,2020-01-01,6.91,8,59,36,93.0,1,197.52,0,1.65,4.13
2,v88009926E,A-1049127W,g1p,2020-01-02,6.01,20,315,61,81.96,0,1276.33,0,2.6,2.46
3,t14229455i,A-1049127W,S1c,2020-01-02,0.26,19,205,32,128.0,0,535.68,0,3.22,0.91
4,W17067612E,A-1049127W,X1b,2020-01-03,1.21,56,554,38,90.0,1,1729.14,0,2.72,-1.82
5,I45176130J,A-1049127W,j1v,2020-01-03,7.52,67,1068,28,36.0,2,363.21,0,0.5,-3.44
6,W11562554A,A-1049127W,A1g,2020-01-04,5.78,30,324,48,61.0,0,1314.26,0,1.46,-6.0
7,o13713369s,A-1049127W,B1n,2020-01-04,7.35,29,401,57,65.85,0,1753.89,0,0.5,-6.47
8,y62286141d,A-1049127W,h1a,2020-01-05,0.12,64,893,38,114.0,1,2022.13,0,-0.16,-5.12
9,V28486769l,A-1049127W,p1e,2020-01-05,3.32,43,424,31,51.3,1,1334.57,0,-3.76,-2.08


In [4]:
# Collapse the ride-level table to one row per (car, day): a car may have
# several rides on the same date, so deviation_normal is averaged per day.
# groupby already yields exactly one row per key pair, so a follow-up
# drop_duplicates on the same keys would be a no-op and is omitted;
# as_index=False keeps car_id and ride_date as ordinary columns.
df = (
    rides_info
    .groupby(["car_id", "ride_date"], as_index=False)["deviation_normal"]
    .mean()
)
df

Unnamed: 0,car_id,ride_date,deviation_normal
0,A-1049127W,2020-01-01,3.52
1,A-1049127W,2020-01-02,1.69
2,A-1049127W,2020-01-03,-2.63
3,A-1049127W,2020-01-04,-6.24
4,A-1049127W,2020-01-05,-3.60
...,...,...,...
390996,z91796444U,2020-03-28,-54.25
390997,z91796444U,2020-03-29,-48.20
390998,z91796444U,2020-03-30,-39.12
390999,z91796444U,2020-03-31,-45.08


In [5]:
# Order the rows by car and then by ride date, and renumber them 0..n-1,
# so that per-car positional logic further down sees the days in order.
df = (
    df
    .sort_values(by=["car_id", "ride_date"])
    .reset_index(drop=True)
)
df

Unnamed: 0,car_id,ride_date,deviation_normal
0,A-1049127W,2020-01-01,3.52
1,A-1049127W,2020-01-02,1.69
2,A-1049127W,2020-01-03,-2.63
3,A-1049127W,2020-01-04,-6.24
4,A-1049127W,2020-01-05,-3.60
...,...,...,...
390996,z91796444U,2020-03-28,-54.25
390997,z91796444U,2020-03-29,-48.20
390998,z91796444U,2020-03-30,-39.12
390999,z91796444U,2020-03-31,-45.08


In [6]:
# groupby(...).idxmax() returns the label of each group's maximum in the index
# of the ORIGINAL frame, whereas np.argmax applied per group returns the
# zero-based position of the maximum inside that group. Example:
ex = pd.DataFrame(
    {
        "car_id": [1, 1, 1, 2, 2, 3, 3],
        "deviation_diff": [0.2, 0.5, -0.1, 0.3, 0.8, 0.9, 0.1],
    }
)
diff_by_car = ex.groupby("car_id")["deviation_diff"]

display(ex)
display(diff_by_car.idxmax())  # labels from the frame-wide index
display(diff_by_car.apply(lambda grp: np.argmax(grp)))  # positions within each group

Unnamed: 0,car_id,deviation_diff
0,1,0.2
1,1,0.5
2,1,-0.1
3,2,0.3
4,2,0.8
5,3,0.9
6,3,0.1


car_id
1    1
2    4
3    5
Name: deviation_diff, dtype: int64

car_id
1    1
2    1
3    0
Name: deviation_diff, dtype: int64

In [7]:
# calculate the change for each day and zero out everything less than 5

def diff_lower_5(a):
    """Suppress small changes and return the magnitude of large ones.

    Returns 0 when ``a`` lies strictly between -5 and 5; otherwise returns
    ``abs(a)``. Values that fail both comparisons (e.g. NaN) fall through
    to ``abs(a)`` as well.
    """
    within_band = -5 < a < 5
    return 0 if within_band else abs(a)

# Smallest day-over-day change that counts as a real jump (from the task statement).
MIN_DIFF = 5

# Day-over-day change of deviation_normal inside each car. The first day of
# every car has no predecessor, so diff() yields NaN there; per Remark 2 it is
# replaced with -inf so it can never win the max/argmax below. Only this
# column is filled: a frame-wide fillna would also overwrite missing values
# in every other column of df.
df["diff_value"] = (
    df.groupby("car_id")["deviation_normal"].diff().fillna(-np.inf)
)

# Zero-based position of the largest change within one car's rows
# (position inside the group, not a label of the frame-wide index).
lf = lambda x: int(np.argmax(np.asarray(x)))

result = df.groupby("car_id", as_index=False).agg(
    gold_feature_1=("diff_value", "max"),
    gold_feature_2=("diff_value", lf),
)

# Task rule: a maximum below the threshold means "no real jump", so
# gold_feature_1 becomes 0 (this also covers cars whose only value is -inf).
# gold_feature_2 is deliberately left untouched, as the task specifies.
result["gold_feature_1"] = result["gold_feature_1"].where(
    result["gold_feature_1"] >= MIN_DIFF, 0
)

result

Unnamed: 0,car_id,gold_feature_1,gold_feature_2
0,A-1049127W,18.86,52
1,A-1079539w,7.11,67
2,A-1162143G,22.13,53
3,A-1228282M,46.46,48
4,A-1339912r,9.56,40
...,...,...,...
4245,z73688663a,39.87,50
4246,z73740510r,7.14,15
4247,z86911952C,20.97,43
4248,z91353693Z,24.90,59


**Solution by author:**

In [None]:
# Reference solution. Step 1: average deviation_normal per (car, day) and
# keep a single row for each such pair.
cols = ["car_id", "ride_date"]
df["value"] = df.groupby(cols)["deviation_normal"].transform("mean")
df = df.drop_duplicates(subset=cols).copy()

# Step 2: per-car day-over-day change of the daily mean. Anything that is not
# >= 5 (including the NaN on each car's first day) is replaced with 0.
df["diff_value"] = df.groupby("car_id")["value"].diff()
df["diff_value"] = df["diff_value"].apply(
    lambda change: 0 if not change >= 5 else change
)

# Step 3: per car, the largest remaining change and its zero-based position.
f = lambda grp: np.argmax(grp)
result = df.groupby("car_id", as_index=False).agg(
    gold_feature_1=pd.NamedAgg(column="diff_value", aggfunc="max"),
    gold_feature_2=pd.NamedAgg(column="diff_value", aggfunc=f),
)